diff --git a/.planning/phases/04-input-sources/04-01-PLAN.md b/.planning/phases/04-input-sources/04-01-PLAN.md new file mode 100644 index 0000000..b1f8f9c --- /dev/null +++ b/.planning/phases/04-input-sources/04-01-PLAN.md @@ -0,0 +1,114 @@ +--- +phase: 04-input-sources +plan: 01 +type: execute +wave: 0 +depends_on: [] +files_modified: + - go.mod + - go.sum +autonomous: true +requirements: [] +must_haves: + truths: + - "go-git/v5, atotto/clipboard, x/exp/mmap are available as imports" + - "go build ./... succeeds with new dependencies" + artifacts: + - path: "go.mod" + provides: "Module declarations for go-git, clipboard, and x/exp" + contains: "github.com/go-git/go-git/v5" + - path: "go.sum" + provides: "Checksums for added dependencies" + key_links: + - from: "go.mod" + to: "module cache" + via: "go mod tidy" + pattern: "go-git/go-git/v5" +--- + + +Add the three external Go dependencies that Phase 4 input sources require: +- `github.com/go-git/go-git/v5` — git history traversal (INPUT-02) +- `github.com/atotto/clipboard` — cross-platform clipboard access (INPUT-05) +- `golang.org/x/exp/mmap` — memory-mapped large file reads (CORE-07) + +Purpose: Wave 0 dependency bootstrap so the parallel source implementation plans (04-02, 04-03, 04-04) compile cleanly on first attempt with no dependency resolution thrash. +Output: Updated go.mod and go.sum with all three modules resolved. 
+ + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/04-input-sources/04-CONTEXT.md +@go.mod + + + + + + Task 1: Add go-git, clipboard, and x/exp/mmap dependencies + + - go.mod + - .planning/phases/04-input-sources/04-CONTEXT.md + + go.mod, go.sum + +Run the following commands from the repo root in order: + +```bash +go get github.com/go-git/go-git/v5@latest +go get github.com/atotto/clipboard@latest +go get golang.org/x/exp/mmap@latest +go mod tidy +go build ./... +``` + +Verify the `require` block in go.mod now contains direct entries (non-indirect) for: + +``` +github.com/go-git/go-git/v5 vX.Y.Z +github.com/atotto/clipboard vX.Y.Z +golang.org/x/exp vYYYYMMDD-hash +``` + +If `go build ./...` fails, do NOT try to fix anything beyond the dependency graph — unrelated build failures must be surfaced. If `go mod tidy` moves a module to indirect, that is acceptable only if no source file yet imports it; the follow-on plans in Wave 1 will promote them to direct. + +Do NOT modify any source files in this plan. This is dependency bootstrap only. + + + go build ./... && grep -E "go-git/go-git/v5|atotto/clipboard|golang.org/x/exp" go.mod + + + - `grep "github.com/go-git/go-git/v5" go.mod` returns a match + - `grep "github.com/atotto/clipboard" go.mod` returns a match + - `grep "golang.org/x/exp" go.mod` returns a match + - `go build ./...` exits 0 + - `go.sum` contains entries for all three modules + + All three new modules are present in go.mod, go.sum has their checksums, and `go build ./...` succeeds. + + + + + +- `go build ./...` succeeds +- `go vet ./...` succeeds +- `grep -c "go-git/go-git/v5\|atotto/clipboard\|golang.org/x/exp" go.mod` returns 3 or more + + + +Dependencies resolved and build is green. Wave 1 plans can import from these modules without needing their own `go get` calls. 
+ + + +After completion, create `.planning/phases/04-input-sources/04-01-SUMMARY.md` with: +- Resolved version numbers for the three modules +- Any warnings from `go mod tidy` +- Confirmation that `go build ./...` passed + diff --git a/.planning/phases/04-input-sources/04-02-PLAN.md b/.planning/phases/04-input-sources/04-02-PLAN.md new file mode 100644 index 0000000..4d41893 --- /dev/null +++ b/.planning/phases/04-input-sources/04-02-PLAN.md @@ -0,0 +1,573 @@ +--- +phase: 04-input-sources +plan: 02 +type: execute +wave: 1 +depends_on: ["04-01"] +files_modified: + - pkg/engine/sources/dir.go + - pkg/engine/sources/dir_test.go + - pkg/engine/sources/file.go + - pkg/engine/sources/file_test.go +autonomous: true +requirements: + - INPUT-01 + - CORE-07 +must_haves: + truths: + - "DirSource recursively walks a directory and emits Chunks for every non-excluded file" + - "Glob exclusion patterns (--exclude) skip matching files by basename AND full relative path" + - "Default exclusions skip .git/, node_modules/, vendor/, *.min.js, *.map" + - "Binary files (null byte in first 512 bytes) are skipped" + - "Files larger than the mmap threshold (10MB) are read via golang.org/x/exp/mmap, smaller files via os.ReadFile" + - "File emission order is deterministic (sorted) for reproducible tests" + artifacts: + - path: "pkg/engine/sources/dir.go" + provides: "DirSource implementing Source interface for recursive directory scanning" + exports: ["DirSource", "NewDirSource"] + min_lines: 120 + - path: "pkg/engine/sources/dir_test.go" + provides: "Test coverage for recursive walk, exclusion, binary skip, mmap threshold" + min_lines: 100 + - path: "pkg/engine/sources/file.go" + provides: "FileSource extended to use mmap for files > 10MB" + contains: "mmap" + key_links: + - from: "pkg/engine/sources/dir.go" + to: "golang.org/x/exp/mmap" + via: "mmap.Open for large files" + pattern: "mmap\\.Open" + - from: "pkg/engine/sources/dir.go" + to: "filepath.WalkDir" + via: "recursive traversal" + 
pattern: "filepath\\.WalkDir" + - from: "pkg/engine/sources/dir.go" + to: "types.Chunk" + via: "channel send" + pattern: "out <- types\\.Chunk" +--- + + +Implement `DirSource` — a recursive directory scanner that walks a root path via `filepath.WalkDir`, honors glob exclusion patterns, detects and skips binary files, and uses memory-mapped I/O for large files. This satisfies INPUT-01 (directory/recursive scanning with exclusions) and CORE-07 (mmap large file reading). + +Purpose: The most common scan target is a repo directory, not a single file. This plan replaces the "wrap FileSource per path" hack with a purpose-built recursive source that emits deterministically ordered chunks and scales to multi-GB files without blowing out memory. +Output: `pkg/engine/sources/dir.go`, `dir_test.go`, plus a small `file.go` update to share the mmap read helper. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/phases/04-input-sources/04-CONTEXT.md +@pkg/engine/sources/source.go +@pkg/engine/sources/file.go +@pkg/types/chunk.go + + +Source interface (pkg/engine/sources/source.go): +```go +type Source interface { + Chunks(ctx context.Context, out chan<- types.Chunk) error +} +``` + +Chunk type (pkg/types/chunk.go): +```go +type Chunk struct { + Data []byte + Source string + Offset int64 +} +``` + +Existing constants in pkg/engine/sources/file.go: +```go +const defaultChunkSize = 4096 +const chunkOverlap = 256 +``` + + + + + + + Task 1: Implement DirSource with recursive walk, exclusion, binary detection, and mmap + + - pkg/engine/sources/source.go + - pkg/engine/sources/file.go + - pkg/types/chunk.go + - .planning/phases/04-input-sources/04-CONTEXT.md (Directory/File Scanning section) + + + pkg/engine/sources/dir.go, + pkg/engine/sources/dir_test.go, + pkg/engine/sources/file.go + + + - Test 1: DirSource walks a temp dir containing 3 text files, emits 3 chunks, source fields 
match file paths + - Test 2: Default exclusions skip `.git/config`, `node_modules/foo.js`, `vendor/bar.go`, `app.min.js`, `app.js.map` + - Test 3: User-supplied exclude pattern `*.log` skips `foo.log` but keeps `foo.txt` + - Test 4: Binary file (first 512 bytes contain a null byte) is skipped; text file is emitted + - Test 5: File >10MB is read via mmap path and emits chunks whose concatenated data equals file content + - Test 6: File emission order is deterministic (sorted lexicographically) across two runs on same dir + - Test 7: ctx cancellation mid-walk returns ctx.Err() promptly + - Test 8: Non-existent root returns an error + + +Create `pkg/engine/sources/dir.go` with the following complete implementation: + +```go +package sources + +import ( + "bytes" + "context" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "sort" + "strings" + + "golang.org/x/exp/mmap" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// MmapThreshold is the file size above which DirSource/FileSource use memory-mapped reads. +const MmapThreshold int64 = 10 * 1024 * 1024 // 10 MB + +// BinarySniffSize is the number of leading bytes inspected for a NUL byte +// to classify a file as binary and skip it. +const BinarySniffSize = 512 + +// DefaultExcludes are glob patterns excluded from directory scans unless +// the caller passes an empty slice explicitly via NewDirSourceRaw. +var DefaultExcludes = []string{ + ".git/**", + "node_modules/**", + "vendor/**", + "*.min.js", + "*.map", +} + +// DirSource walks a directory recursively and emits Chunks for every +// non-excluded, non-binary file it finds. Files larger than MmapThreshold +// are read via mmap; smaller files use os.ReadFile. +type DirSource struct { + Root string + Excludes []string // glob patterns applied to path basename AND full relative path + ChunkSize int +} + +// NewDirSource creates a DirSource with the default exclusions merged +// with the caller-supplied extras. 
+func NewDirSource(root string, extraExcludes ...string) *DirSource { + merged := make([]string, 0, len(DefaultExcludes)+len(extraExcludes)) + merged = append(merged, DefaultExcludes...) + merged = append(merged, extraExcludes...) + return &DirSource{Root: root, Excludes: merged, ChunkSize: defaultChunkSize} +} + +// NewDirSourceRaw creates a DirSource with ONLY the caller-supplied excludes +// (no defaults). Useful for tests and advanced users. +func NewDirSourceRaw(root string, excludes []string) *DirSource { + return &DirSource{Root: root, Excludes: excludes, ChunkSize: defaultChunkSize} +} + +// Chunks implements Source. It walks d.Root, filters excluded and binary +// files, reads each remaining file (via mmap above MmapThreshold), and +// emits overlapping chunks through out. +func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + if d.Root == "" { + return errors.New("DirSource: Root is empty") + } + info, err := os.Stat(d.Root) + if err != nil { + return fmt.Errorf("DirSource: stat root: %w", err) + } + if !info.IsDir() { + return fmt.Errorf("DirSource: root %q is not a directory", d.Root) + } + + // Collect paths first for deterministic ordering across runs. + var paths []string + err = filepath.WalkDir(d.Root, func(path string, de fs.DirEntry, werr error) error { + if werr != nil { + return werr + } + if de.IsDir() { + rel, _ := filepath.Rel(d.Root, path) + if d.isExcluded(rel, de.Name()) { + return filepath.SkipDir + } + return nil + } + rel, _ := filepath.Rel(d.Root, path) + if d.isExcluded(rel, de.Name()) { + return nil + } + paths = append(paths, path) + return nil + }) + if err != nil { + return fmt.Errorf("DirSource: walk: %w", err) + } + sort.Strings(paths) + + for _, p := range paths { + if err := ctx.Err(); err != nil { + return err + } + if err := d.emitFile(ctx, p, out); err != nil { + // Per-file errors are non-fatal: continue walking, but respect ctx. 
+ if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + // Swallow per-file errors; the engine logs elsewhere. + continue + } + } + return nil +} + +// isExcluded returns true if either the relative path or the basename matches +// any configured glob pattern. +func (d *DirSource) isExcluded(rel, base string) bool { + rel = filepath.ToSlash(rel) + for _, pat := range d.Excludes { + pat = filepath.ToSlash(pat) + // Match against basename. + if ok, _ := filepath.Match(pat, base); ok { + return true + } + // Match against full relative path. + if ok, _ := filepath.Match(pat, rel); ok { + return true + } + // `dir/**` style — naive prefix match against the leading segment. + if strings.HasSuffix(pat, "/**") { + prefix := strings.TrimSuffix(pat, "/**") + if rel == prefix || strings.HasPrefix(rel, prefix+"/") { + return true + } + } + } + return false +} + +// emitFile reads a single file and pushes its chunks onto out. +func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error { + fi, err := os.Stat(path) + if err != nil { + return err + } + size := fi.Size() + if size == 0 { + return nil + } + + var data []byte + if size >= MmapThreshold { + ra, err := mmap.Open(path) + if err != nil { + return fmt.Errorf("mmap open %s: %w", path, err) + } + defer ra.Close() + data = make([]byte, ra.Len()) + if _, err := ra.ReadAt(data, 0); err != nil { + return fmt.Errorf("mmap read %s: %w", path, err) + } + } else { + data, err = os.ReadFile(path) + if err != nil { + return err + } + } + + if isBinary(data) { + return nil + } + return emitChunks(ctx, data, path, d.ChunkSize, out) +} + +// isBinary reports whether the leading BinarySniffSize bytes contain a NUL byte. 
+func isBinary(data []byte) bool { + n := len(data) + if n > BinarySniffSize { + n = BinarySniffSize + } + return bytes.IndexByte(data[:n], 0x00) >= 0 +} + +// emitChunks is the shared overlapping-chunk emitter used by FileSource and DirSource. +func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error { + if chunkSize <= 0 { + chunkSize = defaultChunkSize + } + if len(data) <= chunkSize { + select { + case <-ctx.Done(): + return ctx.Err() + case out <- types.Chunk{Data: data, Source: source, Offset: 0}: + } + return nil + } + // Offset is the chunk's absolute start position within data, so findings + // can be mapped back to exact file offsets despite the overlap window. + for start := 0; start < len(data); start += chunkSize - chunkOverlap { + end := start + chunkSize + if end > len(data) { + end = len(data) + } + select { + case <-ctx.Done(): + return ctx.Err() + case out <- types.Chunk{Data: data[start:end], Source: source, Offset: int64(start)}: + } + if end == len(data) { + break + } + } + return nil +} +``` + +Update `pkg/engine/sources/file.go` so FileSource reuses `emitChunks` and adopts the same mmap threshold for large single-file scans: + +```go +package sources + +import ( + "context" + "os" + + "golang.org/x/exp/mmap" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +const defaultChunkSize = 4096 +const chunkOverlap = 256 + +// FileSource reads a single file and emits overlapping chunks. +// For files >= MmapThreshold it uses golang.org/x/exp/mmap. 
+type FileSource struct { + Path string + ChunkSize int +} + +func NewFileSource(path string) *FileSource { + return &FileSource{Path: path, ChunkSize: defaultChunkSize} +} + +func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + fi, err := os.Stat(f.Path) + if err != nil { + return err + } + size := fi.Size() + if size == 0 { + return nil + } + var data []byte + if size >= MmapThreshold { + ra, err := mmap.Open(f.Path) + if err != nil { + return err + } + defer ra.Close() + data = make([]byte, ra.Len()) + if _, err := ra.ReadAt(data, 0); err != nil { + return err + } + } else { + data, err = os.ReadFile(f.Path) + if err != nil { + return err + } + } + if isBinary(data) { + return nil + } + return emitChunks(ctx, data, f.Path, f.ChunkSize, out) +} +``` + +Create `pkg/engine/sources/dir_test.go` with a comprehensive suite: + +```go +package sources + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func drain(t *testing.T, src Source) []types.Chunk { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + out := make(chan types.Chunk, 1024) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + var got []types.Chunk + for c := range out { + got = append(got, c) + } + require.NoError(t, <-errCh) + return got +} + +func writeFile(t *testing.T, path, content string) { + t.Helper() + require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755)) + require.NoError(t, os.WriteFile(path, []byte(content), 0o644)) +} + +func TestDirSource_RecursiveWalk(t *testing.T) { + root := t.TempDir() + writeFile(t, filepath.Join(root, "a.txt"), "alpha content") + writeFile(t, filepath.Join(root, "sub", "b.txt"), "bravo content") + writeFile(t, filepath.Join(root, "sub", "deep", "c.txt"), "charlie content") + + chunks := drain(t, 
NewDirSourceRaw(root, nil)) + require.Len(t, chunks, 3) + + sources := make([]string, 0, len(chunks)) + for _, c := range chunks { + sources = append(sources, c.Source) + } + // Deterministic sorted order. + require.True(t, sort_IsSorted(sources), "emission order must be sorted, got %v", sources) +} + +func sort_IsSorted(s []string) bool { + for i := 1; i < len(s); i++ { + if s[i-1] > s[i] { + return false + } + } + return true +} + +func TestDirSource_DefaultExcludes(t *testing.T) { + root := t.TempDir() + writeFile(t, filepath.Join(root, "keep.txt"), "keep me") + writeFile(t, filepath.Join(root, ".git", "config"), "[core]") + writeFile(t, filepath.Join(root, "node_modules", "foo.js"), "x") + writeFile(t, filepath.Join(root, "vendor", "bar.go"), "package x") + writeFile(t, filepath.Join(root, "app.min.js"), "y") + writeFile(t, filepath.Join(root, "app.js.map"), "{}") + + chunks := drain(t, NewDirSource(root)) + require.Len(t, chunks, 1) + require.Contains(t, chunks[0].Source, "keep.txt") +} + +func TestDirSource_UserExclude(t *testing.T) { + root := t.TempDir() + writeFile(t, filepath.Join(root, "keep.txt"), "keep") + writeFile(t, filepath.Join(root, "drop.log"), "drop") + + chunks := drain(t, NewDirSourceRaw(root, []string{"*.log"})) + require.Len(t, chunks, 1) + require.Contains(t, chunks[0].Source, "keep.txt") +} + +func TestDirSource_BinarySkipped(t *testing.T) { + root := t.TempDir() + writeFile(t, filepath.Join(root, "text.txt"), "plain text content") + binPath := filepath.Join(root, "blob.bin") + require.NoError(t, os.WriteFile(binPath, []byte{0x7f, 'E', 'L', 'F', 0x00, 0x01, 0x02}, 0o644)) + + chunks := drain(t, NewDirSourceRaw(root, nil)) + require.Len(t, chunks, 1) + require.Contains(t, chunks[0].Source, "text.txt") +} + +func TestDirSource_MmapLargeFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping large file test in short mode") + } + root := t.TempDir() + big := filepath.Join(root, "big.txt") + // Construct a payload slightly above 
MmapThreshold. + payload := strings.Repeat("API_KEY=xxxxxxxxxxxxxxxxxxxx\n", (int(MmapThreshold)/28)+10) + require.NoError(t, os.WriteFile(big, []byte(payload), 0o644)) + + chunks := drain(t, NewDirSourceRaw(root, nil)) + // Sanity-check that the mmap path emitted chunks attributed to the big file. + require.NotEmpty(t, chunks) + require.Equal(t, big, chunks[0].Source) +} + +func TestDirSource_MissingRoot(t *testing.T) { + src := NewDirSourceRaw("/definitely/does/not/exist/keyhunter-xyz", nil) + ctx := context.Background() + out := make(chan types.Chunk, 1) + err := src.Chunks(ctx, out) + require.Error(t, err) +} + +func TestDirSource_CtxCancellation(t *testing.T) { + root := t.TempDir() + for i := 0; i < 50; i++ { + writeFile(t, filepath.Join(root, "f", string(rune('a'+i%26))+".txt"), "payload") + } + ctx, cancel := context.WithCancel(context.Background()) + cancel() // pre-cancelled + out := make(chan types.Chunk, 1024) + err := NewDirSourceRaw(root, nil).Chunks(ctx, out) + require.ErrorIs(t, err, context.Canceled) +} +``` + +Also add a minimal update to `pkg/engine/sources/file_test.go` if it exists — if not present, skip. Do NOT alter any other source files in this plan. + + + go test ./pkg/engine/sources/... -run 'TestDirSource|TestFileSource' -race -count=1 + + + - `go build ./pkg/engine/sources/...` exits 0 + - `go test ./pkg/engine/sources/... -run TestDirSource -race -count=1` passes all subtests + - `grep -n "mmap.Open" pkg/engine/sources/dir.go pkg/engine/sources/file.go` returns two hits + - `grep -n "filepath.WalkDir" pkg/engine/sources/dir.go` returns a hit + - `grep -n "DefaultExcludes" pkg/engine/sources/dir.go` returns a hit + - `grep -n "isBinary" pkg/engine/sources/dir.go` returns a hit + + + DirSource implements Source, walks recursively, honors default and user glob exclusions, skips binary files, and uses mmap above 10MB. FileSource refactored to share the same mmap/emit helpers. All tests green under -race. + + + + + + +- `go test ./pkg/engine/sources/... 
-race -count=1` passes +- `go vet ./pkg/engine/sources/...` clean +- All acceptance criteria grep matches hit + + + +A caller can create `sources.NewDirSource("./myrepo", "*.log")` and receive chunks for every non-excluded, non-binary file in deterministic order, with files >10MB read via mmap. + + + +After completion, create `.planning/phases/04-input-sources/04-02-SUMMARY.md` documenting: +- File list with line counts +- Test names and pass status +- Any deviations from the planned exclude semantics (e.g., `**` handling) + diff --git a/.planning/phases/04-input-sources/04-03-PLAN.md b/.planning/phases/04-input-sources/04-03-PLAN.md new file mode 100644 index 0000000..a25b8b9 --- /dev/null +++ b/.planning/phases/04-input-sources/04-03-PLAN.md @@ -0,0 +1,456 @@ +--- +phase: 04-input-sources +plan: 03 +type: execute +wave: 1 +depends_on: ["04-01"] +files_modified: + - pkg/engine/sources/git.go + - pkg/engine/sources/git_test.go +autonomous: true +requirements: + - INPUT-02 +must_haves: + truths: + - "GitSource opens a local git repo via go-git and iterates commits on all branches and tags" + - "Each unique blob (by OID) is scanned exactly once — duplicate blobs across commits are skipped" + - "Finding.Source is formatted as 'git::' for every emitted chunk" + - "--since filter (passed via GitSource.Since time.Time) excludes commits older than the cutoff" + - "Bare repos and regular repos with worktrees both work" + artifacts: + - path: "pkg/engine/sources/git.go" + provides: "GitSource implementing Source interface via go-git/v5" + exports: ["GitSource", "NewGitSource"] + min_lines: 120 + - path: "pkg/engine/sources/git_test.go" + provides: "Tests using an in-process go-git repo fixture" + min_lines: 100 + key_links: + - from: "pkg/engine/sources/git.go" + to: "github.com/go-git/go-git/v5" + via: "git.PlainOpen" + pattern: "git\\.PlainOpen" + - from: "pkg/engine/sources/git.go" + to: "repo.References" + via: "iterating refs/heads + refs/tags" + pattern: 
"References\\(\\)" + - from: "pkg/engine/sources/git.go" + to: "types.Chunk" + via: "channel send with git:sha:path source" + pattern: "git:" +--- + + +Implement `GitSource` — a git-history-aware input adapter that walks every commit across every branch and tag in a local repository, deduplicates blob scans by OID, and emits chunks with commit-SHA-prefixed source identifiers. Satisfies INPUT-02. + +Purpose: Leaked keys often exist only in git history — deleted from HEAD but still reachable via old commits. A one-shot HEAD scan misses them. This source walks the full commit graph using `go-git/v5` with blob-level deduplication so a 10k-commit repo with 200k historical files scans in minutes, not hours. +Output: `pkg/engine/sources/git.go` and `git_test.go`. Wired into CLI in plan 04-05. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/phases/04-input-sources/04-CONTEXT.md +@pkg/engine/sources/source.go +@pkg/types/chunk.go + + +Source interface: +```go +type Source interface { + Chunks(ctx context.Context, out chan<- types.Chunk) error +} +``` + +Chunk struct: +```go +type Chunk struct { + Data []byte + Source string // will be "git::" + Offset int64 +} +``` + +Relevant go-git/v5 APIs (from https://pkg.go.dev/github.com/go-git/go-git/v5): +```go +import "github.com/go-git/go-git/v5" +import "github.com/go-git/go-git/v5/plumbing" +import "github.com/go-git/go-git/v5/plumbing/object" + +repo, err := git.PlainOpen(path) // opens local repo +refs, err := repo.References() // iterator over refs +refs.ForEach(func(*plumbing.Reference) error { }) // walk refs +commit, err := repo.CommitObject(hash) // resolve commit +iter, err := repo.Log(&git.LogOptions{From: hash, All: false}) +iter.ForEach(func(*object.Commit) error { }) // walk commits +tree, err := commit.Tree() +tree.Files().ForEach(func(*object.File) error { }) // walk blobs +file.Contents() // returns (string, 
error) +file.Binary() // (bool, error) +file.Hash // plumbing.Hash (blob OID) +``` + +emitChunks helper from 04-02 plan (pkg/engine/sources/dir.go) — reuse: +```go +func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error +``` + + + + + + + Task 1: Implement GitSource with full-history traversal and blob deduplication + + - pkg/engine/sources/source.go + - pkg/engine/sources/dir.go (for emitChunks helper — produced by plan 04-02) + - pkg/types/chunk.go + - .planning/phases/04-input-sources/04-CONTEXT.md (Git History section) + + + pkg/engine/sources/git.go, + pkg/engine/sources/git_test.go + + + - Test 1: GitSource on a fresh repo with 3 commits (each adding a file) emits exactly 3 unique blob scans + - Test 2: Second commit modifying file A creates a new blob — both old and new versions are scanned + - Test 3: Duplicate blob (same content in two files on same commit) is scanned once (dedup by OID) + - Test 4: Multi-branch repo — branch A with file X, branch B with file Y — both are scanned + - Test 5: Tag pointing to an old commit makes that commit's blobs reachable + - Test 6: Since filter set to "now + 1 hour" emits zero chunks + - Test 7: Finding.Source field matches pattern `git:[0-9a-f]{7}:.*` + - Test 8: Non-existent repo path returns an error + + +Create `pkg/engine/sources/git.go`: + +```go +package sources + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "time" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing" + "github.com/go-git/go-git/v5/plumbing/object" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// GitSource scans the full history of a local git repository: every commit +// on every branch and tag, deduplicating blob scans by OID. +type GitSource struct { + // RepoPath is the path to the local git repo (working tree or bare). + RepoPath string + // Since, if non-zero, excludes commits older than this timestamp + // (using commit author date). 
+ Since time.Time + // ChunkSize is the overlap-chunker size; zero uses defaultChunkSize. + ChunkSize int +} + +// NewGitSource creates a GitSource for the given repo path. +func NewGitSource(repoPath string) *GitSource { + return &GitSource{RepoPath: repoPath, ChunkSize: defaultChunkSize} +} + +// Chunks walks every commit reachable from every branch, tag, and the +// stash ref (if present), streaming each unique blob's content through +// the shared emitChunks helper. +func (g *GitSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + if g.RepoPath == "" { + return errors.New("GitSource: RepoPath is empty") + } + repo, err := git.PlainOpen(g.RepoPath) + if err != nil { + return fmt.Errorf("GitSource: open %q: %w", g.RepoPath, err) + } + + // Collect commit hashes to walk from every ref under refs/heads, refs/tags, refs/stash. + seedCommits, err := collectSeedCommits(repo) + if err != nil { + return fmt.Errorf("GitSource: collect refs: %w", err) + } + if len(seedCommits) == 0 { + return nil // empty repo is not an error + } + + seenCommits := make(map[plumbing.Hash]struct{}) + seenBlobs := make(map[plumbing.Hash]struct{}) + + for _, seed := range seedCommits { + if err := ctx.Err(); err != nil { + return err + } + iter, err := repo.Log(&git.LogOptions{From: seed, All: false}) + if err != nil { + continue + } + err = iter.ForEach(func(c *object.Commit) error { + if ctxErr := ctx.Err(); ctxErr != nil { + return ctxErr + } + if _, ok := seenCommits[c.Hash]; ok { + return nil + } + seenCommits[c.Hash] = struct{}{} + + if !g.Since.IsZero() && c.Author.When.Before(g.Since) { + return nil + } + return g.emitCommitBlobs(ctx, c, seenBlobs, out) + }) + iter.Close() + if err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + // Swallow per-seed iterator errors; continue with other refs. 
+ } + } + return nil +} + +// collectSeedCommits gathers commit hashes from all local branches, tags, +// and the stash ref — the union of which reaches every commit worth scanning. +func collectSeedCommits(repo *git.Repository) ([]plumbing.Hash, error) { + var seeds []plumbing.Hash + refs, err := repo.References() + if err != nil { + return nil, err + } + err = refs.ForEach(func(ref *plumbing.Reference) error { + name := ref.Name() + if !(name.IsBranch() || name.IsTag() || name == plumbing.ReferenceName("refs/stash") || name.IsRemote()) { + return nil + } + hash := ref.Hash() + // For annotated tags the ref points at a tag object; resolve to commit if possible. + if name.IsTag() { + if tag, err := repo.TagObject(hash); err == nil { + if c, err := tag.Commit(); err == nil { + hash = c.Hash + } + } + } + // Skip symbolic refs (HEAD) whose target we already walked via IsBranch. + seeds = append(seeds, hash) + return nil + }) + return seeds, err +} + +// emitCommitBlobs walks the tree of a commit and emits every blob whose +// OID has not already been scanned. +func (g *GitSource) emitCommitBlobs(ctx context.Context, c *object.Commit, seenBlobs map[plumbing.Hash]struct{}, out chan<- types.Chunk) error { + tree, err := c.Tree() + if err != nil { + return nil // skip unreadable tree + } + shortSHA := c.Hash.String()[:7] + + return tree.Files().ForEach(func(f *object.File) error { + if err := ctx.Err(); err != nil { + return err + } + if _, ok := seenBlobs[f.Hash]; ok { + return nil + } + seenBlobs[f.Hash] = struct{}{} + + // Skip obviously-binary blobs via go-git's helper, then via our sniff. 
+ if isBin, _ := f.IsBinary(); isBin { + return nil + } + reader, err := f.Reader() + if err != nil { + return nil + } + defer reader.Close() + data, err := io.ReadAll(reader) + if err != nil { + return nil + } + if len(data) == 0 { + return nil + } + if bytes.IndexByte(data[:minInt(len(data), BinarySniffSize)], 0x00) >= 0 { + return nil + } + + source := fmt.Sprintf("git:%s:%s", shortSHA, f.Name) + return emitChunks(ctx, data, source, g.ChunkSize, out) + }) +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} +``` + +Create `pkg/engine/sources/git_test.go` using go-git's in-process fixtures: + +```go +package sources + +import ( + "context" + "os" + "path/filepath" + "regexp" + "testing" + "time" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing/object" + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func initRepo(t *testing.T) (string, *git.Repository) { + t.Helper() + dir := t.TempDir() + repo, err := git.PlainInit(dir, false) + require.NoError(t, err) + return dir, repo +} + +func commitFile(t *testing.T, dir string, repo *git.Repository, name, content string) { + t.Helper() + path := filepath.Join(dir, name) + require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755)) + require.NoError(t, os.WriteFile(path, []byte(content), 0o644)) + wt, err := repo.Worktree() + require.NoError(t, err) + _, err = wt.Add(name) + require.NoError(t, err) + _, err = wt.Commit("add "+name, &git.CommitOptions{ + Author: &object.Signature{Name: "test", Email: "t@x", When: time.Now()}, + }) + require.NoError(t, err) +} + +func drainGit(t *testing.T, src Source) []types.Chunk { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + out := make(chan types.Chunk, 1024) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + var got []types.Chunk + for c := range out { + got = append(got, c) + } + 
require.NoError(t, <-errCh) + return got +} + +func TestGitSource_HistoryWalk(t *testing.T) { + dir, repo := initRepo(t) + commitFile(t, dir, repo, "a.txt", "contents alpha") + commitFile(t, dir, repo, "b.txt", "contents bravo") + commitFile(t, dir, repo, "c.txt", "contents charlie") + + chunks := drainGit(t, NewGitSource(dir)) + require.GreaterOrEqual(t, len(chunks), 3) + + re := regexp.MustCompile(`^git:[0-9a-f]{7}:.+$`) + for _, c := range chunks { + require.Regexp(t, re, c.Source) + } +} + +func TestGitSource_BlobDeduplication(t *testing.T) { + dir, repo := initRepo(t) + commitFile(t, dir, repo, "a.txt", "same exact content everywhere") + commitFile(t, dir, repo, "b.txt", "same exact content everywhere") // identical blob -> same OID + commitFile(t, dir, repo, "c.txt", "different content here") + + chunks := drainGit(t, NewGitSource(dir)) + // Expect 2 unique blobs scanned, not 3 files. + unique := make(map[string]bool) + for _, c := range chunks { + unique[string(c.Data)] = true + } + require.Len(t, unique, 2, "duplicate blobs must be deduped by OID") +} + +func TestGitSource_ModifiedFileKeepsBothVersions(t *testing.T) { + dir, repo := initRepo(t) + commitFile(t, dir, repo, "a.txt", "version one") + commitFile(t, dir, repo, "a.txt", "version two") // modifying produces a second blob + + chunks := drainGit(t, NewGitSource(dir)) + bodies := make(map[string]bool) + for _, c := range chunks { + bodies[string(c.Data)] = true + } + require.True(t, bodies["version one"], "old version must still be scanned") + require.True(t, bodies["version two"], "new version must be scanned") +} + +func TestGitSource_SinceFilterExcludesAll(t *testing.T) { + dir, repo := initRepo(t) + commitFile(t, dir, repo, "a.txt", "alpha") + + src := NewGitSource(dir) + src.Since = time.Now().Add(1 * time.Hour) + chunks := drainGit(t, src) + require.Empty(t, chunks) +} + +func TestGitSource_MissingRepo(t *testing.T) { + src := NewGitSource(filepath.Join(t.TempDir(), "not-a-repo")) + ctx := 
context.Background() + out := make(chan types.Chunk, 1) + err := src.Chunks(ctx, out) + require.Error(t, err) +} +``` + +Do NOT touch any file outside `pkg/engine/sources/git.go` and `pkg/engine/sources/git_test.go`. CLI wire-up happens in plan 04-05. + + + go test ./pkg/engine/sources/... -run TestGitSource -race -count=1 -timeout=60s + + + - `go build ./pkg/engine/sources/...` exits 0 + - `go test ./pkg/engine/sources/... -run TestGitSource -race -count=1` passes all subtests + - `grep -n "git.PlainOpen" pkg/engine/sources/git.go` returns a hit + - `grep -n "seenBlobs" pkg/engine/sources/git.go` returns a hit (dedup map) + - `grep -n "fmt.Sprintf(\"git:%s:%s\"" pkg/engine/sources/git.go` returns a hit + - `grep -n "g.Since" pkg/engine/sources/git.go` returns a hit + + + GitSource walks all branches/tags, emits each unique blob once, honors Since filter, formats source as `git::`, and tests cover dedup/history/since/missing-repo. + + + + + + +- `go test ./pkg/engine/sources/... -run TestGitSource -race` passes +- `go vet ./pkg/engine/sources/...` clean +- All grep acceptance checks hit + + + +A caller can `sources.NewGitSource("./myrepo")` and receive chunks for every historical blob across all refs, with deterministic dedup and source attribution in `git::` form. + + + +After completion, create `.planning/phases/04-input-sources/04-03-SUMMARY.md` documenting file list, test results, and the go-git version resolved by plan 04-01. 
+ diff --git a/.planning/phases/04-input-sources/04-04-PLAN.md b/.planning/phases/04-input-sources/04-04-PLAN.md new file mode 100644 index 0000000..75f9edd --- /dev/null +++ b/.planning/phases/04-input-sources/04-04-PLAN.md @@ -0,0 +1,624 @@ +--- +phase: 04-input-sources +plan: 04 +type: execute +wave: 1 +depends_on: ["04-01"] +files_modified: + - pkg/engine/sources/stdin.go + - pkg/engine/sources/stdin_test.go + - pkg/engine/sources/url.go + - pkg/engine/sources/url_test.go + - pkg/engine/sources/clipboard.go + - pkg/engine/sources/clipboard_test.go +autonomous: true +requirements: + - INPUT-03 + - INPUT-04 + - INPUT-05 +must_haves: + truths: + - "StdinSource reads from an io.Reader and emits chunks with Source='stdin'" + - "URLSource fetches an http/https URL with 30s timeout, 50MB cap, rejects file:// and other schemes, and emits chunks with Source='url:'" + - "URLSource rejects responses with non-text Content-Type unless allowlisted (text/*, application/json, application/javascript, application/xml)" + - "ClipboardSource reads current clipboard via atotto/clipboard and emits chunks with Source='clipboard'" + - "ClipboardSource returns a clear error if clipboard tooling is unavailable" + artifacts: + - path: "pkg/engine/sources/stdin.go" + provides: "StdinSource" + exports: ["StdinSource", "NewStdinSource"] + min_lines: 40 + - path: "pkg/engine/sources/url.go" + provides: "URLSource with HTTP fetch, timeout, size cap, content-type filter" + exports: ["URLSource", "NewURLSource"] + min_lines: 100 + - path: "pkg/engine/sources/clipboard.go" + provides: "ClipboardSource wrapping atotto/clipboard" + exports: ["ClipboardSource", "NewClipboardSource"] + min_lines: 30 + key_links: + - from: "pkg/engine/sources/url.go" + to: "net/http" + via: "http.Client with Timeout" + pattern: "http\\.Client" + - from: "pkg/engine/sources/url.go" + to: "io.LimitReader" + via: "MaxContentLength enforcement" + pattern: "LimitReader" + - from: "pkg/engine/sources/clipboard.go" + to: 
"github.com/atotto/clipboard" + via: "clipboard.ReadAll" + pattern: "clipboard\\.ReadAll" +--- + + +Implement three smaller Source adapters in a single plan since each is <80 lines and they share no state: +- `StdinSource` reads from an injectable `io.Reader` (defaults to `os.Stdin`) — INPUT-03 +- `URLSource` fetches a remote URL via stdlib `net/http` with timeout, size cap, scheme whitelist, and content-type filter — INPUT-04 +- `ClipboardSource` reads the current clipboard via `github.com/atotto/clipboard` with graceful fallback — INPUT-05 + +Purpose: These three adapters complete the Phase 4 input surface area. Bundling them into one plan keeps wave-1 parallelism healthy (04-02 + 04-03 + 04-04 run simultaneously) while respecting the ~50% context budget since each adapter is self-contained and small. +Output: Six files total (three sources + three test files). + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/phases/04-input-sources/04-CONTEXT.md +@pkg/engine/sources/source.go +@pkg/types/chunk.go + + +Source interface: +```go +type Source interface { + Chunks(ctx context.Context, out chan<- types.Chunk) error +} +``` + +Shared helper (produced by plan 04-02 in pkg/engine/sources/dir.go): +```go +func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error +``` + +atotto/clipboard API: +```go +import "github.com/atotto/clipboard" +func ReadAll() (string, error) +func Unsupported bool // set on platforms without clipboard tooling +``` + + + + + + + Task 1: Implement StdinSource, URLSource, and ClipboardSource with full test coverage + + - pkg/engine/sources/source.go + - pkg/engine/sources/dir.go (for emitChunks signature from plan 04-02) + - pkg/types/chunk.go + - .planning/phases/04-input-sources/04-CONTEXT.md (Stdin, URL, Clipboard sections) + + + pkg/engine/sources/stdin.go, + 
pkg/engine/sources/stdin_test.go, + pkg/engine/sources/url.go, + pkg/engine/sources/url_test.go, + pkg/engine/sources/clipboard.go, + pkg/engine/sources/clipboard_test.go + + + StdinSource: + - Test 1: Feeding "API_KEY=xyz" through a bytes.Buffer emits one chunk with Source="stdin" + - Test 2: Empty input emits zero chunks without error + - Test 3: ctx cancellation returns ctx.Err() + URLSource: + - Test 4: Fetches content from httptest.Server, emits a chunk with Source="url:" + - Test 5: Server returning 50MB+1 body is rejected with a size error + - Test 6: Server returning Content-Type image/png is rejected + - Test 7: Scheme "file:///etc/passwd" is rejected without any request attempt + - Test 8: Server returning 500 returns a non-nil error containing "500" + - Test 9: HTTP 301 redirect is followed (max 5 hops) + ClipboardSource: + - Test 10: If clipboard.Unsupported is true, returns an error with "clipboard" in the message + - Test 11: Otherwise reads clipboard (may skip if empty on CI) — use build tag or t.Skip guard + + + +Create `pkg/engine/sources/stdin.go`: + +```go +package sources + +import ( + "context" + "io" + "os" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// StdinSource reads content from an io.Reader (defaults to os.Stdin) and +// emits overlapping chunks. Used when a user runs `keyhunter scan stdin` +// or `keyhunter scan -`. +type StdinSource struct { + Reader io.Reader + ChunkSize int +} + +// NewStdinSource returns a StdinSource bound to os.Stdin. +func NewStdinSource() *StdinSource { + return &StdinSource{Reader: os.Stdin, ChunkSize: defaultChunkSize} +} + +// NewStdinSourceFrom returns a StdinSource bound to the given reader +// (used primarily by tests). +func NewStdinSourceFrom(r io.Reader) *StdinSource { + return &StdinSource{Reader: r, ChunkSize: defaultChunkSize} +} + +// Chunks reads the entire input, then hands it to the shared chunk emitter. 
+func (s *StdinSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + if s.Reader == nil { + s.Reader = os.Stdin + } + data, err := io.ReadAll(s.Reader) + if err != nil { + return err + } + if len(data) == 0 { + return nil + } + return emitChunks(ctx, data, "stdin", s.ChunkSize, out) +} +``` + +Create `pkg/engine/sources/stdin_test.go`: + +```go +package sources + +import ( + "bytes" + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func TestStdinSource_Basic(t *testing.T) { + src := NewStdinSourceFrom(bytes.NewBufferString("API_KEY=sk-test-xyz")) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + out := make(chan types.Chunk, 8) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + + var got []types.Chunk + for c := range out { + got = append(got, c) + } + require.NoError(t, <-errCh) + require.Len(t, got, 1) + require.Equal(t, "stdin", got[0].Source) + require.Equal(t, "API_KEY=sk-test-xyz", string(got[0].Data)) +} + +func TestStdinSource_Empty(t *testing.T) { + src := NewStdinSourceFrom(bytes.NewBuffer(nil)) + out := make(chan types.Chunk, 1) + err := src.Chunks(context.Background(), out) + close(out) + require.NoError(t, err) + require.Len(t, out, 0) +} + +func TestStdinSource_CtxCancel(t *testing.T) { + // Large buffer so emitChunks iterates and can observe cancellation. 
+ data := make([]byte, 1<<20) + src := NewStdinSourceFrom(bytes.NewReader(data)) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + out := make(chan types.Chunk) // unbuffered forces select on ctx + err := src.Chunks(ctx, out) + require.ErrorIs(t, err, context.Canceled) +} +``` + +Create `pkg/engine/sources/url.go`: + +```go +package sources + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// MaxURLContentLength is the hard cap on URLSource response bodies. +const MaxURLContentLength int64 = 50 * 1024 * 1024 // 50 MB + +// DefaultURLTimeout is the overall request timeout (connect + read + body). +const DefaultURLTimeout = 30 * time.Second + +// allowedContentTypes is the whitelist of Content-Type prefixes URLSource +// will accept. Binary types (images, archives, executables) are rejected. +var allowedContentTypes = []string{ + "text/", + "application/json", + "application/javascript", + "application/xml", + "application/x-yaml", + "application/yaml", +} + +// URLSource fetches a remote resource over HTTP(S) and emits its body as chunks. +type URLSource struct { + URL string + Client *http.Client + UserAgent string + Insecure bool // skip TLS verification (default false) + ChunkSize int +} + +// NewURLSource creates a URLSource with sane defaults. +func NewURLSource(rawURL string) *URLSource { + return &URLSource{ + URL: rawURL, + Client: defaultHTTPClient(), + UserAgent: "keyhunter/dev", + ChunkSize: defaultChunkSize, + } +} + +func defaultHTTPClient() *http.Client { + return &http.Client{ + Timeout: DefaultURLTimeout, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + if len(via) >= 5 { + return errors.New("stopped after 5 redirects") + } + return nil + }, + } +} + +// Chunks validates the URL, issues a GET, and emits the response body as chunks. 
+func (u *URLSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + parsed, err := url.Parse(u.URL) + if err != nil { + return fmt.Errorf("URLSource: parse %q: %w", u.URL, err) + } + if parsed.Scheme != "http" && parsed.Scheme != "https" { + return fmt.Errorf("URLSource: unsupported scheme %q (only http/https)", parsed.Scheme) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.URL, nil) + if err != nil { + return fmt.Errorf("URLSource: new request: %w", err) + } + req.Header.Set("User-Agent", u.UserAgent) + + client := u.Client + if client == nil { + client = defaultHTTPClient() + } + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("URLSource: fetch: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("URLSource: non-2xx status %d from %s", resp.StatusCode, u.URL) + } + + ct := resp.Header.Get("Content-Type") + if !isAllowedContentType(ct) { + return fmt.Errorf("URLSource: disallowed Content-Type %q", ct) + } + + if resp.ContentLength > MaxURLContentLength { + return fmt.Errorf("URLSource: Content-Length %d exceeds cap %d", resp.ContentLength, MaxURLContentLength) + } + + // LimitReader cap + 1 to detect overflow even if ContentLength was missing/wrong. + limited := io.LimitReader(resp.Body, MaxURLContentLength+1) + data, err := io.ReadAll(limited) + if err != nil { + return fmt.Errorf("URLSource: read body: %w", err) + } + if int64(len(data)) > MaxURLContentLength { + return fmt.Errorf("URLSource: body exceeds %d bytes", MaxURLContentLength) + } + if len(data) == 0 { + return nil + } + + source := "url:" + u.URL + return emitChunks(ctx, data, source, u.ChunkSize, out) +} + +func isAllowedContentType(ct string) bool { + if ct == "" { + return true // some servers omit; trust and scan + } + // Strip parameters like "; charset=utf-8". 
+ if idx := strings.Index(ct, ";"); idx >= 0 { + ct = ct[:idx] + } + ct = strings.TrimSpace(strings.ToLower(ct)) + for _, prefix := range allowedContentTypes { + if strings.HasPrefix(ct, prefix) { + return true + } + } + return false +} +``` + +Create `pkg/engine/sources/url_test.go`: + +```go +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func drainURL(t *testing.T, src Source) ([]types.Chunk, error) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + out := make(chan types.Chunk, 256) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + var got []types.Chunk + for c := range out { + got = append(got, c) + } + return got, <-errCh +} + +func TestURLSource_Fetches(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte("API_KEY=sk-live-xyz")) + })) + defer srv.Close() + + chunks, err := drainURL(t, NewURLSource(srv.URL)) + require.NoError(t, err) + require.Len(t, chunks, 1) + require.Equal(t, "url:"+srv.URL, chunks[0].Source) + require.Equal(t, "API_KEY=sk-live-xyz", string(chunks[0].Data)) +} + +func TestURLSource_RejectsBinaryContentType(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "image/png") + _, _ = w.Write([]byte{0x89, 0x50, 0x4e, 0x47}) + })) + defer srv.Close() + + _, err := drainURL(t, NewURLSource(srv.URL)) + require.Error(t, err) + require.Contains(t, err.Error(), "Content-Type") +} + +func TestURLSource_RejectsNonHTTPScheme(t *testing.T) { + _, err := drainURL(t, NewURLSource("file:///etc/passwd")) + require.Error(t, err) + require.Contains(t, err.Error(), "unsupported scheme") +} + 
+func TestURLSource_Rejects500(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, "boom", http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+
+	_, err := drainURL(t, NewURLSource(srv.URL))
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "500")
+}
+
+func TestURLSource_RejectsOversizeBody(t *testing.T) {
+	// Serve a body just over the cap so the LimitReader overflow check triggers.
+	// Note: this allocates the full 50MB cap in memory; acceptable but not tiny.
+	big := strings.Repeat("a", int(MaxURLContentLength)+10)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		_, _ = w.Write([]byte(big))
+	}))
+	defer srv.Close()
+
+	_, err := drainURL(t, NewURLSource(srv.URL))
+	require.Error(t, err)
+}
+
+func TestURLSource_FollowsRedirect(t *testing.T) {
+	target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		_, _ = w.Write([]byte("redirected body"))
+	}))
+	defer target.Close()
+
+	redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Redirect(w, r, target.URL, http.StatusMovedPermanently)
+	}))
+	defer redirector.Close()
+
+	chunks, err := drainURL(t, NewURLSource(redirector.URL))
+	require.NoError(t, err)
+	require.NotEmpty(t, chunks)
+	require.Contains(t, string(chunks[0].Data), "redirected body")
+}
+```
+
+Create `pkg/engine/sources/clipboard.go`:
+
+```go
+package sources
+
+import (
+	"context"
+	"errors"
+	"fmt"
+
+	"github.com/atotto/clipboard"
+
+	"github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// ClipboardSource reads the current OS clipboard contents and emits them
+// as a single chunk stream with Source="clipboard". Requires xclip/xsel/
+// wl-clipboard on Linux, pbpaste on macOS, or native API on Windows.
+type ClipboardSource struct {
+	// Reader overrides the clipboard reader; when nil the real clipboard is used.
+ // Tests inject a func returning a fixture. + Reader func() (string, error) + ChunkSize int +} + +// NewClipboardSource returns a ClipboardSource bound to the real OS clipboard. +func NewClipboardSource() *ClipboardSource { + return &ClipboardSource{Reader: clipboard.ReadAll, ChunkSize: defaultChunkSize} +} + +// Chunks reads the clipboard and emits its contents. +func (c *ClipboardSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + if clipboard.Unsupported && c.Reader == nil { + return errors.New("ClipboardSource: clipboard tooling unavailable (install xclip/xsel/wl-clipboard on Linux)") + } + reader := c.Reader + if reader == nil { + reader = clipboard.ReadAll + } + text, err := reader() + if err != nil { + return fmt.Errorf("ClipboardSource: read: %w", err) + } + if text == "" { + return nil + } + return emitChunks(ctx, []byte(text), "clipboard", c.ChunkSize, out) +} +``` + +Create `pkg/engine/sources/clipboard_test.go`: + +```go +package sources + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func TestClipboardSource_FixtureReader(t *testing.T) { + src := &ClipboardSource{ + Reader: func() (string, error) { return "sk-live-xxxxxx", nil }, + ChunkSize: defaultChunkSize, + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + out := make(chan types.Chunk, 4) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + + var got []types.Chunk + for c := range out { + got = append(got, c) + } + require.NoError(t, <-errCh) + require.Len(t, got, 1) + require.Equal(t, "clipboard", got[0].Source) + require.Equal(t, "sk-live-xxxxxx", string(got[0].Data)) +} + +func TestClipboardSource_ReaderError(t *testing.T) { + src := &ClipboardSource{ + Reader: func() (string, error) { return "", errors.New("no xclip installed") }, + } + out := make(chan types.Chunk, 1) + err := 
src.Chunks(context.Background(), out) + require.Error(t, err) + require.Contains(t, err.Error(), "clipboard") +} + +func TestClipboardSource_EmptyClipboard(t *testing.T) { + src := &ClipboardSource{ + Reader: func() (string, error) { return "", nil }, + } + out := make(chan types.Chunk, 1) + err := src.Chunks(context.Background(), out) + require.NoError(t, err) + require.Len(t, out, 0) +} +``` + +Do NOT modify `cmd/scan.go` in this plan. Do NOT create `pkg/engine/sources/dir.go`, `git.go`, or touch `file.go` — those are owned by plans 04-02 and 04-03. + + + go test ./pkg/engine/sources/... -run 'TestStdinSource|TestURLSource|TestClipboardSource' -race -count=1 + + + - `go build ./pkg/engine/sources/...` exits 0 + - `go test ./pkg/engine/sources/... -run 'TestStdinSource|TestURLSource|TestClipboardSource' -race` passes all subtests + - `grep -n "http.Client" pkg/engine/sources/url.go` hits + - `grep -n "LimitReader" pkg/engine/sources/url.go` hits + - `grep -n "clipboard.ReadAll" pkg/engine/sources/clipboard.go` hits + - `grep -n "\"stdin\"" pkg/engine/sources/stdin.go` hits (source label) + - `grep -n "\"url:\" + u.URL\\|\"url:\"+u.URL" pkg/engine/sources/url.go` hits + + + StdinSource, URLSource, and ClipboardSource all implement Source, enforce their respective safety limits (stdin read-to-EOF, url scheme/size/content-type whitelist, clipboard tooling check), and their tests pass under -race. + + + + + + +- `go test ./pkg/engine/sources/... -race -count=1` passes including new tests +- `go vet ./pkg/engine/sources/...` clean +- All grep acceptance checks hit + + + +Three new source adapters exist, each self-contained, each with test coverage, and none conflicting with file ownership of plans 04-02 (dir/file) or 04-03 (git). + + + +After completion, create `.planning/phases/04-input-sources/04-04-SUMMARY.md` listing the six files created, test names with pass status, and any platform-specific notes about clipboard tests on the executor's CI environment. 
+ diff --git a/.planning/phases/04-input-sources/04-05-PLAN.md b/.planning/phases/04-input-sources/04-05-PLAN.md new file mode 100644 index 0000000..35044de --- /dev/null +++ b/.planning/phases/04-input-sources/04-05-PLAN.md @@ -0,0 +1,435 @@ +--- +phase: 04-input-sources +plan: 05 +type: execute +wave: 2 +depends_on: ["04-02", "04-03", "04-04"] +files_modified: + - cmd/scan.go + - cmd/scan_sources_test.go +autonomous: true +requirements: + - INPUT-06 +must_haves: + truths: + - "keyhunter scan uses DirSource when target is a directory (not FileSource)" + - "keyhunter scan continues to use FileSource when target is a single file" + - "keyhunter scan --git uses GitSource, honoring --since YYYY-MM-DD" + - "keyhunter scan stdin and keyhunter scan - both use StdinSource" + - "keyhunter scan --url uses URLSource" + - "keyhunter scan --clipboard uses ClipboardSource (no positional arg required)" + - "--exclude flags are forwarded to DirSource" + - "Exactly one source is selected — conflicting flags return an error" + artifacts: + - path: "cmd/scan.go" + provides: "Source-selection logic dispatching to the appropriate Source implementation" + contains: "selectSource" + min_lines: 180 + - path: "cmd/scan_sources_test.go" + provides: "Unit tests for selectSource covering every flag combination" + min_lines: 80 + key_links: + - from: "cmd/scan.go" + to: "pkg/engine/sources" + via: "sources.NewDirSource/NewGitSource/NewStdinSource/NewURLSource/NewClipboardSource" + pattern: "sources\\.New(Dir|Git|Stdin|URL|Clipboard)Source" + - from: "cmd/scan.go" + to: "cobra flags" + via: "--git, --url, --clipboard, --since, --exclude" + pattern: "\\-\\-git|\\-\\-url|\\-\\-clipboard|\\-\\-since" +--- + + +Wire the four new source adapters (DirSource, GitSource, StdinSource, URLSource, ClipboardSource) into `cmd/scan.go` via a new `selectSource` helper that inspects CLI flags and positional args to pick exactly one source. 
Satisfies INPUT-06 (the "all inputs flow through the same pipeline" integration requirement). + +Purpose: Plans 04-02 through 04-04 deliver the Source implementations in isolation. This plan is the single integration point that makes them reachable from the CLI, with argument validation to prevent ambiguous invocations like `keyhunter scan --git --url https://...`. +Output: Updated `cmd/scan.go` with new flags and dispatching logic, plus a focused test file exercising `selectSource` directly. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/phases/04-input-sources/04-CONTEXT.md +@cmd/scan.go +@pkg/engine/sources/source.go + + +Source constructors from Wave 1 plans: +```go +// Plan 04-02 +func NewFileSource(path string) *FileSource +func NewDirSource(root string, extraExcludes ...string) *DirSource +func NewDirSourceRaw(root string, excludes []string) *DirSource + +// Plan 04-03 +func NewGitSource(repoPath string) *GitSource +type GitSource struct { + RepoPath string + Since time.Time + ChunkSize int +} + +// Plan 04-04 +func NewStdinSource() *StdinSource +func NewURLSource(rawURL string) *URLSource +func NewClipboardSource() *ClipboardSource +``` + +Existing cmd/scan.go contract (see file for full body): +- Package `cmd` +- Uses `sources.NewFileSource(target)` unconditionally today +- Has `flagExclude []string` already declared +- init() registers flags: --workers, --verify, --unmask, --output, --exclude + + + + + + + Task 1: Add source-selection flags and dispatch logic to cmd/scan.go + + - cmd/scan.go (full file) + - pkg/engine/sources/source.go + - pkg/engine/sources/dir.go (produced by 04-02) + - pkg/engine/sources/git.go (produced by 04-03) + - pkg/engine/sources/stdin.go (produced by 04-04) + - pkg/engine/sources/url.go (produced by 04-04) + - pkg/engine/sources/clipboard.go (produced by 04-04) + + cmd/scan.go, cmd/scan_sources_test.go + + - Test 1: 
selectSource with target="." on a directory returns a *DirSource + - Test 2: selectSource with target pointing to a file returns a *FileSource + - Test 3: selectSource with flagGit=true and target="./repo" returns a *GitSource + - Test 4: selectSource with flagGit=true and flagSince="2024-01-01" sets GitSource.Since correctly + - Test 5: selectSource with invalid --since format returns a parse error + - Test 6: selectSource with flagURL set returns a *URLSource + - Test 7: selectSource with flagClipboard=true and no args returns a *ClipboardSource + - Test 8: selectSource with target="stdin" returns a *StdinSource + - Test 9: selectSource with target="-" returns a *StdinSource + - Test 10: selectSource with both --git and --url set returns an error + - Test 11: selectSource with --clipboard and a positional target returns an error + - Test 12: selectSource forwards --exclude patterns into DirSource.Excludes + + + +Edit `cmd/scan.go`. The end state must: + +1. Add new package-level flag vars alongside the existing ones: + +```go +var ( + flagWorkers int + flagVerify bool + flagUnmask bool + flagOutput string + flagExclude []string + flagGit bool + flagURL string + flagClipboard bool + flagSince string + flagMaxFileSize int64 + flagInsecure bool +) +``` + +2. Change `scanCmd.Args` so a positional target is optional when `--url` or `--clipboard` is used: + +```go +var scanCmd = &cobra.Command{ + Use: "scan [path|stdin|-]", + Short: "Scan files, directories, git history, stdin, URLs, or clipboard for leaked API keys", + Args: cobra.MaximumNArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + // ... existing config load ... + + src, err := selectSource(args, sourceFlags{ + Git: flagGit, + URL: flagURL, + Clipboard: flagClipboard, + Since: flagSince, + Excludes: flagExclude, + }) + if err != nil { + return err + } + + // Replace the old `src := sources.NewFileSource(target)` line with use of the dispatched src. 
+		// Keep all downstream code unchanged (engine, storage, output).
+
+		// ... rest of existing RunE body, using src ...
+		_ = src
+		return nil // placeholder — keep existing logic
+	},
+}
+```
+
+3. Add the selectSource helper and its supporting struct, in `cmd/scan.go`:
+
+```go
+// sourceFlags captures the CLI inputs that control source selection.
+// Extracted into a struct so selectSource is straightforward to unit test.
+type sourceFlags struct {
+	Git       bool
+	URL       string
+	Clipboard bool
+	Since     string
+	Excludes  []string
+}
+
+// selectSource inspects positional args and source flags, validates that
+// exactly one source is specified, and returns the appropriate Source.
+func selectSource(args []string, f sourceFlags) (sources.Source, error) {
+	// Enforce mutual exclusion: at most one of --git, --url, --clipboard
+	// may be set (--git additionally takes a positional repo path).
+	explicitCount := 0
+	if f.URL != "" {
+		explicitCount++
+	}
+	if f.Clipboard {
+		explicitCount++
+	}
+	if f.Git {
+		explicitCount++
+	}
+	if explicitCount > 1 {
+		return nil, fmt.Errorf("scan: --git, --url, and --clipboard are mutually exclusive")
+	}
+
+	// Clipboard and URL take no positional argument.
+ if f.Clipboard { + if len(args) > 0 { + return nil, fmt.Errorf("scan: --clipboard does not accept a positional argument") + } + return sources.NewClipboardSource(), nil + } + if f.URL != "" { + if len(args) > 0 { + return nil, fmt.Errorf("scan: --url does not accept a positional argument") + } + return sources.NewURLSource(f.URL), nil + } + + if len(args) == 0 { + return nil, fmt.Errorf("scan: missing target (path, stdin, -, or a source flag)") + } + target := args[0] + + if target == "stdin" || target == "-" { + if f.Git { + return nil, fmt.Errorf("scan: --git cannot be combined with stdin") + } + return sources.NewStdinSource(), nil + } + + if f.Git { + gs := sources.NewGitSource(target) + if f.Since != "" { + t, err := time.Parse("2006-01-02", f.Since) + if err != nil { + return nil, fmt.Errorf("scan: --since must be YYYY-MM-DD: %w", err) + } + gs.Since = t + } + return gs, nil + } + + info, err := os.Stat(target) + if err != nil { + return nil, fmt.Errorf("scan: stat %q: %w", target, err) + } + if info.IsDir() { + return sources.NewDirSource(target, f.Excludes...), nil + } + return sources.NewFileSource(target), nil +} +``` + +4. In the existing `init()`, register the new flags next to the existing ones: + +```go +func init() { + scanCmd.Flags().IntVar(&flagWorkers, "workers", 0, "number of worker goroutines (default: CPU*8)") + scanCmd.Flags().BoolVar(&flagVerify, "verify", false, "actively verify found keys (opt-in, Phase 5)") + scanCmd.Flags().BoolVar(&flagUnmask, "unmask", false, "show full key values (default: masked)") + scanCmd.Flags().StringVar(&flagOutput, "output", "table", "output format: table, json") + scanCmd.Flags().StringSliceVar(&flagExclude, "exclude", nil, "extra glob patterns to exclude (e.g. *.min.js)") + + // Phase 4 source-selection flags. 
+ scanCmd.Flags().BoolVar(&flagGit, "git", false, "treat target as a git repo and scan full history") + scanCmd.Flags().StringVar(&flagURL, "url", "", "fetch and scan a remote http(s) URL (no positional arg)") + scanCmd.Flags().BoolVar(&flagClipboard, "clipboard", false, "scan current clipboard contents") + scanCmd.Flags().StringVar(&flagSince, "since", "", "for --git: only scan commits after YYYY-MM-DD") + scanCmd.Flags().Int64Var(&flagMaxFileSize, "max-file-size", 0, "max file size in bytes to scan (0 = unlimited)") + scanCmd.Flags().BoolVar(&flagInsecure, "insecure", false, "for --url: skip TLS certificate verification") + + _ = viper.BindPFlag("scan.workers", scanCmd.Flags().Lookup("workers")) +} +``` + +5. Replace the single line `src := sources.NewFileSource(target)` in the existing RunE body with the `selectSource` dispatch. Leave ALL downstream code (engine.Scan, storage.SaveFinding, output switch, exit code logic) untouched. Ensure the `target` variable is only used where relevant (it is no longer the sole driver of source construction). + +6. Add the `time` import to `cmd/scan.go`. 
+ +Create `cmd/scan_sources_test.go`: + +```go +package cmd + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/engine/sources" +) + +func TestSelectSource_Directory(t *testing.T) { + dir := t.TempDir() + src, err := selectSource([]string{dir}, sourceFlags{}) + require.NoError(t, err) + _, ok := src.(*sources.DirSource) + require.True(t, ok, "expected *DirSource, got %T", src) +} + +func TestSelectSource_File(t *testing.T) { + dir := t.TempDir() + f := filepath.Join(dir, "a.txt") + require.NoError(t, os.WriteFile(f, []byte("x"), 0o644)) + src, err := selectSource([]string{f}, sourceFlags{}) + require.NoError(t, err) + _, ok := src.(*sources.FileSource) + require.True(t, ok, "expected *FileSource, got %T", src) +} + +func TestSelectSource_Git(t *testing.T) { + src, err := selectSource([]string{"./some-repo"}, sourceFlags{Git: true}) + require.NoError(t, err) + gs, ok := src.(*sources.GitSource) + require.True(t, ok, "expected *GitSource, got %T", src) + require.Equal(t, "./some-repo", gs.RepoPath) +} + +func TestSelectSource_GitSince(t *testing.T) { + src, err := selectSource([]string{"./repo"}, sourceFlags{Git: true, Since: "2024-01-15"}) + require.NoError(t, err) + gs := src.(*sources.GitSource) + want, _ := time.Parse("2006-01-02", "2024-01-15") + require.Equal(t, want, gs.Since) +} + +func TestSelectSource_GitSinceBadFormat(t *testing.T) { + _, err := selectSource([]string{"./repo"}, sourceFlags{Git: true, Since: "15/01/2024"}) + require.Error(t, err) + require.Contains(t, err.Error(), "YYYY-MM-DD") +} + +func TestSelectSource_URL(t *testing.T) { + src, err := selectSource(nil, sourceFlags{URL: "https://example.com/a.js"}) + require.NoError(t, err) + _, ok := src.(*sources.URLSource) + require.True(t, ok) +} + +func TestSelectSource_URLRejectsPositional(t *testing.T) { + _, err := selectSource([]string{"./foo"}, sourceFlags{URL: "https://x"}) + require.Error(t, err) +} + 
+func TestSelectSource_Clipboard(t *testing.T) { + src, err := selectSource(nil, sourceFlags{Clipboard: true}) + require.NoError(t, err) + _, ok := src.(*sources.ClipboardSource) + require.True(t, ok) +} + +func TestSelectSource_ClipboardRejectsPositional(t *testing.T) { + _, err := selectSource([]string{"./foo"}, sourceFlags{Clipboard: true}) + require.Error(t, err) +} + +func TestSelectSource_Stdin(t *testing.T) { + for _, tok := range []string{"stdin", "-"} { + src, err := selectSource([]string{tok}, sourceFlags{}) + require.NoError(t, err) + _, ok := src.(*sources.StdinSource) + require.True(t, ok, "token %q: expected *StdinSource, got %T", tok, src) + } +} + +func TestSelectSource_MutuallyExclusive(t *testing.T) { + _, err := selectSource(nil, sourceFlags{Git: true, URL: "https://x"}) + require.Error(t, err) + require.Contains(t, err.Error(), "mutually exclusive") +} + +func TestSelectSource_MissingTarget(t *testing.T) { + _, err := selectSource(nil, sourceFlags{}) + require.Error(t, err) + require.Contains(t, err.Error(), "missing target") +} + +func TestSelectSource_DirForwardsExcludes(t *testing.T) { + dir := t.TempDir() + src, err := selectSource([]string{dir}, sourceFlags{Excludes: []string{"*.log", "tmp/**"}}) + require.NoError(t, err) + ds := src.(*sources.DirSource) + // NewDirSource merges DefaultExcludes with extras, so user patterns must be present. + found := 0 + for _, e := range ds.Excludes { + if e == "*.log" || e == "tmp/**" { + found++ + } + } + require.Equal(t, 2, found, "user excludes not forwarded, got %v", ds.Excludes) +} +``` + +After making these changes, run `go build ./...` and fix any import or compile errors. Do NOT modify pkg/engine/sources/* files — they are owned by Wave 1 plans. + + + go build ./... && go test ./cmd/... -run TestSelectSource -race -count=1 + + + - `go build ./...` exits 0 + - `go test ./cmd/... -run TestSelectSource -race -count=1` passes all 13 subtests + - `go test ./... 
-race -count=1` full suite passes + - `grep -n "selectSource" cmd/scan.go` returns at least two hits (definition + call site) + - `grep -n "flagGit\|flagURL\|flagClipboard\|flagSince" cmd/scan.go` returns at least 4 hits + - `grep -n "sources.NewDirSource\|sources.NewGitSource\|sources.NewStdinSource\|sources.NewURLSource\|sources.NewClipboardSource" cmd/scan.go` returns 5 hits + - `grep -n "mutually exclusive" cmd/scan.go` returns a hit + - `keyhunter scan --help` (via `go run . scan --help`) lists --git, --url, --clipboard, --since flags + + + cmd/scan.go dispatches to the correct Source implementation based on positional args and flags, with unambiguous error messages for conflicting selectors. All selectSource tests pass under -race. The existing single-file FileSource path still works unchanged. + + + + + + +- `go build ./...` exits 0 +- `go test ./... -race -count=1` full suite green (including earlier Wave 1 plan tests) +- `go run . scan --help` lists new flags +- `go run . scan ./pkg` completes successfully (DirSource path) +- `echo "API_KEY=test" | go run . scan -` completes successfully (StdinSource path) + + + +Users can invoke every Phase 4 input mode from the CLI and each one flows through the unchanged three-stage detection pipeline. INPUT-01 through INPUT-05 are reachable via CLI, and INPUT-06 (the integration meta-requirement) is satisfied by the passing test suite plus the help-text listing. 
+ + + +After completion, create `.planning/phases/04-input-sources/04-05-SUMMARY.md` documenting: +- selectSource signature and branches +- Flag additions +- Test pass summary +- A short one-line example invocation per new source (dir, git, stdin, url, clipboard) +- Confirmation that existing Phase 1-3 tests still pass + diff --git a/.planning/phases/12-osint_iot_cloud_storage/12-02-SUMMARY.md b/.planning/phases/12-osint_iot_cloud_storage/12-02-SUMMARY.md new file mode 100644 index 0000000..ec4a2ee --- /dev/null +++ b/.planning/phases/12-osint_iot_cloud_storage/12-02-SUMMARY.md @@ -0,0 +1,103 @@ +--- +phase: 12-osint_iot_cloud_storage +plan: 02 +subsystem: recon +tags: [fofa, netlas, binaryedge, iot, osint, httptest] + +requires: + - phase: 09-osint-infrastructure + provides: LimiterRegistry, shared Client retry/backoff HTTP + - phase: 10-osint-code-hosting + provides: ReconSource interface pattern, BuildQueries, keywordIndex helpers +provides: + - FOFASource implementing recon.ReconSource for FOFA internet search + - NetlasSource implementing recon.ReconSource for Netlas intelligence API + - BinaryEdgeSource implementing recon.ReconSource for BinaryEdge data API +affects: [12-osint_iot_cloud_storage, cmd/recon] + +tech-stack: + added: [] + patterns: [base64-encoded query params for FOFA, X-API-Key header auth for Netlas, X-Key header auth for BinaryEdge] + +key-files: + created: + - pkg/recon/sources/fofa.go + - pkg/recon/sources/fofa_test.go + - pkg/recon/sources/netlas.go + - pkg/recon/sources/netlas_test.go + - pkg/recon/sources/binaryedge.go + - pkg/recon/sources/binaryedge_test.go + modified: [] + +key-decisions: + - "FOFA uses base64-encoded qbase64 param with email+key auth in query string" + - "Netlas uses X-API-Key header; BinaryEdge uses X-Key header for auth" + - "All three sources use bare keyword queries (default formatQuery path)" + +patterns-established: + - "IoT scanner source pattern: struct with APIKey/BaseURL/Registry/Limiters + lazy client 
init" + +requirements-completed: [RECON-IOT-04, RECON-IOT-05, RECON-IOT-06] + +duration: 2min +completed: 2026-04-06 +--- + +# Phase 12 Plan 02: FOFA, Netlas, BinaryEdge Sources Summary + +**Three IoT/device scanner recon sources (FOFA, Netlas, BinaryEdge) with httptest-based unit tests covering sweep, auth, and cancellation** + +## Performance + +- **Duration:** 2 min +- **Started:** 2026-04-06T09:22:18Z +- **Completed:** 2026-04-06T09:24:22Z +- **Tasks:** 2 +- **Files modified:** 6 + +## Accomplishments +- FOFASource searches FOFA API with base64-encoded queries and email+key authentication +- NetlasSource searches Netlas API with X-API-Key header authentication +- BinaryEdgeSource searches BinaryEdge API with X-Key header authentication +- All three sources follow established Phase 10 pattern with shared Client, LimiterRegistry, BuildQueries + +## Task Commits + +Each task was committed atomically: + +1. **Task 1: Implement FOFASource, NetlasSource, BinaryEdgeSource** - `270bbbf` (feat) +2. 
**Task 2: Unit tests for FOFA, Netlas, BinaryEdge sources** - `d6c35f4` (test) + +## Files Created/Modified +- `pkg/recon/sources/fofa.go` - FOFASource with base64 query encoding and dual-credential auth +- `pkg/recon/sources/fofa_test.go` - httptest tests for FOFA sweep, credentials, cancellation +- `pkg/recon/sources/netlas.go` - NetlasSource with X-API-Key header auth +- `pkg/recon/sources/netlas_test.go` - httptest tests for Netlas sweep, credentials, cancellation +- `pkg/recon/sources/binaryedge.go` - BinaryEdgeSource with X-Key header auth +- `pkg/recon/sources/binaryedge_test.go` - httptest tests for BinaryEdge sweep, credentials, cancellation + +## Decisions Made +- FOFA uses base64-encoded qbase64 query parameter (matching FOFA API spec) with email+key in query string +- Netlas uses X-API-Key header; BinaryEdge uses X-Key header (matching their respective API specs) +- All three use bare keyword queries via default formatQuery path (no source-specific query formatting needed) + +## Deviations from Plan + +None - plan executed exactly as written. + +## Issues Encountered +None + +## Known Stubs +None + +## User Setup Required +None - no external service configuration required. + +## Next Phase Readiness +- Three IoT scanner sources ready for RegisterAll wiring +- FOFA requires email + API key; Netlas and BinaryEdge require API key only + +--- +*Phase: 12-osint_iot_cloud_storage* +*Completed: 2026-04-06* diff --git a/pkg/recon/sources/binaryedge.go b/pkg/recon/sources/binaryedge.go new file mode 100644 index 0000000..5b9a3c5 --- /dev/null +++ b/pkg/recon/sources/binaryedge.go @@ -0,0 +1,147 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// BinaryEdgeSource implements recon.ReconSource against the BinaryEdge +// internet data API. 
It iterates provider keyword queries and emits a Finding +// per result event. +// +// A missing API key disables the source without error. +type BinaryEdgeSource struct { + APIKey string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + client *Client +} + +// Compile-time assertion. +var _ recon.ReconSource = (*BinaryEdgeSource)(nil) + +func (s *BinaryEdgeSource) Name() string { return "binaryedge" } +func (s *BinaryEdgeSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *BinaryEdgeSource) Burst() int { return 1 } +func (s *BinaryEdgeSource) RespectsRobots() bool { return false } + +// Enabled returns true only when APIKey is configured. +func (s *BinaryEdgeSource) Enabled(_ recon.Config) bool { return s.APIKey != "" } + +// Sweep issues one BinaryEdge search request per provider keyword and emits +// a Finding for every result event. +func (s *BinaryEdgeSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if s.APIKey == "" { + return nil + } + if s.client == nil { + s.client = NewClient() + } + base := s.BaseURL + if base == "" { + base = "https://api.binaryedge.io" + } + + queries := BuildQueries(s.Registry, "binaryedge") + kwIndex := binaryedgeKeywordIndex(s.Registry) + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + endpoint := fmt.Sprintf("%s/v2/query/search?query=%s&page=1", + base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return fmt.Errorf("binaryedge: build request: %w", err) + } + req.Header.Set("X-Key", s.APIKey) + req.Header.Set("Accept", "application/json") + + resp, err := s.client.Do(ctx, req) + if err != nil { + if errors.Is(err, ErrUnauthorized) { + return err + } + if errors.Is(err, context.Canceled) || 
errors.Is(err, context.DeadlineExceeded) { + return err + } + continue + } + + var parsed binaryedgeSearchResponse + decErr := json.NewDecoder(resp.Body).Decode(&parsed) + _ = resp.Body.Close() + if decErr != nil { + continue + } + + provName := kwIndex[strings.ToLower(q)] + for _, ev := range parsed.Events { + f := recon.Finding{ + ProviderName: provName, + Confidence: "low", + Source: fmt.Sprintf("binaryedge://%s:%d", ev.Target.IP, ev.Target.Port), + SourceType: "recon:binaryedge", + DetectedAt: time.Now(), + } + select { + case out <- f: + case <-ctx.Done(): + return ctx.Err() + } + } + } + return nil +} + +type binaryedgeSearchResponse struct { + Events []binaryedgeEvent `json:"events"` +} + +type binaryedgeEvent struct { + Target binaryedgeTarget `json:"target"` +} + +type binaryedgeTarget struct { + IP string `json:"ip"` + Port int `json:"port"` +} + +// binaryedgeKeywordIndex maps lowercased keywords to provider names. +func binaryedgeKeywordIndex(reg *providers.Registry) map[string]string { + m := make(map[string]string) + if reg == nil { + return m + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + kl := strings.ToLower(strings.TrimSpace(k)) + if kl == "" { + continue + } + if _, exists := m[kl]; !exists { + m[kl] = p.Name + } + } + } + return m +} diff --git a/pkg/recon/sources/binaryedge_test.go b/pkg/recon/sources/binaryedge_test.go new file mode 100644 index 0000000..e003a01 --- /dev/null +++ b/pkg/recon/sources/binaryedge_test.go @@ -0,0 +1,117 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func binaryedgeStubHandler(t *testing.T, calls *int32) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(calls, 1) + if !strings.HasPrefix(r.URL.Path, "/v2/query/search") { + t.Errorf("unexpected path: %s", r.URL.Path) + } + 
if got := r.Header.Get("X-Key"); got != "testkey" { + t.Errorf("missing X-Key header: %q", got) + } + body := binaryedgeSearchResponse{ + Events: []binaryedgeEvent{ + {Target: binaryedgeTarget{IP: "192.168.1.1", Port: 80}}, + {Target: binaryedgeTarget{IP: "192.168.1.2", Port: 443}}, + }, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(body) + } +} + +func TestBinaryEdgeSource_EnabledRequiresAPIKey(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + + s := &BinaryEdgeSource{APIKey: "", Registry: reg, Limiters: lim} + if s.Enabled(recon.Config{}) { + t.Error("expected Enabled=false with empty key") + } + s = &BinaryEdgeSource{APIKey: "key", Registry: reg, Limiters: lim} + if !s.Enabled(recon.Config{}) { + t.Error("expected Enabled=true with key") + } +} + +func TestBinaryEdgeSource_SweepEmitsFindings(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("binaryedge", 1000, 100) + + var calls int32 + srv := httptest.NewServer(binaryedgeStubHandler(t, &calls)) + defer srv.Close() + + s := &BinaryEdgeSource{ + APIKey: "testkey", + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + } + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + done := make(chan error, 1) + go func() { done <- s.Sweep(ctx, "", out); close(out) }() + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if err := <-done; err != nil { + t.Fatalf("Sweep error: %v", err) + } + + // 2 keywords * 2 events = 4 findings + if len(findings) != 4 { + t.Fatalf("expected 4 findings, got %d", len(findings)) + } + for _, f := range findings { + if f.SourceType != "recon:binaryedge" { + t.Errorf("SourceType=%q want recon:binaryedge", f.SourceType) + } + } + if got := atomic.LoadInt32(&calls); got != 2 { + t.Errorf("expected 2 API calls, got %d", got) + } +} + +func 
TestBinaryEdgeSource_CtxCancelled(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("binaryedge", 1000, 100) + + s := &BinaryEdgeSource{ + APIKey: "key", + BaseURL: "http://127.0.0.1:1", + Registry: reg, + Limiters: lim, + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 1) + err := s.Sweep(ctx, "", out) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context.Canceled, got %v", err) + } +} diff --git a/pkg/recon/sources/fofa.go b/pkg/recon/sources/fofa.go new file mode 100644 index 0000000..2fec8d5 --- /dev/null +++ b/pkg/recon/sources/fofa.go @@ -0,0 +1,144 @@ +package sources + +import ( + "context" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "net/http" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// FOFASource implements recon.ReconSource against the FOFA internet search +// engine API. It iterates provider keyword queries and emits a Finding per +// result. +// +// A missing Email or API key disables the source without error. +type FOFASource struct { + Email string + APIKey string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + client *Client +} + +// Compile-time assertion. +var _ recon.ReconSource = (*FOFASource)(nil) + +func (s *FOFASource) Name() string { return "fofa" } +func (s *FOFASource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) } +func (s *FOFASource) Burst() int { return 1 } +func (s *FOFASource) RespectsRobots() bool { return false } + +// Enabled returns true only when both Email and APIKey are configured. +func (s *FOFASource) Enabled(_ recon.Config) bool { return s.Email != "" && s.APIKey != "" } + +// Sweep issues one FOFA search request per provider keyword and emits a +// Finding for every result row. 
+func (s *FOFASource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if s.Email == "" || s.APIKey == "" { + return nil + } + if s.client == nil { + s.client = NewClient() + } + base := s.BaseURL + if base == "" { + base = "https://fofa.info" + } + + queries := BuildQueries(s.Registry, "fofa") + kwIndex := fofaKeywordIndex(s.Registry) + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + qb64 := base64.StdEncoding.EncodeToString([]byte(q)) + endpoint := fmt.Sprintf("%s/api/v1/search/all?email=%s&key=%s&qbase64=%s&size=100", + base, s.Email, s.APIKey, qb64) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return fmt.Errorf("fofa: build request: %w", err) + } + req.Header.Set("Accept", "application/json") + + resp, err := s.client.Do(ctx, req) + if err != nil { + if errors.Is(err, ErrUnauthorized) { + return err + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + continue + } + + var parsed fofaSearchResponse + decErr := json.NewDecoder(resp.Body).Decode(&parsed) + _ = resp.Body.Close() + if decErr != nil { + continue + } + + provName := kwIndex[strings.ToLower(q)] + for _, row := range parsed.Results { + // Each row is [host, ip, port]. + if len(row) < 3 { + continue + } + f := recon.Finding{ + ProviderName: provName, + Confidence: "low", + Source: fmt.Sprintf("fofa://%s:%s", row[1], row[2]), + SourceType: "recon:fofa", + DetectedAt: time.Now(), + } + select { + case out <- f: + case <-ctx.Done(): + return ctx.Err() + } + } + } + return nil +} + +type fofaSearchResponse struct { + Results [][]string `json:"results"` + Size int `json:"size"` +} + +// fofaKeywordIndex maps lowercased keywords to provider names. 
+func fofaKeywordIndex(reg *providers.Registry) map[string]string { + m := make(map[string]string) + if reg == nil { + return m + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + kl := strings.ToLower(strings.TrimSpace(k)) + if kl == "" { + continue + } + if _, exists := m[kl]; !exists { + m[kl] = p.Name + } + } + } + return m +} diff --git a/pkg/recon/sources/fofa_test.go b/pkg/recon/sources/fofa_test.go new file mode 100644 index 0000000..e17497d --- /dev/null +++ b/pkg/recon/sources/fofa_test.go @@ -0,0 +1,130 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func fofaStubHandler(t *testing.T, calls *int32) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(calls, 1) + if r.URL.Path != "/api/v1/search/all" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if got := r.URL.Query().Get("email"); got != "test@example.com" { + t.Errorf("missing email param: %q", got) + } + if got := r.URL.Query().Get("key"); got != "testkey" { + t.Errorf("missing key param: %q", got) + } + body := fofaSearchResponse{ + Results: [][]string{ + {"example.com", "1.2.3.4", "443"}, + {"test.org", "5.6.7.8", "8080"}, + }, + Size: 2, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(body) + } +} + +func TestFOFASource_EnabledRequiresCredentials(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + + s := &FOFASource{Email: "", APIKey: "", Registry: reg, Limiters: lim} + if s.Enabled(recon.Config{}) { + t.Error("expected Enabled=false with empty credentials") + } + s = &FOFASource{Email: "a@b.com", APIKey: "", Registry: reg, Limiters: lim} + if s.Enabled(recon.Config{}) { + t.Error("expected Enabled=false with empty APIKey") + } + s = &FOFASource{Email: "", APIKey: "key", Registry: reg, Limiters: 
lim} + if s.Enabled(recon.Config{}) { + t.Error("expected Enabled=false with empty Email") + } + s = &FOFASource{Email: "a@b.com", APIKey: "key", Registry: reg, Limiters: lim} + if !s.Enabled(recon.Config{}) { + t.Error("expected Enabled=true with both credentials") + } +} + +func TestFOFASource_SweepEmitsFindings(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("fofa", 1000, 100) + + var calls int32 + srv := httptest.NewServer(fofaStubHandler(t, &calls)) + defer srv.Close() + + s := &FOFASource{ + Email: "test@example.com", + APIKey: "testkey", + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + } + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + done := make(chan error, 1) + go func() { done <- s.Sweep(ctx, "", out); close(out) }() + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if err := <-done; err != nil { + t.Fatalf("Sweep error: %v", err) + } + + // 2 keywords * 2 results = 4 findings + if len(findings) != 4 { + t.Fatalf("expected 4 findings, got %d", len(findings)) + } + for _, f := range findings { + if f.SourceType != "recon:fofa" { + t.Errorf("SourceType=%q want recon:fofa", f.SourceType) + } + } + if got := atomic.LoadInt32(&calls); got != 2 { + t.Errorf("expected 2 API calls, got %d", got) + } +} + +func TestFOFASource_CtxCancelled(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("fofa", 1000, 100) + + s := &FOFASource{ + Email: "a@b.com", + APIKey: "key", + BaseURL: "http://127.0.0.1:1", + Registry: reg, + Limiters: lim, + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 1) + err := s.Sweep(ctx, "", out) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context.Canceled, got %v", err) + } +} diff --git a/pkg/recon/sources/netlas.go 
b/pkg/recon/sources/netlas.go new file mode 100644 index 0000000..017dd1f --- /dev/null +++ b/pkg/recon/sources/netlas.go @@ -0,0 +1,147 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// NetlasSource implements recon.ReconSource against the Netlas internet +// intelligence API. It iterates provider keyword queries and emits a Finding +// per result item. +// +// A missing API key disables the source without error. +type NetlasSource struct { + APIKey string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + client *Client +} + +// Compile-time assertion. +var _ recon.ReconSource = (*NetlasSource)(nil) + +func (s *NetlasSource) Name() string { return "netlas" } +func (s *NetlasSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) } +func (s *NetlasSource) Burst() int { return 1 } +func (s *NetlasSource) RespectsRobots() bool { return false } + +// Enabled returns true only when APIKey is configured. +func (s *NetlasSource) Enabled(_ recon.Config) bool { return s.APIKey != "" } + +// Sweep issues one Netlas search request per provider keyword and emits a +// Finding for every result item. 
+func (s *NetlasSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if s.APIKey == "" { + return nil + } + if s.client == nil { + s.client = NewClient() + } + base := s.BaseURL + if base == "" { + base = "https://app.netlas.io" + } + + queries := BuildQueries(s.Registry, "netlas") + kwIndex := netlasKeywordIndex(s.Registry) + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + endpoint := fmt.Sprintf("%s/api/responses/?q=%s&start=0&indices=", + base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return fmt.Errorf("netlas: build request: %w", err) + } + req.Header.Set("X-API-Key", s.APIKey) + req.Header.Set("Accept", "application/json") + + resp, err := s.client.Do(ctx, req) + if err != nil { + if errors.Is(err, ErrUnauthorized) { + return err + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + continue + } + + var parsed netlasSearchResponse + decErr := json.NewDecoder(resp.Body).Decode(&parsed) + _ = resp.Body.Close() + if decErr != nil { + continue + } + + provName := kwIndex[strings.ToLower(q)] + for _, item := range parsed.Items { + f := recon.Finding{ + ProviderName: provName, + Confidence: "low", + Source: fmt.Sprintf("netlas://%s:%d", item.Data.IP, item.Data.Port), + SourceType: "recon:netlas", + DetectedAt: time.Now(), + } + select { + case out <- f: + case <-ctx.Done(): + return ctx.Err() + } + } + } + return nil +} + +type netlasSearchResponse struct { + Items []netlasItem `json:"items"` +} + +type netlasItem struct { + Data netlasData `json:"data"` +} + +type netlasData struct { + IP string `json:"ip"` + Port int `json:"port"` +} + +// netlasKeywordIndex maps lowercased keywords to provider names. 
+func netlasKeywordIndex(reg *providers.Registry) map[string]string { + m := make(map[string]string) + if reg == nil { + return m + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + kl := strings.ToLower(strings.TrimSpace(k)) + if kl == "" { + continue + } + if _, exists := m[kl]; !exists { + m[kl] = p.Name + } + } + } + return m +} diff --git a/pkg/recon/sources/netlas_test.go b/pkg/recon/sources/netlas_test.go new file mode 100644 index 0000000..ddc337a --- /dev/null +++ b/pkg/recon/sources/netlas_test.go @@ -0,0 +1,117 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func netlasStubHandler(t *testing.T, calls *int32) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(calls, 1) + if !strings.HasPrefix(r.URL.Path, "/api/responses/") { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if got := r.Header.Get("X-API-Key"); got != "testkey" { + t.Errorf("missing X-API-Key header: %q", got) + } + body := netlasSearchResponse{ + Items: []netlasItem{ + {Data: netlasData{IP: "10.0.0.1", Port: 443}}, + {Data: netlasData{IP: "10.0.0.2", Port: 8443}}, + }, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(body) + } +} + +func TestNetlasSource_EnabledRequiresAPIKey(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + + s := &NetlasSource{APIKey: "", Registry: reg, Limiters: lim} + if s.Enabled(recon.Config{}) { + t.Error("expected Enabled=false with empty key") + } + s = &NetlasSource{APIKey: "key", Registry: reg, Limiters: lim} + if !s.Enabled(recon.Config{}) { + t.Error("expected Enabled=true with key") + } +} + +func TestNetlasSource_SweepEmitsFindings(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("netlas", 1000, 100) + 
+ var calls int32 + srv := httptest.NewServer(netlasStubHandler(t, &calls)) + defer srv.Close() + + s := &NetlasSource{ + APIKey: "testkey", + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + } + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + done := make(chan error, 1) + go func() { done <- s.Sweep(ctx, "", out); close(out) }() + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if err := <-done; err != nil { + t.Fatalf("Sweep error: %v", err) + } + + // 2 keywords * 2 items = 4 findings + if len(findings) != 4 { + t.Fatalf("expected 4 findings, got %d", len(findings)) + } + for _, f := range findings { + if f.SourceType != "recon:netlas" { + t.Errorf("SourceType=%q want recon:netlas", f.SourceType) + } + } + if got := atomic.LoadInt32(&calls); got != 2 { + t.Errorf("expected 2 API calls, got %d", got) + } +} + +func TestNetlasSource_CtxCancelled(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("netlas", 1000, 100) + + s := &NetlasSource{ + APIKey: "key", + BaseURL: "http://127.0.0.1:1", + Registry: reg, + Limiters: lim, + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 1) + err := s.Sweep(ctx, "", out) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context.Canceled, got %v", err) + } +}