diff --git a/.planning/phases/04-input-sources/04-01-PLAN.md b/.planning/phases/04-input-sources/04-01-PLAN.md
new file mode 100644
index 0000000..b1f8f9c
--- /dev/null
+++ b/.planning/phases/04-input-sources/04-01-PLAN.md
@@ -0,0 +1,114 @@
+---
+phase: 04-input-sources
+plan: 01
+type: execute
+wave: 0
+depends_on: []
+files_modified:
+ - go.mod
+ - go.sum
+autonomous: true
+requirements: []
+must_haves:
+ truths:
+ - "go-git/v5, atotto/clipboard, x/exp/mmap are available as imports"
+ - "go build ./... succeeds with new dependencies"
+ artifacts:
+ - path: "go.mod"
+ provides: "Module declarations for go-git, clipboard, and x/exp"
+ contains: "github.com/go-git/go-git/v5"
+ - path: "go.sum"
+ provides: "Checksums for added dependencies"
+ key_links:
+ - from: "go.mod"
+ to: "module cache"
+ via: "go mod tidy"
+ pattern: "go-git/go-git/v5"
+---
+
+
+Add the three external Go dependencies that Phase 4 input sources require:
+- `github.com/go-git/go-git/v5` — git history traversal (INPUT-02)
+- `github.com/atotto/clipboard` — cross-platform clipboard access (INPUT-05)
+- `golang.org/x/exp/mmap` — memory-mapped large file reads (CORE-07)
+
+Purpose: Wave 0 dependency bootstrap so the parallel source implementation plans (04-02, 04-03, 04-04) compile cleanly on first attempt with no dependency resolution thrash.
+Output: Updated go.mod and go.sum with all three modules resolved.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/04-input-sources/04-CONTEXT.md
+@go.mod
+
+
+
+
+
+ Task 1: Add go-git, clipboard, and x/exp/mmap dependencies
+
+ - go.mod
+ - .planning/phases/04-input-sources/04-CONTEXT.md
+
+ go.mod, go.sum
+
+Run the following commands from the repo root in order:
+
+```bash
+go get github.com/go-git/go-git/v5@latest
+go get github.com/atotto/clipboard@latest
+go get golang.org/x/exp/mmap@latest
+go mod tidy
+go build ./...
+```
+
+Verify the `require` block in go.mod now contains direct entries (non-indirect) for:
+
+```
+github.com/go-git/go-git/v5 vX.Y.Z
+github.com/atotto/clipboard vX.Y.Z
+golang.org/x/exp v0.0.0-<yyyymmddhhmmss>-<commit-hash>
+```
+
+If `go build ./...` fails, do NOT try to fix anything beyond the dependency graph — unrelated build failures must be surfaced. Note: because no source file imports these modules yet, `go mod tidy` will likely REMOVE their require entries entirely (tidy drops unused requirements; it does not merely mark them indirect). If the acceptance greps fail after tidy, re-run the `go get` commands and skip the tidy step — the Wave 1 plans will tidy once real imports exist.
+
+Do NOT modify any source files in this plan. This is dependency bootstrap only.
+
+
+ go build ./... && grep -E "go-git/go-git/v5|atotto/clipboard|golang.org/x/exp" go.mod
+
+
+ - `grep "github.com/go-git/go-git/v5" go.mod` returns a match
+ - `grep "github.com/atotto/clipboard" go.mod` returns a match
+ - `grep "golang.org/x/exp" go.mod` returns a match
+ - `go build ./...` exits 0
+ - `go.sum` contains entries for all three modules
+
+ All three new modules are present in go.mod, go.sum has their checksums, and `go build ./...` succeeds.
+
+
+
+
+
+- `go build ./...` succeeds
+- `go vet ./...` succeeds
+- `grep -c "go-git/go-git/v5\|atotto/clipboard\|golang.org/x/exp" go.mod` returns 3 or more
+
+
+
+Dependencies resolved and build is green. Wave 1 plans can import from these modules without needing their own `go get` calls.
+
+
+
diff --git a/.planning/phases/04-input-sources/04-02-PLAN.md b/.planning/phases/04-input-sources/04-02-PLAN.md
new file mode 100644
index 0000000..4d41893
--- /dev/null
+++ b/.planning/phases/04-input-sources/04-02-PLAN.md
@@ -0,0 +1,573 @@
+---
+phase: 04-input-sources
+plan: 02
+type: execute
+wave: 1
+depends_on: ["04-01"]
+files_modified:
+ - pkg/engine/sources/dir.go
+ - pkg/engine/sources/dir_test.go
+ - pkg/engine/sources/file.go
+ - pkg/engine/sources/file_test.go
+autonomous: true
+requirements:
+ - INPUT-01
+ - CORE-07
+must_haves:
+ truths:
+ - "DirSource recursively walks a directory and emits Chunks for every non-excluded file"
+ - "Glob exclusion patterns (--exclude) skip matching files by basename AND full relative path"
+ - "Default exclusions skip .git/, node_modules/, vendor/, *.min.js, *.map"
+ - "Binary files (null byte in first 512 bytes) are skipped"
+ - "Files larger than the mmap threshold (10MB) are read via golang.org/x/exp/mmap, smaller files via os.ReadFile"
+ - "File emission order is deterministic (sorted) for reproducible tests"
+ artifacts:
+ - path: "pkg/engine/sources/dir.go"
+ provides: "DirSource implementing Source interface for recursive directory scanning"
+ exports: ["DirSource", "NewDirSource"]
+ min_lines: 120
+ - path: "pkg/engine/sources/dir_test.go"
+ provides: "Test coverage for recursive walk, exclusion, binary skip, mmap threshold"
+ min_lines: 100
+ - path: "pkg/engine/sources/file.go"
+ provides: "FileSource extended to use mmap for files > 10MB"
+ contains: "mmap"
+ key_links:
+ - from: "pkg/engine/sources/dir.go"
+ to: "golang.org/x/exp/mmap"
+ via: "mmap.Open for large files"
+ pattern: "mmap\\.Open"
+ - from: "pkg/engine/sources/dir.go"
+ to: "filepath.WalkDir"
+ via: "recursive traversal"
+ pattern: "filepath\\.WalkDir"
+ - from: "pkg/engine/sources/dir.go"
+ to: "types.Chunk"
+ via: "channel send"
+ pattern: "out <- types\\.Chunk"
+---
+
+
+Implement `DirSource` — a recursive directory scanner that walks a root path via `filepath.WalkDir`, honors glob exclusion patterns, detects and skips binary files, and uses memory-mapped I/O for large files. This satisfies INPUT-01 (directory/recursive scanning with exclusions) and CORE-07 (mmap large file reading).
+
+Purpose: The most common scan target is a repo directory, not a single file. This plan replaces the "wrap FileSource per path" hack with a purpose-built recursive source that emits deterministically ordered chunks and scales to multi-GB files without blowing out memory.
+Output: `pkg/engine/sources/dir.go`, `dir_test.go`, plus a small `file.go` update to share the mmap read helper.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/phases/04-input-sources/04-CONTEXT.md
+@pkg/engine/sources/source.go
+@pkg/engine/sources/file.go
+@pkg/types/chunk.go
+
+
+Source interface (pkg/engine/sources/source.go):
+```go
+type Source interface {
+ Chunks(ctx context.Context, out chan<- types.Chunk) error
+}
+```
+
+Chunk type (pkg/types/chunk.go):
+```go
+type Chunk struct {
+ Data []byte
+ Source string
+ Offset int64
+}
+```
+
+Existing constants in pkg/engine/sources/file.go:
+```go
+const defaultChunkSize = 4096
+const chunkOverlap = 256
+```
+
+
+
+
+
+
+ Task 1: Implement DirSource with recursive walk, exclusion, binary detection, and mmap
+
+ - pkg/engine/sources/source.go
+ - pkg/engine/sources/file.go
+ - pkg/types/chunk.go
+ - .planning/phases/04-input-sources/04-CONTEXT.md (Directory/File Scanning section)
+
+
+ pkg/engine/sources/dir.go,
+ pkg/engine/sources/dir_test.go,
+ pkg/engine/sources/file.go
+
+
+ - Test 1: DirSource walks a temp dir containing 3 text files, emits 3 chunks, source fields match file paths
+ - Test 2: Default exclusions skip `.git/config`, `node_modules/foo.js`, `vendor/bar.go`, `app.min.js`, `app.js.map`
+ - Test 3: User-supplied exclude pattern `*.log` skips `foo.log` but keeps `foo.txt`
+ - Test 4: Binary file (first 512 bytes contain a null byte) is skipped; text file is emitted
+ - Test 5: File >10MB is read via mmap path and emits chunks whose concatenated data equals file content
+ - Test 6: File emission order is deterministic (sorted lexicographically) across two runs on same dir
+ - Test 7: ctx cancellation mid-walk returns ctx.Err() promptly
+ - Test 8: Non-existent root returns an error
+
+
+Create `pkg/engine/sources/dir.go` with the following complete implementation:
+
+```go
+package sources
+
+import (
+ "bytes"
+ "context"
+ "errors"
+ "fmt"
+ "io/fs"
+ "os"
+ "path/filepath"
+ "sort"
+ "strings"
+
+ "golang.org/x/exp/mmap"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// MmapThreshold is the file size above which DirSource/FileSource use memory-mapped reads.
+const MmapThreshold int64 = 10 * 1024 * 1024 // 10 MB
+
+// BinarySniffSize is the number of leading bytes inspected for a NUL byte
+// to classify a file as binary and skip it.
+const BinarySniffSize = 512
+
+// DefaultExcludes are glob patterns excluded from directory scans unless
+// the caller passes an empty slice explicitly via NewDirSourceRaw.
+var DefaultExcludes = []string{
+ ".git/**",
+ "node_modules/**",
+ "vendor/**",
+ "*.min.js",
+ "*.map",
+}
+
+// DirSource walks a directory recursively and emits Chunks for every
+// non-excluded, non-binary file it finds. Files larger than MmapThreshold
+// are read via mmap; smaller files use os.ReadFile.
+type DirSource struct {
+ Root string
+ Excludes []string // glob patterns applied to path basename AND full relative path
+ ChunkSize int
+}
+
+// NewDirSource creates a DirSource with the default exclusions merged
+// with the caller-supplied extras.
+func NewDirSource(root string, extraExcludes ...string) *DirSource {
+ merged := make([]string, 0, len(DefaultExcludes)+len(extraExcludes))
+ merged = append(merged, DefaultExcludes...)
+ merged = append(merged, extraExcludes...)
+ return &DirSource{Root: root, Excludes: merged, ChunkSize: defaultChunkSize}
+}
+
+// NewDirSourceRaw creates a DirSource with ONLY the caller-supplied excludes
+// (no defaults). Useful for tests and advanced users.
+func NewDirSourceRaw(root string, excludes []string) *DirSource {
+ return &DirSource{Root: root, Excludes: excludes, ChunkSize: defaultChunkSize}
+}
+
+// Chunks implements Source. It walks d.Root, filters excluded and binary
+// files, reads each remaining file (via mmap above MmapThreshold), and
+// emits overlapping chunks through out.
+func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+ if d.Root == "" {
+ return errors.New("DirSource: Root is empty")
+ }
+ info, err := os.Stat(d.Root)
+ if err != nil {
+ return fmt.Errorf("DirSource: stat root: %w", err)
+ }
+ if !info.IsDir() {
+ return fmt.Errorf("DirSource: root %q is not a directory", d.Root)
+ }
+
+ // Collect paths first for deterministic ordering across runs.
+ var paths []string
+ err = filepath.WalkDir(d.Root, func(path string, de fs.DirEntry, werr error) error {
+ if werr != nil {
+ return werr
+ }
+ if de.IsDir() {
+ rel, _ := filepath.Rel(d.Root, path)
+ if d.isExcluded(rel, de.Name()) {
+ return filepath.SkipDir
+ }
+ return nil
+ }
+ rel, _ := filepath.Rel(d.Root, path)
+ if d.isExcluded(rel, de.Name()) {
+ return nil
+ }
+ paths = append(paths, path)
+ return nil
+ })
+ if err != nil {
+ return fmt.Errorf("DirSource: walk: %w", err)
+ }
+ sort.Strings(paths)
+
+ for _, p := range paths {
+ if err := ctx.Err(); err != nil {
+ return err
+ }
+ if err := d.emitFile(ctx, p, out); err != nil {
+ // Per-file errors are non-fatal: continue walking, but respect ctx.
+ if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+ return err
+ }
+ // Swallow per-file errors; the engine logs elsewhere.
+ continue
+ }
+ }
+ return nil
+}
+
+// isExcluded returns true if either the relative path or the basename matches
+// any configured glob pattern.
+func (d *DirSource) isExcluded(rel, base string) bool {
+ rel = filepath.ToSlash(rel)
+ for _, pat := range d.Excludes {
+ pat = filepath.ToSlash(pat)
+ // Match against basename.
+ if ok, _ := filepath.Match(pat, base); ok {
+ return true
+ }
+ // Match against full relative path.
+ if ok, _ := filepath.Match(pat, rel); ok {
+ return true
+ }
+ // `dir/**` style — naive prefix match against the leading segment.
+ if strings.HasSuffix(pat, "/**") {
+ prefix := strings.TrimSuffix(pat, "/**")
+ if rel == prefix || strings.HasPrefix(rel, prefix+"/") {
+ return true
+ }
+ }
+ }
+ return false
+}
+
+// emitFile reads a single file and pushes its chunks onto out.
+func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error {
+ fi, err := os.Stat(path)
+ if err != nil {
+ return err
+ }
+ size := fi.Size()
+ if size == 0 {
+ return nil
+ }
+
+ var data []byte
+ if size >= MmapThreshold {
+ ra, err := mmap.Open(path)
+ if err != nil {
+ return fmt.Errorf("mmap open %s: %w", path, err)
+ }
+ defer ra.Close()
+ data = make([]byte, ra.Len())
+ if _, err := ra.ReadAt(data, 0); err != nil {
+ return fmt.Errorf("mmap read %s: %w", path, err)
+ }
+ } else {
+ data, err = os.ReadFile(path)
+ if err != nil {
+ return err
+ }
+ }
+
+ if isBinary(data) {
+ return nil
+ }
+ return emitChunks(ctx, data, path, d.ChunkSize, out)
+}
+
+// isBinary reports whether the leading BinarySniffSize bytes contain a NUL byte.
+func isBinary(data []byte) bool {
+ n := len(data)
+ if n > BinarySniffSize {
+ n = BinarySniffSize
+ }
+ return bytes.IndexByte(data[:n], 0x00) >= 0
+}
+
+// emitChunks is the shared overlapping-chunk emitter used by FileSource and DirSource.
+func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error {
+ if chunkSize <= 0 {
+ chunkSize = defaultChunkSize
+ }
+ if len(data) <= chunkSize {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case out <- types.Chunk{Data: data, Source: source, Offset: 0}:
+ }
+ return nil
+ }
+	step := chunkSize - chunkOverlap
+	for start := 0; start < len(data); start += step {
+		end := start + chunkSize
+		if end > len(data) {
+			end = len(data)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		// Offset is the chunk's absolute byte position in the source (chunks overlap).
+		case out <- types.Chunk{Data: data[start:end], Source: source, Offset: int64(start)}:
+		}
+		if end == len(data) {
+			break
+		}
+	}
+ return nil
+}
+```
+
+Update `pkg/engine/sources/file.go` so FileSource reuses `emitChunks` and adopts the same mmap threshold for large single-file scans:
+
+```go
+package sources
+
+import (
+ "context"
+ "os"
+
+ "golang.org/x/exp/mmap"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+const defaultChunkSize = 4096
+const chunkOverlap = 256
+
+// FileSource reads a single file and emits overlapping chunks.
+// For files >= MmapThreshold it uses golang.org/x/exp/mmap.
+type FileSource struct {
+ Path string
+ ChunkSize int
+}
+
+func NewFileSource(path string) *FileSource {
+ return &FileSource{Path: path, ChunkSize: defaultChunkSize}
+}
+
+func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+ fi, err := os.Stat(f.Path)
+ if err != nil {
+ return err
+ }
+ size := fi.Size()
+ if size == 0 {
+ return nil
+ }
+ var data []byte
+ if size >= MmapThreshold {
+ ra, err := mmap.Open(f.Path)
+ if err != nil {
+ return err
+ }
+ defer ra.Close()
+ data = make([]byte, ra.Len())
+ if _, err := ra.ReadAt(data, 0); err != nil {
+ return err
+ }
+ } else {
+ data, err = os.ReadFile(f.Path)
+ if err != nil {
+ return err
+ }
+ }
+ if isBinary(data) {
+ return nil
+ }
+ return emitChunks(ctx, data, f.Path, f.ChunkSize, out)
+}
+```
+
+Create `pkg/engine/sources/dir_test.go` with a comprehensive suite:
+
+```go
+package sources
+
+import (
+ "context"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+func drain(t *testing.T, src Source) []types.Chunk {
+ t.Helper()
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+ defer cancel()
+ out := make(chan types.Chunk, 1024)
+ errCh := make(chan error, 1)
+ go func() { errCh <- src.Chunks(ctx, out); close(out) }()
+ var got []types.Chunk
+ for c := range out {
+ got = append(got, c)
+ }
+ require.NoError(t, <-errCh)
+ return got
+}
+
+func writeFile(t *testing.T, path, content string) {
+ t.Helper()
+ require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755))
+ require.NoError(t, os.WriteFile(path, []byte(content), 0o644))
+}
+
+func TestDirSource_RecursiveWalk(t *testing.T) {
+ root := t.TempDir()
+ writeFile(t, filepath.Join(root, "a.txt"), "alpha content")
+ writeFile(t, filepath.Join(root, "sub", "b.txt"), "bravo content")
+ writeFile(t, filepath.Join(root, "sub", "deep", "c.txt"), "charlie content")
+
+ chunks := drain(t, NewDirSourceRaw(root, nil))
+ require.Len(t, chunks, 3)
+
+ sources := make([]string, 0, len(chunks))
+ for _, c := range chunks {
+ sources = append(sources, c.Source)
+ }
+ // Deterministic sorted order.
+	require.True(t, isSortedAsc(sources), "emission order must be sorted, got %v", sources)
+}
+
+func isSortedAsc(s []string) bool {
+	for i := 1; i < len(s); i++ {
+		if s[i-1] > s[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func TestDirSource_DefaultExcludes(t *testing.T) {
+ root := t.TempDir()
+ writeFile(t, filepath.Join(root, "keep.txt"), "keep me")
+ writeFile(t, filepath.Join(root, ".git", "config"), "[core]")
+ writeFile(t, filepath.Join(root, "node_modules", "foo.js"), "x")
+ writeFile(t, filepath.Join(root, "vendor", "bar.go"), "package x")
+ writeFile(t, filepath.Join(root, "app.min.js"), "y")
+ writeFile(t, filepath.Join(root, "app.js.map"), "{}")
+
+ chunks := drain(t, NewDirSource(root))
+ require.Len(t, chunks, 1)
+ require.Contains(t, chunks[0].Source, "keep.txt")
+}
+
+func TestDirSource_UserExclude(t *testing.T) {
+ root := t.TempDir()
+ writeFile(t, filepath.Join(root, "keep.txt"), "keep")
+ writeFile(t, filepath.Join(root, "drop.log"), "drop")
+
+ chunks := drain(t, NewDirSourceRaw(root, []string{"*.log"}))
+ require.Len(t, chunks, 1)
+ require.Contains(t, chunks[0].Source, "keep.txt")
+}
+
+func TestDirSource_BinarySkipped(t *testing.T) {
+ root := t.TempDir()
+ writeFile(t, filepath.Join(root, "text.txt"), "plain text content")
+ binPath := filepath.Join(root, "blob.bin")
+ require.NoError(t, os.WriteFile(binPath, []byte{0x7f, 'E', 'L', 'F', 0x00, 0x01, 0x02}, 0o644))
+
+ chunks := drain(t, NewDirSourceRaw(root, nil))
+ require.Len(t, chunks, 1)
+ require.Contains(t, chunks[0].Source, "text.txt")
+}
+
+func TestDirSource_MmapLargeFile(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping large file test in short mode")
+ }
+ root := t.TempDir()
+ big := filepath.Join(root, "big.txt")
+ // Construct a payload slightly above MmapThreshold.
+ payload := strings.Repeat("API_KEY=xxxxxxxxxxxxxxxxxxxx\n", (int(MmapThreshold)/28)+10)
+ require.NoError(t, os.WriteFile(big, []byte(payload), 0o644))
+
+ chunks := drain(t, NewDirSourceRaw(root, nil))
+	// Sanity: the >10MB file must still produce chunks via the mmap read path.
+ require.NotEmpty(t, chunks)
+ require.Equal(t, big, chunks[0].Source)
+}
+
+func TestDirSource_MissingRoot(t *testing.T) {
+ src := NewDirSourceRaw("/definitely/does/not/exist/keyhunter-xyz", nil)
+ ctx := context.Background()
+ out := make(chan types.Chunk, 1)
+ err := src.Chunks(ctx, out)
+ require.Error(t, err)
+}
+
+func TestDirSource_CtxCancellation(t *testing.T) {
+ root := t.TempDir()
+ for i := 0; i < 50; i++ {
+ writeFile(t, filepath.Join(root, "f", string(rune('a'+i%26))+".txt"), "payload")
+ }
+ ctx, cancel := context.WithCancel(context.Background())
+ cancel() // pre-cancelled
+ out := make(chan types.Chunk, 1024)
+ err := NewDirSourceRaw(root, nil).Chunks(ctx, out)
+ require.ErrorIs(t, err, context.Canceled)
+}
+```
+
+Also add a minimal update to `pkg/engine/sources/file_test.go` if it exists — if not present, skip. Do NOT alter any other source files in this plan.
+
+
+ go test ./pkg/engine/sources/... -run 'TestDirSource|TestFileSource' -race -count=1
+
+
+ - `go build ./pkg/engine/sources/...` exits 0
+ - `go test ./pkg/engine/sources/... -run TestDirSource -race -count=1` passes all subtests
+ - `grep -n "mmap.Open" pkg/engine/sources/dir.go pkg/engine/sources/file.go` returns two hits
+ - `grep -n "filepath.WalkDir" pkg/engine/sources/dir.go` returns a hit
+ - `grep -n "DefaultExcludes" pkg/engine/sources/dir.go` returns a hit
+ - `grep -n "isBinary" pkg/engine/sources/dir.go` returns a hit
+
+
+ DirSource implements Source, walks recursively, honors default and user glob exclusions, skips binary files, and uses mmap above 10MB. FileSource refactored to share the same mmap/emit helpers. All tests green under -race.
+
+
+
+
+
+
+- `go test ./pkg/engine/sources/... -race -count=1` passes
+- `go vet ./pkg/engine/sources/...` clean
+- All acceptance criteria grep matches hit
+
+
+
+A caller can create `sources.NewDirSource("./myrepo", "*.log")` and receive chunks for every non-excluded, non-binary file in deterministic order, with files >10MB read via mmap.
+
+
+
diff --git a/.planning/phases/04-input-sources/04-03-PLAN.md b/.planning/phases/04-input-sources/04-03-PLAN.md
new file mode 100644
index 0000000..a25b8b9
--- /dev/null
+++ b/.planning/phases/04-input-sources/04-03-PLAN.md
@@ -0,0 +1,456 @@
+---
+phase: 04-input-sources
+plan: 03
+type: execute
+wave: 1
+depends_on: ["04-01"]
+files_modified:
+ - pkg/engine/sources/git.go
+ - pkg/engine/sources/git_test.go
+autonomous: true
+requirements:
+ - INPUT-02
+must_haves:
+ truths:
+ - "GitSource opens a local git repo via go-git and iterates commits on all branches and tags"
+ - "Each unique blob (by OID) is scanned exactly once — duplicate blobs across commits are skipped"
+    - "Finding.Source is formatted as 'git:<short-sha>:<path>' for every emitted chunk"
+ - "--since filter (passed via GitSource.Since time.Time) excludes commits older than the cutoff"
+ - "Bare repos and regular repos with worktrees both work"
+ artifacts:
+ - path: "pkg/engine/sources/git.go"
+ provides: "GitSource implementing Source interface via go-git/v5"
+ exports: ["GitSource", "NewGitSource"]
+ min_lines: 120
+ - path: "pkg/engine/sources/git_test.go"
+ provides: "Tests using an in-process go-git repo fixture"
+ min_lines: 100
+ key_links:
+ - from: "pkg/engine/sources/git.go"
+ to: "github.com/go-git/go-git/v5"
+ via: "git.PlainOpen"
+ pattern: "git\\.PlainOpen"
+ - from: "pkg/engine/sources/git.go"
+ to: "repo.References"
+ via: "iterating refs/heads + refs/tags"
+ pattern: "References\\(\\)"
+ - from: "pkg/engine/sources/git.go"
+ to: "types.Chunk"
+ via: "channel send with git:sha:path source"
+ pattern: "git:"
+---
+
+
+Implement `GitSource` — a git-history-aware input adapter that walks every commit across every branch and tag in a local repository, deduplicates blob scans by OID, and emits chunks with commit-SHA-prefixed source identifiers. Satisfies INPUT-02.
+
+Purpose: Leaked keys often exist only in git history — deleted from HEAD but still reachable via old commits. A one-shot HEAD scan misses them. This source walks the full commit graph using `go-git/v5` with blob-level deduplication so a 10k-commit repo with 200k historical files scans in minutes, not hours.
+Output: `pkg/engine/sources/git.go` and `git_test.go`. Wired into CLI in plan 04-05.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/phases/04-input-sources/04-CONTEXT.md
+@pkg/engine/sources/source.go
+@pkg/types/chunk.go
+
+
+Source interface:
+```go
+type Source interface {
+ Chunks(ctx context.Context, out chan<- types.Chunk) error
+}
+```
+
+Chunk struct:
+```go
+type Chunk struct {
+ Data []byte
+    Source string // will be "git:<short-sha>:<path>"
+ Offset int64
+}
+```
+
+Relevant go-git/v5 APIs (from https://pkg.go.dev/github.com/go-git/go-git/v5):
+```go
+import "github.com/go-git/go-git/v5"
+import "github.com/go-git/go-git/v5/plumbing"
+import "github.com/go-git/go-git/v5/plumbing/object"
+
+repo, err := git.PlainOpen(path) // opens local repo
+refs, err := repo.References() // iterator over refs
+refs.ForEach(func(*plumbing.Reference) error { }) // walk refs
+commit, err := repo.CommitObject(hash) // resolve commit
+iter, err := repo.Log(&git.LogOptions{From: hash, All: false})
+iter.ForEach(func(*object.Commit) error { }) // walk commits
+tree, err := commit.Tree()
+tree.Files().ForEach(func(*object.File) error { }) // walk blobs
+file.Contents() // returns (string, error)
+file.IsBinary() // (bool, error)
+file.Hash // plumbing.Hash (blob OID)
+```
+
+emitChunks helper from 04-02 plan (pkg/engine/sources/dir.go) — reuse:
+```go
+func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error
+```
+
+
+
+
+
+
+ Task 1: Implement GitSource with full-history traversal and blob deduplication
+
+ - pkg/engine/sources/source.go
+ - pkg/engine/sources/dir.go (for emitChunks helper — produced by plan 04-02)
+ - pkg/types/chunk.go
+ - .planning/phases/04-input-sources/04-CONTEXT.md (Git History section)
+
+
+ pkg/engine/sources/git.go,
+ pkg/engine/sources/git_test.go
+
+
+ - Test 1: GitSource on a fresh repo with 3 commits (each adding a file) emits exactly 3 unique blob scans
+ - Test 2: Second commit modifying file A creates a new blob — both old and new versions are scanned
+ - Test 3: Duplicate blob (same content in two files on same commit) is scanned once (dedup by OID)
+ - Test 4: Multi-branch repo — branch A with file X, branch B with file Y — both are scanned
+ - Test 5: Tag pointing to an old commit makes that commit's blobs reachable
+ - Test 6: Since filter set to "now + 1 hour" emits zero chunks
+ - Test 7: Finding.Source field matches pattern `git:[0-9a-f]{7}:.*`
+ - Test 8: Non-existent repo path returns an error
+
+
+Create `pkg/engine/sources/git.go`:
+
+```go
+package sources
+
+import (
+ "bytes"
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "time"
+
+ "github.com/go-git/go-git/v5"
+ "github.com/go-git/go-git/v5/plumbing"
+ "github.com/go-git/go-git/v5/plumbing/object"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// GitSource scans the full history of a local git repository: every commit
+// on every branch and tag, deduplicating blob scans by OID.
+type GitSource struct {
+ // RepoPath is the path to the local git repo (working tree or bare).
+ RepoPath string
+ // Since, if non-zero, excludes commits older than this timestamp
+ // (using commit author date).
+ Since time.Time
+ // ChunkSize is the overlap-chunker size; zero uses defaultChunkSize.
+ ChunkSize int
+}
+
+// NewGitSource creates a GitSource for the given repo path.
+func NewGitSource(repoPath string) *GitSource {
+ return &GitSource{RepoPath: repoPath, ChunkSize: defaultChunkSize}
+}
+
+// Chunks walks every commit reachable from every branch, tag, and the
+// stash ref (if present), streaming each unique blob's content through
+// the shared emitChunks helper.
+func (g *GitSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+ if g.RepoPath == "" {
+ return errors.New("GitSource: RepoPath is empty")
+ }
+ repo, err := git.PlainOpen(g.RepoPath)
+ if err != nil {
+ return fmt.Errorf("GitSource: open %q: %w", g.RepoPath, err)
+ }
+
+ // Collect commit hashes to walk from every ref under refs/heads, refs/tags, refs/stash.
+ seedCommits, err := collectSeedCommits(repo)
+ if err != nil {
+ return fmt.Errorf("GitSource: collect refs: %w", err)
+ }
+ if len(seedCommits) == 0 {
+ return nil // empty repo is not an error
+ }
+
+ seenCommits := make(map[plumbing.Hash]struct{})
+ seenBlobs := make(map[plumbing.Hash]struct{})
+
+ for _, seed := range seedCommits {
+ if err := ctx.Err(); err != nil {
+ return err
+ }
+ iter, err := repo.Log(&git.LogOptions{From: seed, All: false})
+ if err != nil {
+ continue
+ }
+ err = iter.ForEach(func(c *object.Commit) error {
+ if ctxErr := ctx.Err(); ctxErr != nil {
+ return ctxErr
+ }
+ if _, ok := seenCommits[c.Hash]; ok {
+ return nil
+ }
+ seenCommits[c.Hash] = struct{}{}
+
+ if !g.Since.IsZero() && c.Author.When.Before(g.Since) {
+ return nil
+ }
+ return g.emitCommitBlobs(ctx, c, seenBlobs, out)
+ })
+ iter.Close()
+ if err != nil {
+ if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+ return err
+ }
+ // Swallow per-seed iterator errors; continue with other refs.
+ }
+ }
+ return nil
+}
+
+// collectSeedCommits gathers commit hashes from all local branches, tags,
+// and the stash ref — the union of which reaches every commit worth scanning.
+func collectSeedCommits(repo *git.Repository) ([]plumbing.Hash, error) {
+ var seeds []plumbing.Hash
+ refs, err := repo.References()
+ if err != nil {
+ return nil, err
+ }
+ err = refs.ForEach(func(ref *plumbing.Reference) error {
+ name := ref.Name()
+ if !(name.IsBranch() || name.IsTag() || name == plumbing.ReferenceName("refs/stash") || name.IsRemote()) {
+ return nil
+ }
+ hash := ref.Hash()
+ // For annotated tags the ref points at a tag object; resolve to commit if possible.
+ if name.IsTag() {
+ if tag, err := repo.TagObject(hash); err == nil {
+ if c, err := tag.Commit(); err == nil {
+ hash = c.Hash
+ }
+ }
+ }
+ // Skip symbolic refs (HEAD) whose target we already walked via IsBranch.
+ seeds = append(seeds, hash)
+ return nil
+ })
+ return seeds, err
+}
+
+// emitCommitBlobs walks the tree of a commit and emits every blob whose
+// OID has not already been scanned.
+func (g *GitSource) emitCommitBlobs(ctx context.Context, c *object.Commit, seenBlobs map[plumbing.Hash]struct{}, out chan<- types.Chunk) error {
+ tree, err := c.Tree()
+ if err != nil {
+ return nil // skip unreadable tree
+ }
+ shortSHA := c.Hash.String()[:7]
+
+ return tree.Files().ForEach(func(f *object.File) error {
+ if err := ctx.Err(); err != nil {
+ return err
+ }
+ if _, ok := seenBlobs[f.Hash]; ok {
+ return nil
+ }
+ seenBlobs[f.Hash] = struct{}{}
+
+ // Skip obviously-binary blobs via go-git's helper, then via our sniff.
+ if isBin, _ := f.IsBinary(); isBin {
+ return nil
+ }
+ reader, err := f.Reader()
+ if err != nil {
+ return nil
+ }
+ defer reader.Close()
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ return nil
+ }
+ if len(data) == 0 {
+ return nil
+ }
+ if bytes.IndexByte(data[:minInt(len(data), BinarySniffSize)], 0x00) >= 0 {
+ return nil
+ }
+
+ source := fmt.Sprintf("git:%s:%s", shortSHA, f.Name)
+ return emitChunks(ctx, data, source, g.ChunkSize, out)
+ })
+}
+
+func minInt(a, b int) int {
+ if a < b {
+ return a
+ }
+ return b
+}
+```
+
+Create `pkg/engine/sources/git_test.go` using go-git's in-process fixtures:
+
+```go
+package sources
+
+import (
+ "context"
+ "os"
+ "path/filepath"
+ "regexp"
+ "testing"
+ "time"
+
+ "github.com/go-git/go-git/v5"
+ "github.com/go-git/go-git/v5/plumbing/object"
+ "github.com/stretchr/testify/require"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+func initRepo(t *testing.T) (string, *git.Repository) {
+ t.Helper()
+ dir := t.TempDir()
+ repo, err := git.PlainInit(dir, false)
+ require.NoError(t, err)
+ return dir, repo
+}
+
+func commitFile(t *testing.T, dir string, repo *git.Repository, name, content string) {
+ t.Helper()
+ path := filepath.Join(dir, name)
+ require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755))
+ require.NoError(t, os.WriteFile(path, []byte(content), 0o644))
+ wt, err := repo.Worktree()
+ require.NoError(t, err)
+ _, err = wt.Add(name)
+ require.NoError(t, err)
+ _, err = wt.Commit("add "+name, &git.CommitOptions{
+ Author: &object.Signature{Name: "test", Email: "t@x", When: time.Now()},
+ })
+ require.NoError(t, err)
+}
+
+func drainGit(t *testing.T, src Source) []types.Chunk {
+ t.Helper()
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+ out := make(chan types.Chunk, 1024)
+ errCh := make(chan error, 1)
+ go func() { errCh <- src.Chunks(ctx, out); close(out) }()
+ var got []types.Chunk
+ for c := range out {
+ got = append(got, c)
+ }
+ require.NoError(t, <-errCh)
+ return got
+}
+
+func TestGitSource_HistoryWalk(t *testing.T) {
+ dir, repo := initRepo(t)
+ commitFile(t, dir, repo, "a.txt", "contents alpha")
+ commitFile(t, dir, repo, "b.txt", "contents bravo")
+ commitFile(t, dir, repo, "c.txt", "contents charlie")
+
+ chunks := drainGit(t, NewGitSource(dir))
+ require.GreaterOrEqual(t, len(chunks), 3)
+
+ re := regexp.MustCompile(`^git:[0-9a-f]{7}:.+$`)
+ for _, c := range chunks {
+ require.Regexp(t, re, c.Source)
+ }
+}
+
+func TestGitSource_BlobDeduplication(t *testing.T) {
+ dir, repo := initRepo(t)
+ commitFile(t, dir, repo, "a.txt", "same exact content everywhere")
+ commitFile(t, dir, repo, "b.txt", "same exact content everywhere") // identical blob -> same OID
+ commitFile(t, dir, repo, "c.txt", "different content here")
+
+ chunks := drainGit(t, NewGitSource(dir))
+ // Expect 2 unique blobs scanned, not 3 files.
+ unique := make(map[string]bool)
+ for _, c := range chunks {
+ unique[string(c.Data)] = true
+ }
+ require.Len(t, unique, 2, "duplicate blobs must be deduped by OID")
+}
+
+func TestGitSource_ModifiedFileKeepsBothVersions(t *testing.T) {
+ dir, repo := initRepo(t)
+ commitFile(t, dir, repo, "a.txt", "version one")
+ commitFile(t, dir, repo, "a.txt", "version two") // modifying produces a second blob
+
+ chunks := drainGit(t, NewGitSource(dir))
+ bodies := make(map[string]bool)
+ for _, c := range chunks {
+ bodies[string(c.Data)] = true
+ }
+ require.True(t, bodies["version one"], "old version must still be scanned")
+ require.True(t, bodies["version two"], "new version must be scanned")
+}
+
+func TestGitSource_SinceFilterExcludesAll(t *testing.T) {
+ dir, repo := initRepo(t)
+ commitFile(t, dir, repo, "a.txt", "alpha")
+
+ src := NewGitSource(dir)
+ src.Since = time.Now().Add(1 * time.Hour)
+ chunks := drainGit(t, src)
+ require.Empty(t, chunks)
+}
+
+func TestGitSource_MissingRepo(t *testing.T) {
+ src := NewGitSource(filepath.Join(t.TempDir(), "not-a-repo"))
+ ctx := context.Background()
+ out := make(chan types.Chunk, 1)
+ err := src.Chunks(ctx, out)
+ require.Error(t, err)
+}
+```
+
+Do NOT touch any file outside `pkg/engine/sources/git.go` and `pkg/engine/sources/git_test.go`. CLI wire-up happens in plan 04-05.
+
+
+ go test ./pkg/engine/sources/... -run TestGitSource -race -count=1 -timeout=60s
+
+
+ - `go build ./pkg/engine/sources/...` exits 0
+ - `go test ./pkg/engine/sources/... -run TestGitSource -race -count=1` passes all subtests
+ - `grep -n "git.PlainOpen" pkg/engine/sources/git.go` returns a hit
+ - `grep -n "seenBlobs" pkg/engine/sources/git.go` returns a hit (dedup map)
+ - `grep -n "fmt.Sprintf(\"git:%s:%s\"" pkg/engine/sources/git.go` returns a hit
+ - `grep -n "g.Since" pkg/engine/sources/git.go` returns a hit
+
+
+ GitSource walks all branches/tags, emits each unique blob once, honors Since filter, formats source as `git:<short-sha>:<path>`, and tests cover dedup/history/since/missing-repo.
+
+
+
+
+
+
+- `go test ./pkg/engine/sources/... -run TestGitSource -race` passes
+- `go vet ./pkg/engine/sources/...` clean
+- All grep acceptance checks hit
+
+
+
+A caller can `sources.NewGitSource("./myrepo")` and receive chunks for every historical blob across all refs, with deterministic dedup and source attribution in `git:<short-sha>:<path>` form.
+
+
+
diff --git a/.planning/phases/04-input-sources/04-04-PLAN.md b/.planning/phases/04-input-sources/04-04-PLAN.md
new file mode 100644
index 0000000..75f9edd
--- /dev/null
+++ b/.planning/phases/04-input-sources/04-04-PLAN.md
@@ -0,0 +1,624 @@
+---
+phase: 04-input-sources
+plan: 04
+type: execute
+wave: 1
+depends_on: ["04-01"]
+files_modified:
+ - pkg/engine/sources/stdin.go
+ - pkg/engine/sources/stdin_test.go
+ - pkg/engine/sources/url.go
+ - pkg/engine/sources/url_test.go
+ - pkg/engine/sources/clipboard.go
+ - pkg/engine/sources/clipboard_test.go
+autonomous: true
+requirements:
+ - INPUT-03
+ - INPUT-04
+ - INPUT-05
+must_haves:
+ truths:
+ - "StdinSource reads from an io.Reader and emits chunks with Source='stdin'"
+ - "URLSource fetches an http/https URL with 30s timeout, 50MB cap, rejects file:// and other schemes, and emits chunks with Source='url:<URL>'"
+ - "URLSource rejects responses with non-text Content-Type unless allowlisted (text/*, application/json, application/javascript, application/xml)"
+ - "ClipboardSource reads current clipboard via atotto/clipboard and emits chunks with Source='clipboard'"
+ - "ClipboardSource returns a clear error if clipboard tooling is unavailable"
+ artifacts:
+ - path: "pkg/engine/sources/stdin.go"
+ provides: "StdinSource"
+ exports: ["StdinSource", "NewStdinSource"]
+ min_lines: 40
+ - path: "pkg/engine/sources/url.go"
+ provides: "URLSource with HTTP fetch, timeout, size cap, content-type filter"
+ exports: ["URLSource", "NewURLSource"]
+ min_lines: 100
+ - path: "pkg/engine/sources/clipboard.go"
+ provides: "ClipboardSource wrapping atotto/clipboard"
+ exports: ["ClipboardSource", "NewClipboardSource"]
+ min_lines: 30
+ key_links:
+ - from: "pkg/engine/sources/url.go"
+ to: "net/http"
+ via: "http.Client with Timeout"
+ pattern: "http\\.Client"
+ - from: "pkg/engine/sources/url.go"
+ to: "io.LimitReader"
+ via: "MaxContentLength enforcement"
+ pattern: "LimitReader"
+ - from: "pkg/engine/sources/clipboard.go"
+ to: "github.com/atotto/clipboard"
+ via: "clipboard.ReadAll"
+ pattern: "clipboard\\.ReadAll"
+---
+
+
+Implement three smaller Source adapters in a single plan since each is <80 lines and they share no state:
+- `StdinSource` reads from an injectable `io.Reader` (defaults to `os.Stdin`) — INPUT-03
+- `URLSource` fetches a remote URL via stdlib `net/http` with timeout, size cap, scheme whitelist, and content-type filter — INPUT-04
+- `ClipboardSource` reads the current clipboard via `github.com/atotto/clipboard` with graceful fallback — INPUT-05
+
+Purpose: These three adapters complete the Phase 4 input surface area. Bundling them into one plan keeps wave-1 parallelism healthy (04-02 + 04-03 + 04-04 run simultaneously) while respecting the ~50% context budget since each adapter is self-contained and small.
+Output: Six files total (three sources + three test files).
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/phases/04-input-sources/04-CONTEXT.md
+@pkg/engine/sources/source.go
+@pkg/types/chunk.go
+
+
+Source interface:
+```go
+type Source interface {
+ Chunks(ctx context.Context, out chan<- types.Chunk) error
+}
+```
+
+Shared helper (produced by plan 04-02 in pkg/engine/sources/dir.go):
+```go
+func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error
+```
+
+atotto/clipboard API:
+```go
+import "github.com/atotto/clipboard"
+func ReadAll() (string, error)
+var Unsupported bool // set on platforms without clipboard tooling
+```
+
+
+
+
+
+
+ Task 1: Implement StdinSource, URLSource, and ClipboardSource with full test coverage
+
+ - pkg/engine/sources/source.go
+ - pkg/engine/sources/dir.go (for emitChunks signature from plan 04-02)
+ - pkg/types/chunk.go
+ - .planning/phases/04-input-sources/04-CONTEXT.md (Stdin, URL, Clipboard sections)
+
+
+ pkg/engine/sources/stdin.go,
+ pkg/engine/sources/stdin_test.go,
+ pkg/engine/sources/url.go,
+ pkg/engine/sources/url_test.go,
+ pkg/engine/sources/clipboard.go,
+ pkg/engine/sources/clipboard_test.go
+
+
+ StdinSource:
+ - Test 1: Feeding "API_KEY=xyz" through a bytes.Buffer emits one chunk with Source="stdin"
+ - Test 2: Empty input emits zero chunks without error
+ - Test 3: ctx cancellation returns ctx.Err()
+ URLSource:
+ - Test 4: Fetches content from httptest.Server, emits a chunk with Source="url:<server URL>"
+ - Test 5: Server returning 50MB+1 body is rejected with a size error
+ - Test 6: Server returning Content-Type image/png is rejected
+ - Test 7: Scheme "file:///etc/passwd" is rejected without any request attempt
+ - Test 8: Server returning 500 returns a non-nil error containing "500"
+ - Test 9: HTTP 301 redirect is followed (max 5 hops)
+ ClipboardSource:
+ - Test 10: If clipboard.Unsupported is true, returns an error with "clipboard" in the message
+ - Test 11: Otherwise reads clipboard (may skip if empty on CI) — use build tag or t.Skip guard
+
+
+
+Create `pkg/engine/sources/stdin.go`:
+
+```go
+package sources
+
+import (
+ "context"
+ "io"
+ "os"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// StdinSource reads content from an io.Reader (defaults to os.Stdin) and
+// emits overlapping chunks. Used when a user runs `keyhunter scan stdin`
+// or `keyhunter scan -`.
+type StdinSource struct {
+ Reader io.Reader
+ ChunkSize int
+}
+
+// NewStdinSource returns a StdinSource bound to os.Stdin.
+func NewStdinSource() *StdinSource {
+ return &StdinSource{Reader: os.Stdin, ChunkSize: defaultChunkSize}
+}
+
+// NewStdinSourceFrom returns a StdinSource bound to the given reader
+// (used primarily by tests).
+func NewStdinSourceFrom(r io.Reader) *StdinSource {
+ return &StdinSource{Reader: r, ChunkSize: defaultChunkSize}
+}
+
+// Chunks reads the entire input, then hands it to the shared chunk emitter.
+func (s *StdinSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+ if s.Reader == nil {
+ s.Reader = os.Stdin
+ }
+ data, err := io.ReadAll(s.Reader)
+ if err != nil {
+ return err
+ }
+ if len(data) == 0 {
+ return nil
+ }
+ return emitChunks(ctx, data, "stdin", s.ChunkSize, out)
+}
+```
+
+Create `pkg/engine/sources/stdin_test.go`:
+
+```go
+package sources
+
+import (
+ "bytes"
+ "context"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+func TestStdinSource_Basic(t *testing.T) {
+ src := NewStdinSourceFrom(bytes.NewBufferString("API_KEY=sk-test-xyz"))
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ out := make(chan types.Chunk, 8)
+ errCh := make(chan error, 1)
+ go func() { errCh <- src.Chunks(ctx, out); close(out) }()
+
+ var got []types.Chunk
+ for c := range out {
+ got = append(got, c)
+ }
+ require.NoError(t, <-errCh)
+ require.Len(t, got, 1)
+ require.Equal(t, "stdin", got[0].Source)
+ require.Equal(t, "API_KEY=sk-test-xyz", string(got[0].Data))
+}
+
+func TestStdinSource_Empty(t *testing.T) {
+ src := NewStdinSourceFrom(bytes.NewBuffer(nil))
+ out := make(chan types.Chunk, 1)
+ err := src.Chunks(context.Background(), out)
+ close(out)
+ require.NoError(t, err)
+ require.Len(t, out, 0)
+}
+
+func TestStdinSource_CtxCancel(t *testing.T) {
+ // Large buffer so emitChunks iterates and can observe cancellation.
+ data := make([]byte, 1<<20)
+ src := NewStdinSourceFrom(bytes.NewReader(data))
+ ctx, cancel := context.WithCancel(context.Background())
+ cancel()
+ out := make(chan types.Chunk) // unbuffered forces select on ctx
+ err := src.Chunks(ctx, out)
+ require.ErrorIs(t, err, context.Canceled)
+}
+```
+
+Create `pkg/engine/sources/url.go`:
+
+```go
+package sources
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "strings"
+ "time"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// MaxURLContentLength is the hard cap on URLSource response bodies.
+const MaxURLContentLength int64 = 50 * 1024 * 1024 // 50 MB
+
+// DefaultURLTimeout is the overall request timeout (connect + read + body).
+const DefaultURLTimeout = 30 * time.Second
+
+// allowedContentTypes is the whitelist of Content-Type prefixes URLSource
+// will accept. Binary types (images, archives, executables) are rejected.
+var allowedContentTypes = []string{
+ "text/",
+ "application/json",
+ "application/javascript",
+ "application/xml",
+ "application/x-yaml",
+ "application/yaml",
+}
+
+// URLSource fetches a remote resource over HTTP(S) and emits its body as chunks.
+type URLSource struct {
+ URL string
+ Client *http.Client
+ UserAgent string
+ Insecure bool // skip TLS verification (default false)
+ ChunkSize int
+}
+
+// NewURLSource creates a URLSource with sane defaults.
+func NewURLSource(rawURL string) *URLSource {
+ return &URLSource{
+ URL: rawURL,
+ Client: defaultHTTPClient(),
+ UserAgent: "keyhunter/dev",
+ ChunkSize: defaultChunkSize,
+ }
+}
+
+func defaultHTTPClient() *http.Client {
+ return &http.Client{
+ Timeout: DefaultURLTimeout,
+ CheckRedirect: func(req *http.Request, via []*http.Request) error {
+ if len(via) >= 5 {
+ return errors.New("stopped after 5 redirects")
+ }
+ return nil
+ },
+ }
+}
+
+// Chunks validates the URL, issues a GET, and emits the response body as chunks.
+func (u *URLSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+ parsed, err := url.Parse(u.URL)
+ if err != nil {
+ return fmt.Errorf("URLSource: parse %q: %w", u.URL, err)
+ }
+ if parsed.Scheme != "http" && parsed.Scheme != "https" {
+ return fmt.Errorf("URLSource: unsupported scheme %q (only http/https)", parsed.Scheme)
+ }
+
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.URL, nil)
+ if err != nil {
+ return fmt.Errorf("URLSource: new request: %w", err)
+ }
+ req.Header.Set("User-Agent", u.UserAgent)
+
+ client := u.Client
+ if client == nil {
+ client = defaultHTTPClient()
+ }
+ resp, err := client.Do(req)
+ if err != nil {
+ return fmt.Errorf("URLSource: fetch: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ return fmt.Errorf("URLSource: non-2xx status %d from %s", resp.StatusCode, u.URL)
+ }
+
+ ct := resp.Header.Get("Content-Type")
+ if !isAllowedContentType(ct) {
+ return fmt.Errorf("URLSource: disallowed Content-Type %q", ct)
+ }
+
+ if resp.ContentLength > MaxURLContentLength {
+ return fmt.Errorf("URLSource: Content-Length %d exceeds cap %d", resp.ContentLength, MaxURLContentLength)
+ }
+
+ // LimitReader cap + 1 to detect overflow even if ContentLength was missing/wrong.
+ limited := io.LimitReader(resp.Body, MaxURLContentLength+1)
+ data, err := io.ReadAll(limited)
+ if err != nil {
+ return fmt.Errorf("URLSource: read body: %w", err)
+ }
+ if int64(len(data)) > MaxURLContentLength {
+ return fmt.Errorf("URLSource: body exceeds %d bytes", MaxURLContentLength)
+ }
+ if len(data) == 0 {
+ return nil
+ }
+
+ source := "url:" + u.URL
+ return emitChunks(ctx, data, source, u.ChunkSize, out)
+}
+
+func isAllowedContentType(ct string) bool {
+ if ct == "" {
+ return true // some servers omit; trust and scan
+ }
+ // Strip parameters like "; charset=utf-8".
+ if idx := strings.Index(ct, ";"); idx >= 0 {
+ ct = ct[:idx]
+ }
+ ct = strings.TrimSpace(strings.ToLower(ct))
+ for _, prefix := range allowedContentTypes {
+ if strings.HasPrefix(ct, prefix) {
+ return true
+ }
+ }
+ return false
+}
+```
+
+Create `pkg/engine/sources/url_test.go`:
+
+```go
+package sources
+
+import (
+ "context"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+func drainURL(t *testing.T, src Source) ([]types.Chunk, error) {
+ t.Helper()
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+ defer cancel()
+ out := make(chan types.Chunk, 256)
+ errCh := make(chan error, 1)
+ go func() { errCh <- src.Chunks(ctx, out); close(out) }()
+ var got []types.Chunk
+ for c := range out {
+ got = append(got, c)
+ }
+ return got, <-errCh
+}
+
+func TestURLSource_Fetches(t *testing.T) {
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ _, _ = w.Write([]byte("API_KEY=sk-live-xyz"))
+ }))
+ defer srv.Close()
+
+ chunks, err := drainURL(t, NewURLSource(srv.URL))
+ require.NoError(t, err)
+ require.Len(t, chunks, 1)
+ require.Equal(t, "url:"+srv.URL, chunks[0].Source)
+ require.Equal(t, "API_KEY=sk-live-xyz", string(chunks[0].Data))
+}
+
+func TestURLSource_RejectsBinaryContentType(t *testing.T) {
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "image/png")
+ _, _ = w.Write([]byte{0x89, 0x50, 0x4e, 0x47})
+ }))
+ defer srv.Close()
+
+ _, err := drainURL(t, NewURLSource(srv.URL))
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "Content-Type")
+}
+
+func TestURLSource_RejectsNonHTTPScheme(t *testing.T) {
+ _, err := drainURL(t, NewURLSource("file:///etc/passwd"))
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "unsupported scheme")
+}
+
+func TestURLSource_Rejects500(t *testing.T) {
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ http.Error(w, "boom", http.StatusInternalServerError)
+ }))
+ defer srv.Close()
+
+ _, err := drainURL(t, NewURLSource(srv.URL))
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "500")
+}
+
+func TestURLSource_RejectsOversizeBody(t *testing.T) {
+ // Serve a body just over the 50MB cap so URLSource must reject it.
+ // NOTE(review): this allocates and transfers ~50MB per run — consider making
+ // the cap injectable so the test can use a small override and stay fast.
+ big := strings.Repeat("a", int(MaxURLContentLength)+10)
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ _, _ = w.Write([]byte(big))
+ }))
+ defer srv.Close()
+
+ _, err := drainURL(t, NewURLSource(srv.URL))
+ require.Error(t, err)
+}
+
+func TestURLSource_FollowsRedirect(t *testing.T) {
+ target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ _, _ = w.Write([]byte("redirected body"))
+ }))
+ defer target.Close()
+
+ redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ http.Redirect(w, r, target.URL, http.StatusMovedPermanently)
+ }))
+ defer redirector.Close()
+
+ chunks, err := drainURL(t, NewURLSource(redirector.URL))
+ require.NoError(t, err)
+ require.NotEmpty(t, chunks)
+ require.Contains(t, string(chunks[0].Data), "redirected body")
+}
+```
+
+Create `pkg/engine/sources/clipboard.go`:
+
+```go
+package sources
+
+import (
+ "context"
+ "errors"
+ "fmt"
+
+ "github.com/atotto/clipboard"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// ClipboardSource reads the current OS clipboard contents and emits them
+// as a single chunk stream with Source="clipboard". Requires xclip/xsel/
+// wl-clipboard on Linux, pbpaste on macOS, or native API on Windows.
+type ClipboardSource struct {
+ // Reader overrides the clipboard reader; when nil the real clipboard is used.
+ // Tests inject a func returning a fixture.
+ Reader func() (string, error)
+ ChunkSize int
+}
+
+// NewClipboardSource returns a ClipboardSource bound to the real OS clipboard.
+func NewClipboardSource() *ClipboardSource {
+ return &ClipboardSource{Reader: clipboard.ReadAll, ChunkSize: defaultChunkSize}
+}
+
+// Chunks reads the clipboard and emits its contents.
+func (c *ClipboardSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+ if clipboard.Unsupported && c.Reader == nil {
+ return errors.New("ClipboardSource: clipboard tooling unavailable (install xclip/xsel/wl-clipboard on Linux)")
+ }
+ reader := c.Reader
+ if reader == nil {
+ reader = clipboard.ReadAll
+ }
+ text, err := reader()
+ if err != nil {
+ return fmt.Errorf("ClipboardSource: read: %w", err)
+ }
+ if text == "" {
+ return nil
+ }
+ return emitChunks(ctx, []byte(text), "clipboard", c.ChunkSize, out)
+}
+```
+
+Create `pkg/engine/sources/clipboard_test.go`:
+
+```go
+package sources
+
+import (
+ "context"
+ "errors"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+func TestClipboardSource_FixtureReader(t *testing.T) {
+ src := &ClipboardSource{
+ Reader: func() (string, error) { return "sk-live-xxxxxx", nil },
+ ChunkSize: defaultChunkSize,
+ }
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ out := make(chan types.Chunk, 4)
+ errCh := make(chan error, 1)
+ go func() { errCh <- src.Chunks(ctx, out); close(out) }()
+
+ var got []types.Chunk
+ for c := range out {
+ got = append(got, c)
+ }
+ require.NoError(t, <-errCh)
+ require.Len(t, got, 1)
+ require.Equal(t, "clipboard", got[0].Source)
+ require.Equal(t, "sk-live-xxxxxx", string(got[0].Data))
+}
+
+func TestClipboardSource_ReaderError(t *testing.T) {
+ src := &ClipboardSource{
+ Reader: func() (string, error) { return "", errors.New("no xclip installed") },
+ }
+ out := make(chan types.Chunk, 1)
+ err := src.Chunks(context.Background(), out)
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "clipboard")
+}
+
+func TestClipboardSource_EmptyClipboard(t *testing.T) {
+ src := &ClipboardSource{
+ Reader: func() (string, error) { return "", nil },
+ }
+ out := make(chan types.Chunk, 1)
+ err := src.Chunks(context.Background(), out)
+ require.NoError(t, err)
+ require.Len(t, out, 0)
+}
+```
+
+Do NOT modify `cmd/scan.go` in this plan. Do NOT create `pkg/engine/sources/dir.go`, `git.go`, or touch `file.go` — those are owned by plans 04-02 and 04-03.
+
+
+ go test ./pkg/engine/sources/... -run 'TestStdinSource|TestURLSource|TestClipboardSource' -race -count=1
+
+
+ - `go build ./pkg/engine/sources/...` exits 0
+ - `go test ./pkg/engine/sources/... -run 'TestStdinSource|TestURLSource|TestClipboardSource' -race` passes all subtests
+ - `grep -n "http.Client" pkg/engine/sources/url.go` hits
+ - `grep -n "LimitReader" pkg/engine/sources/url.go` hits
+ - `grep -n "clipboard.ReadAll" pkg/engine/sources/clipboard.go` hits
+ - `grep -n "\"stdin\"" pkg/engine/sources/stdin.go` hits (source label)
+ - `grep -n "\"url:\" + u.URL\\|\"url:\"+u.URL" pkg/engine/sources/url.go` hits
+
+
+ StdinSource, URLSource, and ClipboardSource all implement Source, enforce their respective safety limits (stdin read-to-EOF, url scheme/size/content-type whitelist, clipboard tooling check), and their tests pass under -race.
+
+
+
+
+
+
+- `go test ./pkg/engine/sources/... -race -count=1` passes including new tests
+- `go vet ./pkg/engine/sources/...` clean
+- All grep acceptance checks hit
+
+
+
+Three new source adapters exist, each self-contained, each with test coverage, and none conflicting with file ownership of plans 04-02 (dir/file) or 04-03 (git).
+
+
+
diff --git a/.planning/phases/04-input-sources/04-05-PLAN.md b/.planning/phases/04-input-sources/04-05-PLAN.md
new file mode 100644
index 0000000..35044de
--- /dev/null
+++ b/.planning/phases/04-input-sources/04-05-PLAN.md
@@ -0,0 +1,435 @@
+---
+phase: 04-input-sources
+plan: 05
+type: execute
+wave: 2
+depends_on: ["04-02", "04-03", "04-04"]
+files_modified:
+ - cmd/scan.go
+ - cmd/scan_sources_test.go
+autonomous: true
+requirements:
+ - INPUT-06
+must_haves:
+ truths:
+ - "keyhunter scan uses DirSource when target is a directory (not FileSource)"
+ - "keyhunter scan continues to use FileSource when target is a single file"
+ - "keyhunter scan --git uses GitSource, honoring --since YYYY-MM-DD"
+ - "keyhunter scan stdin and keyhunter scan - both use StdinSource"
+ - "keyhunter scan --url uses URLSource"
+ - "keyhunter scan --clipboard uses ClipboardSource (no positional arg required)"
+ - "--exclude flags are forwarded to DirSource"
+ - "Exactly one source is selected — conflicting flags return an error"
+ artifacts:
+ - path: "cmd/scan.go"
+ provides: "Source-selection logic dispatching to the appropriate Source implementation"
+ contains: "selectSource"
+ min_lines: 180
+ - path: "cmd/scan_sources_test.go"
+ provides: "Unit tests for selectSource covering every flag combination"
+ min_lines: 80
+ key_links:
+ - from: "cmd/scan.go"
+ to: "pkg/engine/sources"
+ via: "sources.NewDirSource/NewGitSource/NewStdinSource/NewURLSource/NewClipboardSource"
+ pattern: "sources\\.New(Dir|Git|Stdin|URL|Clipboard)Source"
+ - from: "cmd/scan.go"
+ to: "cobra flags"
+ via: "--git, --url, --clipboard, --since, --exclude"
+ pattern: "\\-\\-git|\\-\\-url|\\-\\-clipboard|\\-\\-since"
+---
+
+
+Wire the five new source adapters (DirSource, GitSource, StdinSource, URLSource, ClipboardSource) into `cmd/scan.go` via a new `selectSource` helper that inspects CLI flags and positional args to pick exactly one source. Satisfies INPUT-06 (the "all inputs flow through the same pipeline" integration requirement).
+
+Purpose: Plans 04-02 through 04-04 deliver the Source implementations in isolation. This plan is the single integration point that makes them reachable from the CLI, with argument validation to prevent ambiguous invocations like `keyhunter scan --git --url https://...`.
+Output: Updated `cmd/scan.go` with new flags and dispatching logic, plus a focused test file exercising `selectSource` directly.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/phases/04-input-sources/04-CONTEXT.md
+@cmd/scan.go
+@pkg/engine/sources/source.go
+
+
+Source constructors from Wave 1 plans:
+```go
+// Plan 04-02
+func NewFileSource(path string) *FileSource
+func NewDirSource(root string, extraExcludes ...string) *DirSource
+func NewDirSourceRaw(root string, excludes []string) *DirSource
+
+// Plan 04-03
+func NewGitSource(repoPath string) *GitSource
+type GitSource struct {
+ RepoPath string
+ Since time.Time
+ ChunkSize int
+}
+
+// Plan 04-04
+func NewStdinSource() *StdinSource
+func NewURLSource(rawURL string) *URLSource
+func NewClipboardSource() *ClipboardSource
+```
+
+Existing cmd/scan.go contract (see file for full body):
+- Package `cmd`
+- Uses `sources.NewFileSource(target)` unconditionally today
+- Has `flagExclude []string` already declared
+- init() registers flags: --workers, --verify, --unmask, --output, --exclude
+
+
+
+
+
+
+ Task 1: Add source-selection flags and dispatch logic to cmd/scan.go
+
+ - cmd/scan.go (full file)
+ - pkg/engine/sources/source.go
+ - pkg/engine/sources/dir.go (produced by 04-02)
+ - pkg/engine/sources/git.go (produced by 04-03)
+ - pkg/engine/sources/stdin.go (produced by 04-04)
+ - pkg/engine/sources/url.go (produced by 04-04)
+ - pkg/engine/sources/clipboard.go (produced by 04-04)
+
+ cmd/scan.go, cmd/scan_sources_test.go
+
+ - Test 1: selectSource with target="." on a directory returns a *DirSource
+ - Test 2: selectSource with target pointing to a file returns a *FileSource
+ - Test 3: selectSource with flagGit=true and target="./repo" returns a *GitSource
+ - Test 4: selectSource with flagGit=true and flagSince="2024-01-01" sets GitSource.Since correctly
+ - Test 5: selectSource with invalid --since format returns a parse error
+ - Test 6: selectSource with flagURL set returns a *URLSource
+ - Test 7: selectSource with flagClipboard=true and no args returns a *ClipboardSource
+ - Test 8: selectSource with target="stdin" returns a *StdinSource
+ - Test 9: selectSource with target="-" returns a *StdinSource
+ - Test 10: selectSource with both --git and --url set returns an error
+ - Test 11: selectSource with --clipboard and a positional target returns an error
+ - Test 12: selectSource forwards --exclude patterns into DirSource.Excludes
+
+
+
+Edit `cmd/scan.go`. The end state must:
+
+1. Add new package-level flag vars alongside the existing ones:
+
+```go
+var (
+ flagWorkers int
+ flagVerify bool
+ flagUnmask bool
+ flagOutput string
+ flagExclude []string
+ flagGit bool
+ flagURL string
+ flagClipboard bool
+ flagSince string
+ flagMaxFileSize int64
+ flagInsecure bool
+)
+```
+
+2. Change `scanCmd.Args` so a positional target is optional when `--url` or `--clipboard` is used:
+
+```go
+var scanCmd = &cobra.Command{
+ Use: "scan [path|stdin|-]",
+ Short: "Scan files, directories, git history, stdin, URLs, or clipboard for leaked API keys",
+ Args: cobra.MaximumNArgs(1),
+ RunE: func(cmd *cobra.Command, args []string) error {
+ // ... existing config load ...
+
+ src, err := selectSource(args, sourceFlags{
+ Git: flagGit,
+ URL: flagURL,
+ Clipboard: flagClipboard,
+ Since: flagSince,
+ Excludes: flagExclude,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Replace the old `src := sources.NewFileSource(target)` line with use of the dispatched src.
+ // Keep all downstream code unchanged (engine, storage, output).
+
+ // ... rest of existing RunE body, using src ...
+ _ = src
+ return nil // placeholder — keep existing logic
+ },
+}
+```
+
+3. Add the selectSource helper and its supporting struct, in `cmd/scan.go`:
+
+```go
+// sourceFlags captures the CLI inputs that control source selection.
+// Extracted into a struct so selectSource is straightforward to unit test.
+type sourceFlags struct {
+ Git bool
+ URL string
+ Clipboard bool
+ Since string
+ Excludes []string
+}
+
+// selectSource inspects positional args and source flags, validates that
+// exactly one source is specified, and returns the appropriate Source.
+func selectSource(args []string, f sourceFlags) (sources.Source, error) {
+ // Count explicit source selectors that take no positional path.
+ explicitCount := 0
+ if f.URL != "" {
+ explicitCount++
+ }
+ if f.Clipboard {
+ explicitCount++
+ }
+ if f.Git {
+ explicitCount++
+ }
+ if explicitCount > 1 {
+ return nil, fmt.Errorf("scan: --git, --url, and --clipboard are mutually exclusive")
+ }
+
+ // Clipboard and URL take no positional argument.
+ if f.Clipboard {
+ if len(args) > 0 {
+ return nil, fmt.Errorf("scan: --clipboard does not accept a positional argument")
+ }
+ return sources.NewClipboardSource(), nil
+ }
+ if f.URL != "" {
+ if len(args) > 0 {
+ return nil, fmt.Errorf("scan: --url does not accept a positional argument")
+ }
+ return sources.NewURLSource(f.URL), nil
+ }
+
+ if len(args) == 0 {
+ return nil, fmt.Errorf("scan: missing target (path, stdin, -, or a source flag)")
+ }
+ target := args[0]
+
+ if target == "stdin" || target == "-" {
+ if f.Git {
+ return nil, fmt.Errorf("scan: --git cannot be combined with stdin")
+ }
+ return sources.NewStdinSource(), nil
+ }
+
+ if f.Git {
+ gs := sources.NewGitSource(target)
+ if f.Since != "" {
+ t, err := time.Parse("2006-01-02", f.Since)
+ if err != nil {
+ return nil, fmt.Errorf("scan: --since must be YYYY-MM-DD: %w", err)
+ }
+ gs.Since = t
+ }
+ return gs, nil
+ }
+
+ info, err := os.Stat(target)
+ if err != nil {
+ return nil, fmt.Errorf("scan: stat %q: %w", target, err)
+ }
+ if info.IsDir() {
+ return sources.NewDirSource(target, f.Excludes...), nil
+ }
+ return sources.NewFileSource(target), nil
+}
+```
+
+4. In the existing `init()`, register the new flags next to the existing ones:
+
+```go
+func init() {
+ scanCmd.Flags().IntVar(&flagWorkers, "workers", 0, "number of worker goroutines (default: CPU*8)")
+ scanCmd.Flags().BoolVar(&flagVerify, "verify", false, "actively verify found keys (opt-in, Phase 5)")
+ scanCmd.Flags().BoolVar(&flagUnmask, "unmask", false, "show full key values (default: masked)")
+ scanCmd.Flags().StringVar(&flagOutput, "output", "table", "output format: table, json")
+ scanCmd.Flags().StringSliceVar(&flagExclude, "exclude", nil, "extra glob patterns to exclude (e.g. *.min.js)")
+
+ // Phase 4 source-selection flags.
+ scanCmd.Flags().BoolVar(&flagGit, "git", false, "treat target as a git repo and scan full history")
+ scanCmd.Flags().StringVar(&flagURL, "url", "", "fetch and scan a remote http(s) URL (no positional arg)")
+ scanCmd.Flags().BoolVar(&flagClipboard, "clipboard", false, "scan current clipboard contents")
+ scanCmd.Flags().StringVar(&flagSince, "since", "", "for --git: only scan commits after YYYY-MM-DD")
+ scanCmd.Flags().Int64Var(&flagMaxFileSize, "max-file-size", 0, "max file size in bytes to scan (0 = unlimited)")
+ scanCmd.Flags().BoolVar(&flagInsecure, "insecure", false, "for --url: skip TLS certificate verification")
+
+ _ = viper.BindPFlag("scan.workers", scanCmd.Flags().Lookup("workers"))
+}
+```
+
+5. Replace the single line `src := sources.NewFileSource(target)` in the existing RunE body with the `selectSource` dispatch. Leave ALL downstream code (engine.Scan, storage.SaveFinding, output switch, exit code logic) untouched. Ensure the `target` variable is only used where relevant (it is no longer the sole driver of source construction).
+
+6. Add the `time` import to `cmd/scan.go`.
+
+Create `cmd/scan_sources_test.go`:
+
+```go
+package cmd
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "github.com/salvacybersec/keyhunter/pkg/engine/sources"
+)
+
+func TestSelectSource_Directory(t *testing.T) {
+ dir := t.TempDir()
+ src, err := selectSource([]string{dir}, sourceFlags{})
+ require.NoError(t, err)
+ _, ok := src.(*sources.DirSource)
+ require.True(t, ok, "expected *DirSource, got %T", src)
+}
+
+func TestSelectSource_File(t *testing.T) {
+ dir := t.TempDir()
+ f := filepath.Join(dir, "a.txt")
+ require.NoError(t, os.WriteFile(f, []byte("x"), 0o644))
+ src, err := selectSource([]string{f}, sourceFlags{})
+ require.NoError(t, err)
+ _, ok := src.(*sources.FileSource)
+ require.True(t, ok, "expected *FileSource, got %T", src)
+}
+
+func TestSelectSource_Git(t *testing.T) {
+ src, err := selectSource([]string{"./some-repo"}, sourceFlags{Git: true})
+ require.NoError(t, err)
+ gs, ok := src.(*sources.GitSource)
+ require.True(t, ok, "expected *GitSource, got %T", src)
+ require.Equal(t, "./some-repo", gs.RepoPath)
+}
+
+func TestSelectSource_GitSince(t *testing.T) {
+ src, err := selectSource([]string{"./repo"}, sourceFlags{Git: true, Since: "2024-01-15"})
+ require.NoError(t, err)
+ gs := src.(*sources.GitSource)
+ want, _ := time.Parse("2006-01-02", "2024-01-15")
+ require.Equal(t, want, gs.Since)
+}
+
+func TestSelectSource_GitSinceBadFormat(t *testing.T) {
+ _, err := selectSource([]string{"./repo"}, sourceFlags{Git: true, Since: "15/01/2024"})
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "YYYY-MM-DD")
+}
+
+func TestSelectSource_URL(t *testing.T) {
+ src, err := selectSource(nil, sourceFlags{URL: "https://example.com/a.js"})
+ require.NoError(t, err)
+ _, ok := src.(*sources.URLSource)
+ require.True(t, ok)
+}
+
+func TestSelectSource_URLRejectsPositional(t *testing.T) {
+ _, err := selectSource([]string{"./foo"}, sourceFlags{URL: "https://x"})
+ require.Error(t, err)
+}
+
+func TestSelectSource_Clipboard(t *testing.T) {
+ src, err := selectSource(nil, sourceFlags{Clipboard: true})
+ require.NoError(t, err)
+ _, ok := src.(*sources.ClipboardSource)
+ require.True(t, ok)
+}
+
+func TestSelectSource_ClipboardRejectsPositional(t *testing.T) {
+ _, err := selectSource([]string{"./foo"}, sourceFlags{Clipboard: true})
+ require.Error(t, err)
+}
+
+func TestSelectSource_Stdin(t *testing.T) {
+ for _, tok := range []string{"stdin", "-"} {
+ src, err := selectSource([]string{tok}, sourceFlags{})
+ require.NoError(t, err)
+ _, ok := src.(*sources.StdinSource)
+ require.True(t, ok, "token %q: expected *StdinSource, got %T", tok, src)
+ }
+}
+
+func TestSelectSource_MutuallyExclusive(t *testing.T) {
+ _, err := selectSource(nil, sourceFlags{Git: true, URL: "https://x"})
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "mutually exclusive")
+}
+
+func TestSelectSource_MissingTarget(t *testing.T) {
+ _, err := selectSource(nil, sourceFlags{})
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "missing target")
+}
+
+func TestSelectSource_DirForwardsExcludes(t *testing.T) {
+ dir := t.TempDir()
+ src, err := selectSource([]string{dir}, sourceFlags{Excludes: []string{"*.log", "tmp/**"}})
+ require.NoError(t, err)
+ ds := src.(*sources.DirSource)
+ // NewDirSource merges DefaultExcludes with extras, so user patterns must be present.
+ found := 0
+ for _, e := range ds.Excludes {
+ if e == "*.log" || e == "tmp/**" {
+ found++
+ }
+ }
+ require.Equal(t, 2, found, "user excludes not forwarded, got %v", ds.Excludes)
+}
+```
+
+After making these changes, run `go build ./...` and fix any import or compile errors. Do NOT modify pkg/engine/sources/* files — they are owned by Wave 1 plans.
+
+
+ go build ./... && go test ./cmd/... -run TestSelectSource -race -count=1
+
+
+ - `go build ./...` exits 0
+ - `go test ./cmd/... -run TestSelectSource -race -count=1` passes all 13 subtests
+ - `go test ./... -race -count=1` full suite passes
+ - `grep -n "selectSource" cmd/scan.go` returns at least two hits (definition + call site)
+ - `grep -n "flagGit\|flagURL\|flagClipboard\|flagSince" cmd/scan.go` returns at least 4 hits
+ - `grep -n "sources.NewDirSource\|sources.NewGitSource\|sources.NewStdinSource\|sources.NewURLSource\|sources.NewClipboardSource" cmd/scan.go` returns 5 hits
+ - `grep -n "mutually exclusive" cmd/scan.go` returns a hit
+ - `keyhunter scan --help` (via `go run . scan --help`) lists --git, --url, --clipboard, --since flags
+
+
+ cmd/scan.go dispatches to the correct Source implementation based on positional args and flags, with unambiguous error messages for conflicting selectors. All selectSource tests pass under -race. The existing single-file FileSource path still works unchanged.
+
+
+
+
+
+
+- `go build ./...` exits 0
+- `go test ./... -race -count=1` full suite green (including earlier Wave 1 plan tests)
+- `go run . scan --help` lists new flags
+- `go run . scan ./pkg` completes successfully (DirSource path)
+- `echo "API_KEY=test" | go run . scan -` completes successfully (StdinSource path)
+
+
+
+Users can invoke every Phase 4 input mode from the CLI and each one flows through the unchanged three-stage detection pipeline. INPUT-01 through INPUT-05 are reachable via CLI, and INPUT-06 (the integration meta-requirement) is satisfied by the passing test suite plus the help-text listing.
+
+
+
diff --git a/.planning/phases/12-osint_iot_cloud_storage/12-02-SUMMARY.md b/.planning/phases/12-osint_iot_cloud_storage/12-02-SUMMARY.md
new file mode 100644
index 0000000..ec4a2ee
--- /dev/null
+++ b/.planning/phases/12-osint_iot_cloud_storage/12-02-SUMMARY.md
@@ -0,0 +1,103 @@
+---
+phase: 12-osint_iot_cloud_storage
+plan: 02
+subsystem: recon
+tags: [fofa, netlas, binaryedge, iot, osint, httptest]
+
+requires:
+ - phase: 09-osint-infrastructure
+ provides: LimiterRegistry, shared Client retry/backoff HTTP
+ - phase: 10-osint-code-hosting
+ provides: ReconSource interface pattern, BuildQueries, keywordIndex helpers
+provides:
+ - FOFASource implementing recon.ReconSource for FOFA internet search
+ - NetlasSource implementing recon.ReconSource for Netlas intelligence API
+ - BinaryEdgeSource implementing recon.ReconSource for BinaryEdge data API
+affects: [12-osint_iot_cloud_storage, cmd/recon]
+
+tech-stack:
+ added: []
+ patterns: [base64-encoded query params for FOFA, X-API-Key header auth for Netlas, X-Key header auth for BinaryEdge]
+
+key-files:
+ created:
+ - pkg/recon/sources/fofa.go
+ - pkg/recon/sources/fofa_test.go
+ - pkg/recon/sources/netlas.go
+ - pkg/recon/sources/netlas_test.go
+ - pkg/recon/sources/binaryedge.go
+ - pkg/recon/sources/binaryedge_test.go
+ modified: []
+
+key-decisions:
+ - "FOFA uses base64-encoded qbase64 param with email+key auth in query string"
+ - "Netlas uses X-API-Key header; BinaryEdge uses X-Key header for auth"
+ - "All three sources use bare keyword queries (default formatQuery path)"
+
+patterns-established:
+ - "IoT scanner source pattern: struct with APIKey/BaseURL/Registry/Limiters + lazy client init"
+
+requirements-completed: [RECON-IOT-04, RECON-IOT-05, RECON-IOT-06]
+
+duration: 2min
+completed: 2026-04-06
+---
+
+# Phase 12 Plan 02: FOFA, Netlas, BinaryEdge Sources Summary
+
+**Three IoT/device scanner recon sources (FOFA, Netlas, BinaryEdge) with httptest-based unit tests covering sweep, auth, and cancellation**
+
+## Performance
+
+- **Duration:** 2 min
+- **Started:** 2026-04-06T09:22:18Z
+- **Completed:** 2026-04-06T09:24:22Z
+- **Tasks:** 2
+- **Files modified:** 6
+
+## Accomplishments
+- FOFASource searches FOFA API with base64-encoded queries and email+key authentication
+- NetlasSource searches Netlas API with X-API-Key header authentication
+- BinaryEdgeSource searches BinaryEdge API with X-Key header authentication
+- All three sources follow established Phase 10 pattern with shared Client, LimiterRegistry, BuildQueries
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Implement FOFASource, NetlasSource, BinaryEdgeSource** - `270bbbf` (feat)
+2. **Task 2: Unit tests for FOFA, Netlas, BinaryEdge sources** - `d6c35f4` (test)
+
+## Files Created/Modified
+- `pkg/recon/sources/fofa.go` - FOFASource with base64 query encoding and dual-credential auth
+- `pkg/recon/sources/fofa_test.go` - httptest tests for FOFA sweep, credentials, cancellation
+- `pkg/recon/sources/netlas.go` - NetlasSource with X-API-Key header auth
+- `pkg/recon/sources/netlas_test.go` - httptest tests for Netlas sweep, credentials, cancellation
+- `pkg/recon/sources/binaryedge.go` - BinaryEdgeSource with X-Key header auth
+- `pkg/recon/sources/binaryedge_test.go` - httptest tests for BinaryEdge sweep, credentials, cancellation
+
+## Decisions Made
+- FOFA uses base64-encoded qbase64 query parameter (matching FOFA API spec) with email+key in query string
+- Netlas uses X-API-Key header; BinaryEdge uses X-Key header (matching their respective API specs)
+- All three use bare keyword queries via default formatQuery path (no source-specific query formatting needed)
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+None
+
+## Known Stubs
+None
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Three IoT scanner sources ready for RegisterAll wiring
+- FOFA requires email + API key; Netlas and BinaryEdge require API key only
+
+---
+*Phase: 12-osint_iot_cloud_storage*
+*Completed: 2026-04-06*
diff --git a/pkg/recon/sources/binaryedge.go b/pkg/recon/sources/binaryedge.go
new file mode 100644
index 0000000..5b9a3c5
--- /dev/null
+++ b/pkg/recon/sources/binaryedge.go
@@ -0,0 +1,147 @@
+package sources
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "net/http"
+ "net/url"
+ "strings"
+ "time"
+
+ "golang.org/x/time/rate"
+
+ "github.com/salvacybersec/keyhunter/pkg/providers"
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// BinaryEdgeSource implements recon.ReconSource against the BinaryEdge
+// internet data API. It iterates provider keyword queries and emits a Finding
+// per result event.
+//
+// A missing API key disables the source without error.
+type BinaryEdgeSource struct {
+	APIKey   string                 // BinaryEdge API key; empty disables the source
+	BaseURL  string                 // override for tests; defaults to https://api.binaryedge.io
+	Registry *providers.Registry    // provider keyword source used to build queries
+	Limiters *recon.LimiterRegistry // shared per-source rate limiters
+	client   *Client                // lazily initialized on first Sweep
+}
+
+// Compile-time assertion that *BinaryEdgeSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*BinaryEdgeSource)(nil)
+
+func (s *BinaryEdgeSource) Name() string          { return "binaryedge" }
+func (s *BinaryEdgeSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } // 1 request per 2s
+func (s *BinaryEdgeSource) Burst() int            { return 1 }
+func (s *BinaryEdgeSource) RespectsRobots() bool  { return false } // API endpoint; robots.txt not applicable
+
+// Enabled returns true only when APIKey is configured.
+func (s *BinaryEdgeSource) Enabled(_ recon.Config) bool { return s.APIKey != "" }
+
+// Sweep runs one BinaryEdge search per provider keyword, translating every
+// returned event into a low-confidence recon.Finding on out.
+func (s *BinaryEdgeSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	if s.APIKey == "" {
+		return nil // source disabled; nothing to do
+	}
+	if s.client == nil {
+		s.client = NewClient()
+	}
+	apiBase := s.BaseURL
+	if apiBase == "" {
+		apiBase = "https://api.binaryedge.io"
+	}
+
+	byKeyword := binaryedgeKeywordIndex(s.Registry)
+	for _, query := range BuildQueries(s.Registry, "binaryedge") {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false)
+			if err != nil {
+				return err
+			}
+		}
+
+		endpoint := fmt.Sprintf("%s/v2/query/search?query=%s&page=1",
+			apiBase, url.QueryEscape(query))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("binaryedge: build request: %w", err)
+		}
+		req.Header.Set("X-Key", s.APIKey)
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := s.client.Do(ctx, req)
+		switch {
+		case err == nil:
+			// fall through to decoding below
+		case errors.Is(err, ErrUnauthorized),
+			errors.Is(err, context.Canceled),
+			errors.Is(err, context.DeadlineExceeded):
+			return err // fatal: bad credentials or caller gave up
+		default:
+			continue // transient failure; move on to the next keyword
+		}
+
+		var payload binaryedgeSearchResponse
+		err = json.NewDecoder(resp.Body).Decode(&payload)
+		_ = resp.Body.Close()
+		if err != nil {
+			continue // unparseable body; skip this keyword
+		}
+
+		provider := byKeyword[strings.ToLower(query)]
+		for _, event := range payload.Events {
+			finding := recon.Finding{
+				ProviderName: provider,
+				Confidence:   "low",
+				Source:       fmt.Sprintf("binaryedge://%s:%d", event.Target.IP, event.Target.Port),
+				SourceType:   "recon:binaryedge",
+				DetectedAt:   time.Now(),
+			}
+			select {
+			case out <- finding:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
+
+type binaryedgeSearchResponse struct {
+	Events []binaryedgeEvent `json:"events"` // one entry per matched host/port
+}
+
+type binaryedgeEvent struct {
+	Target binaryedgeTarget `json:"target"` // the scanned endpoint this event describes
+}
+
+type binaryedgeTarget struct {
+	IP   string `json:"ip"`   // target IP address as reported by BinaryEdge
+	Port int    `json:"port"` // target port
+}
+
+// binaryedgeKeywordIndex builds a lookup from lowercased keyword to provider name.
+func binaryedgeKeywordIndex(reg *providers.Registry) map[string]string {
+	index := map[string]string{}
+	if reg == nil {
+		return index
+	}
+	for _, prov := range reg.List() {
+		for _, kw := range prov.Keywords {
+			key := strings.ToLower(strings.TrimSpace(kw))
+			if key == "" {
+				continue
+			}
+			if _, dup := index[key]; !dup {
+				index[key] = prov.Name
+			}
+		}
+	}
+	return index
+}
diff --git a/pkg/recon/sources/binaryedge_test.go b/pkg/recon/sources/binaryedge_test.go
new file mode 100644
index 0000000..e003a01
--- /dev/null
+++ b/pkg/recon/sources/binaryedge_test.go
@@ -0,0 +1,117 @@
+package sources
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func binaryedgeStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
+	t.Helper()
+	return func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(calls, 1)
+		if path := r.URL.Path; !strings.HasPrefix(path, "/v2/query/search") {
+			t.Errorf("unexpected path: %s", path)
+		}
+		if key := r.Header.Get("X-Key"); key != "testkey" {
+			t.Errorf("missing X-Key header: %q", key)
+		}
+		reply := binaryedgeSearchResponse{
+			Events: []binaryedgeEvent{
+				{Target: binaryedgeTarget{IP: "192.168.1.1", Port: 80}},
+				{Target: binaryedgeTarget{IP: "192.168.1.2", Port: 443}},
+			},
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_ = json.NewEncoder(w).Encode(reply)
+	}
+}
+
+func TestBinaryEdgeSource_EnabledRequiresAPIKey(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+
+	s := &BinaryEdgeSource{APIKey: "", Registry: reg, Limiters: lim} // no key: source disabled
+	if s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=false with empty key")
+	}
+	s = &BinaryEdgeSource{APIKey: "key", Registry: reg, Limiters: lim} // key present: source enabled
+	if !s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=true with key")
+	}
+}
+
+func TestBinaryEdgeSource_SweepEmitsFindings(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("binaryedge", 1000, 100)
+
+	var calls int32
+	srv := httptest.NewServer(binaryedgeStubHandler(t, &calls))
+	defer srv.Close()
+
+	src := &BinaryEdgeSource{
+		APIKey:   "testkey",
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Limiters: lim,
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	out := make(chan recon.Finding, 32)
+
+	errCh := make(chan error, 1)
+	go func() { errCh <- src.Sweep(ctx, "", out); close(out) }()
+
+	var collected []recon.Finding
+	for fnd := range out {
+		collected = append(collected, fnd)
+	}
+	if err := <-errCh; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	// 2 keywords * 2 events = 4 findings
+	if len(collected) != 4 {
+		t.Fatalf("expected 4 findings, got %d", len(collected))
+	}
+	for _, fnd := range collected {
+		if fnd.SourceType != "recon:binaryedge" {
+			t.Errorf("SourceType=%q want recon:binaryedge", fnd.SourceType)
+		}
+	}
+	if got := atomic.LoadInt32(&calls); got != 2 {
+		t.Errorf("expected 2 API calls, got %d", got)
+	}
+}
+
+func TestBinaryEdgeSource_CtxCancelled(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("binaryedge", 1000, 100)
+
+	s := &BinaryEdgeSource{
+		APIKey:   "key",
+		BaseURL:  "http://127.0.0.1:1", // unroutable on purpose; should never be dialed
+		Registry: reg,
+		Limiters: lim,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel before Sweep runs so it exits on its first ctx check
+
+	out := make(chan recon.Finding, 1)
+	err := s.Sweep(ctx, "", out)
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got %v", err)
+	}
+}
diff --git a/pkg/recon/sources/fofa.go b/pkg/recon/sources/fofa.go
new file mode 100644
index 0000000..2fec8d5
--- /dev/null
+++ b/pkg/recon/sources/fofa.go
@@ -0,0 +1,145 @@
+package sources
+
+import (
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// FOFASource implements recon.ReconSource against the FOFA internet search
+// engine API. It iterates provider keyword queries and emits a Finding per
+// result.
+//
+// A missing Email or API key disables the source without error.
+type FOFASource struct {
+	Email    string                 // FOFA account email; sent as a query parameter
+	APIKey   string                 // FOFA API key; paired with Email for auth
+	BaseURL  string                 // override for tests; defaults to https://fofa.info
+	Registry *providers.Registry    // provider keyword source used to build queries
+	Limiters *recon.LimiterRegistry // shared per-source rate limiters
+	client   *Client                // lazily initialized on first Sweep
+}
+
+// Compile-time assertion that *FOFASource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*FOFASource)(nil)
+
+func (s *FOFASource) Name() string          { return "fofa" }
+func (s *FOFASource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) } // 1 request per second
+func (s *FOFASource) Burst() int            { return 1 }
+func (s *FOFASource) RespectsRobots() bool  { return false } // API endpoint; robots.txt not applicable
+
+// Enabled returns true only when both Email and APIKey are configured.
+func (s *FOFASource) Enabled(_ recon.Config) bool { return s.Email != "" && s.APIKey != "" }
+
+// Sweep issues one FOFA search request per provider keyword and emits a
+// Finding for every result row.
+func (s *FOFASource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	if s.Email == "" || s.APIKey == "" {
+		return nil // source disabled without credentials
+	}
+	if s.client == nil {
+		s.client = NewClient()
+	}
+	base := s.BaseURL
+	if base == "" {
+		base = "https://fofa.info"
+	}
+
+	queries := BuildQueries(s.Registry, "fofa")
+	kwIndex := fofaKeywordIndex(s.Registry)
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		qb64 := base64.StdEncoding.EncodeToString([]byte(q)) // std alphabet: may contain '+', '/', '='
+		endpoint := fmt.Sprintf("%s/api/v1/search/all?email=%s&key=%s&qbase64=%s&size=100",
+			base, url.QueryEscape(s.Email), url.QueryEscape(s.APIKey), url.QueryEscape(qb64))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("fofa: build request: %w", err)
+		}
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := s.client.Do(ctx, req)
+		if err != nil {
+			if errors.Is(err, ErrUnauthorized) {
+				return err // credentials rejected; retrying other keywords is pointless
+			}
+			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+				return err
+			}
+			continue // transient failure; try the next keyword
+		}
+
+		var parsed fofaSearchResponse
+		decErr := json.NewDecoder(resp.Body).Decode(&parsed)
+		_ = resp.Body.Close()
+		if decErr != nil {
+			continue // unparseable body; skip this keyword
+		}
+
+		provName := kwIndex[strings.ToLower(q)]
+		for _, row := range parsed.Results {
+			// Each row is [host, ip, port].
+			if len(row) < 3 {
+				continue
+			}
+			f := recon.Finding{
+				ProviderName: provName,
+				Confidence:   "low",
+				Source:       fmt.Sprintf("fofa://%s:%s", row[1], row[2]),
+				SourceType:   "recon:fofa",
+				DetectedAt:   time.Now(),
+			}
+			select {
+			case out <- f:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
+
+type fofaSearchResponse struct {
+	Results [][]string `json:"results"` // rows of [host, ip, port] per the FOFA response
+	Size    int        `json:"size"`    // total result count reported by FOFA
+}
+
+// fofaKeywordIndex builds a lookup from lowercased keyword to provider name.
+func fofaKeywordIndex(reg *providers.Registry) map[string]string {
+	index := map[string]string{}
+	if reg == nil {
+		return index
+	}
+	for _, prov := range reg.List() {
+		for _, kw := range prov.Keywords {
+			key := strings.ToLower(strings.TrimSpace(kw))
+			if key == "" {
+				continue
+			}
+			if _, dup := index[key]; !dup {
+				index[key] = prov.Name
+			}
+		}
+	}
+	return index
+}
diff --git a/pkg/recon/sources/fofa_test.go b/pkg/recon/sources/fofa_test.go
new file mode 100644
index 0000000..e17497d
--- /dev/null
+++ b/pkg/recon/sources/fofa_test.go
@@ -0,0 +1,130 @@
+package sources
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "net/http"
+ "net/http/httptest"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func fofaStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
+	t.Helper()
+	return func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(calls, 1)
+		if path := r.URL.Path; path != "/api/v1/search/all" {
+			t.Errorf("unexpected path: %s", path)
+		}
+		if email := r.URL.Query().Get("email"); email != "test@example.com" {
+			t.Errorf("missing email param: %q", email)
+		}
+		if key := r.URL.Query().Get("key"); key != "testkey" {
+			t.Errorf("missing key param: %q", key)
+		}
+		reply := fofaSearchResponse{
+			Size: 2,
+			Results: [][]string{
+				{"example.com", "1.2.3.4", "443"},
+				{"test.org", "5.6.7.8", "8080"},
+			},
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_ = json.NewEncoder(w).Encode(reply)
+	}
+}
+
+func TestFOFASource_EnabledRequiresCredentials(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+
+	s := &FOFASource{Email: "", APIKey: "", Registry: reg, Limiters: lim} // neither credential set
+	if s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=false with empty credentials")
+	}
+	s = &FOFASource{Email: "a@b.com", APIKey: "", Registry: reg, Limiters: lim} // email only
+	if s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=false with empty APIKey")
+	}
+	s = &FOFASource{Email: "", APIKey: "key", Registry: reg, Limiters: lim} // key only
+	if s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=false with empty Email")
+	}
+	s = &FOFASource{Email: "a@b.com", APIKey: "key", Registry: reg, Limiters: lim} // both present
+	if !s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=true with both credentials")
+	}
+}
+
+func TestFOFASource_SweepEmitsFindings(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("fofa", 1000, 100)
+
+	var calls int32
+	srv := httptest.NewServer(fofaStubHandler(t, &calls))
+	defer srv.Close()
+
+	src := &FOFASource{
+		Email:    "test@example.com",
+		APIKey:   "testkey",
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Limiters: lim,
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	out := make(chan recon.Finding, 32)
+
+	errCh := make(chan error, 1)
+	go func() { errCh <- src.Sweep(ctx, "", out); close(out) }()
+
+	var collected []recon.Finding
+	for fnd := range out {
+		collected = append(collected, fnd)
+	}
+	if err := <-errCh; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	// 2 keywords * 2 results = 4 findings
+	if len(collected) != 4 {
+		t.Fatalf("expected 4 findings, got %d", len(collected))
+	}
+	for _, fnd := range collected {
+		if fnd.SourceType != "recon:fofa" {
+			t.Errorf("SourceType=%q want recon:fofa", fnd.SourceType)
+		}
+	}
+	if got := atomic.LoadInt32(&calls); got != 2 {
+		t.Errorf("expected 2 API calls, got %d", got)
+	}
+}
+
+func TestFOFASource_CtxCancelled(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("fofa", 1000, 100)
+
+	s := &FOFASource{
+		Email:    "a@b.com",
+		APIKey:   "key",
+		BaseURL:  "http://127.0.0.1:1", // unroutable on purpose; should never be dialed
+		Registry: reg,
+		Limiters: lim,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel before Sweep runs so it exits on its first ctx check
+
+	out := make(chan recon.Finding, 1)
+	err := s.Sweep(ctx, "", out)
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got %v", err)
+	}
+}
diff --git a/pkg/recon/sources/netlas.go b/pkg/recon/sources/netlas.go
new file mode 100644
index 0000000..017dd1f
--- /dev/null
+++ b/pkg/recon/sources/netlas.go
@@ -0,0 +1,147 @@
+package sources
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "net/http"
+ "net/url"
+ "strings"
+ "time"
+
+ "golang.org/x/time/rate"
+
+ "github.com/salvacybersec/keyhunter/pkg/providers"
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// NetlasSource implements recon.ReconSource against the Netlas internet
+// intelligence API. It iterates provider keyword queries and emits a Finding
+// per result item.
+//
+// A missing API key disables the source without error.
+type NetlasSource struct {
+	APIKey   string                 // Netlas API key; empty disables the source
+	BaseURL  string                 // override for tests; defaults to https://app.netlas.io
+	Registry *providers.Registry    // provider keyword source used to build queries
+	Limiters *recon.LimiterRegistry // shared per-source rate limiters
+	client   *Client                // lazily initialized on first Sweep
+}
+
+// Compile-time assertion that *NetlasSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*NetlasSource)(nil)
+
+func (s *NetlasSource) Name() string          { return "netlas" }
+func (s *NetlasSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) } // 1 request per second
+func (s *NetlasSource) Burst() int            { return 1 }
+func (s *NetlasSource) RespectsRobots() bool  { return false } // API endpoint; robots.txt not applicable
+
+// Enabled returns true only when APIKey is configured.
+func (s *NetlasSource) Enabled(_ recon.Config) bool { return s.APIKey != "" }
+
+// Sweep runs one Netlas search per provider keyword, translating every
+// returned item into a low-confidence recon.Finding on out.
+func (s *NetlasSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	if s.APIKey == "" {
+		return nil // source disabled; nothing to do
+	}
+	if s.client == nil {
+		s.client = NewClient()
+	}
+	apiBase := s.BaseURL
+	if apiBase == "" {
+		apiBase = "https://app.netlas.io"
+	}
+
+	byKeyword := netlasKeywordIndex(s.Registry)
+	for _, query := range BuildQueries(s.Registry, "netlas") {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false)
+			if err != nil {
+				return err
+			}
+		}
+
+		endpoint := fmt.Sprintf("%s/api/responses/?q=%s&start=0&indices=",
+			apiBase, url.QueryEscape(query))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("netlas: build request: %w", err)
+		}
+		req.Header.Set("X-API-Key", s.APIKey)
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := s.client.Do(ctx, req)
+		switch {
+		case err == nil:
+			// fall through to decoding below
+		case errors.Is(err, ErrUnauthorized),
+			errors.Is(err, context.Canceled),
+			errors.Is(err, context.DeadlineExceeded):
+			return err // fatal: bad credentials or caller gave up
+		default:
+			continue // transient failure; move on to the next keyword
+		}
+
+		var payload netlasSearchResponse
+		err = json.NewDecoder(resp.Body).Decode(&payload)
+		_ = resp.Body.Close()
+		if err != nil {
+			continue // unparseable body; skip this keyword
+		}
+
+		provider := byKeyword[strings.ToLower(query)]
+		for _, item := range payload.Items {
+			finding := recon.Finding{
+				ProviderName: provider,
+				Confidence:   "low",
+				Source:       fmt.Sprintf("netlas://%s:%d", item.Data.IP, item.Data.Port),
+				SourceType:   "recon:netlas",
+				DetectedAt:   time.Now(),
+			}
+			select {
+			case out <- finding:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
+
+type netlasSearchResponse struct {
+	Items []netlasItem `json:"items"` // one entry per matched response document
+}
+
+type netlasItem struct {
+	Data netlasData `json:"data"` // the response payload for this item
+}
+
+type netlasData struct {
+	IP   string `json:"ip"`   // target IP address as reported by Netlas
+	Port int    `json:"port"` // target port
+}
+
+// netlasKeywordIndex builds a lookup from lowercased keyword to provider name.
+func netlasKeywordIndex(reg *providers.Registry) map[string]string {
+	index := map[string]string{}
+	if reg == nil {
+		return index
+	}
+	for _, prov := range reg.List() {
+		for _, kw := range prov.Keywords {
+			key := strings.ToLower(strings.TrimSpace(kw))
+			if key == "" {
+				continue
+			}
+			if _, dup := index[key]; !dup {
+				index[key] = prov.Name
+			}
+		}
+	}
+	return index
+}
diff --git a/pkg/recon/sources/netlas_test.go b/pkg/recon/sources/netlas_test.go
new file mode 100644
index 0000000..ddc337a
--- /dev/null
+++ b/pkg/recon/sources/netlas_test.go
@@ -0,0 +1,117 @@
+package sources
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func netlasStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
+	t.Helper()
+	return func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(calls, 1)
+		if path := r.URL.Path; !strings.HasPrefix(path, "/api/responses/") {
+			t.Errorf("unexpected path: %s", path)
+		}
+		if key := r.Header.Get("X-API-Key"); key != "testkey" {
+			t.Errorf("missing X-API-Key header: %q", key)
+		}
+		reply := netlasSearchResponse{
+			Items: []netlasItem{
+				{Data: netlasData{IP: "10.0.0.1", Port: 443}},
+				{Data: netlasData{IP: "10.0.0.2", Port: 8443}},
+			},
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_ = json.NewEncoder(w).Encode(reply)
+	}
+}
+
+func TestNetlasSource_EnabledRequiresAPIKey(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+
+	s := &NetlasSource{APIKey: "", Registry: reg, Limiters: lim} // no key: source disabled
+	if s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=false with empty key")
+	}
+	s = &NetlasSource{APIKey: "key", Registry: reg, Limiters: lim} // key present: source enabled
+	if !s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=true with key")
+	}
+}
+
+func TestNetlasSource_SweepEmitsFindings(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("netlas", 1000, 100)
+
+	var calls int32
+	srv := httptest.NewServer(netlasStubHandler(t, &calls))
+	defer srv.Close()
+
+	src := &NetlasSource{
+		APIKey:   "testkey",
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Limiters: lim,
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	out := make(chan recon.Finding, 32)
+
+	errCh := make(chan error, 1)
+	go func() { errCh <- src.Sweep(ctx, "", out); close(out) }()
+
+	var collected []recon.Finding
+	for fnd := range out {
+		collected = append(collected, fnd)
+	}
+	if err := <-errCh; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	// 2 keywords * 2 items = 4 findings
+	if len(collected) != 4 {
+		t.Fatalf("expected 4 findings, got %d", len(collected))
+	}
+	for _, fnd := range collected {
+		if fnd.SourceType != "recon:netlas" {
+			t.Errorf("SourceType=%q want recon:netlas", fnd.SourceType)
+		}
+	}
+	if got := atomic.LoadInt32(&calls); got != 2 {
+		t.Errorf("expected 2 API calls, got %d", got)
+	}
+}
+
+func TestNetlasSource_CtxCancelled(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("netlas", 1000, 100)
+
+	s := &NetlasSource{
+		APIKey:   "key",
+		BaseURL:  "http://127.0.0.1:1", // unroutable on purpose; should never be dialed
+		Registry: reg,
+		Limiters: lim,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel before Sweep runs so it exits on its first ctx check
+
+	out := make(chan recon.Finding, 1)
+	err := s.Sweep(ctx, "", out)
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got %v", err)
+	}
+}