From 850c3ff8e98020703cff1356786e0277a7a51138 Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Sun, 5 Apr 2026 15:18:23 +0300 Subject: [PATCH] feat(04-04): add StdinSource, URLSource, and ClipboardSource - StdinSource reads from an injectable io.Reader (INPUT-03) - URLSource fetches http/https with 30s timeout, 50MB cap, scheme whitelist, and Content-Type filter (INPUT-04) - ClipboardSource wraps atotto/clipboard with graceful fallback for missing tooling (INPUT-05) - emitByteChunks local helper mirrors file.go windowing to stay independent of sibling wave-1 plans - Tests cover happy path, cancellation, redirects, oversize bodies, binary content types, scheme rejection, and clipboard error paths --- pkg/engine/sources/clipboard.go | 45 +++++++++ pkg/engine/sources/clipboard_test.go | 54 +++++++++++ pkg/engine/sources/stdin.go | 85 +++++++++++++++++ pkg/engine/sources/stdin_test.go | 50 ++++++++++ pkg/engine/sources/url.go | 135 +++++++++++++++++++++++++++ pkg/engine/sources/url_test.go | 102 ++++++++++++++++++++ 6 files changed, 471 insertions(+) create mode 100644 pkg/engine/sources/clipboard.go create mode 100644 pkg/engine/sources/clipboard_test.go create mode 100644 pkg/engine/sources/stdin.go create mode 100644 pkg/engine/sources/stdin_test.go create mode 100644 pkg/engine/sources/url.go create mode 100644 pkg/engine/sources/url_test.go diff --git a/pkg/engine/sources/clipboard.go b/pkg/engine/sources/clipboard.go new file mode 100644 index 0000000..97f1a31 --- /dev/null +++ b/pkg/engine/sources/clipboard.go @@ -0,0 +1,45 @@ +package sources + +import ( + "context" + "errors" + "fmt" + + "github.com/atotto/clipboard" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// ClipboardSource reads the current OS clipboard contents and emits them +// as a single chunk stream with Source="clipboard". Requires xclip/xsel/ +// wl-clipboard on Linux, pbpaste on macOS, or native API on Windows. +type ClipboardSource struct { + // Reader overrides the clipboard reader; when nil the real clipboard is used. + // Tests inject a func returning a fixture. + Reader func() (string, error) + ChunkSize int +} + +// NewClipboardSource returns a ClipboardSource bound to the real OS clipboard. +func NewClipboardSource() *ClipboardSource { + return &ClipboardSource{Reader: clipboard.ReadAll, ChunkSize: defaultChunkSize} +} + +// Chunks reads the clipboard and emits its contents. +func (c *ClipboardSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + if clipboard.Unsupported && c.Reader == nil { + return errors.New("ClipboardSource: clipboard tooling unavailable (install xclip/xsel/wl-clipboard on Linux)") + } + reader := c.Reader + if reader == nil { + reader = clipboard.ReadAll + } + text, err := reader() + if err != nil { + return fmt.Errorf("ClipboardSource: read: %w", err) + } + if text == "" { + return nil + } + return emitByteChunks(ctx, []byte(text), "clipboard", c.ChunkSize, out) +} diff --git a/pkg/engine/sources/clipboard_test.go b/pkg/engine/sources/clipboard_test.go new file mode 100644 index 0000000..15a957b --- /dev/null +++ b/pkg/engine/sources/clipboard_test.go @@ -0,0 +1,54 @@ +package sources + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func TestClipboardSource_FixtureReader(t *testing.T) { + src := &ClipboardSource{ + Reader: func() (string, error) { return "sk-live-xxxxxx", nil }, + ChunkSize: defaultChunkSize, + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + out := make(chan types.Chunk, 4) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + + var got []types.Chunk + for c := range out { + got = append(got, c) + } + require.NoError(t, <-errCh) + require.Len(t, got, 1) + require.Equal(t, "clipboard", got[0].Source) + require.Equal(t, "sk-live-xxxxxx", string(got[0].Data)) +} + +func TestClipboardSource_ReaderError(t *testing.T) { + src := &ClipboardSource{ + Reader: func() (string, error) { return "", errors.New("no xclip installed") }, + } + out := make(chan types.Chunk, 1) + err := src.Chunks(context.Background(), out) + require.Error(t, err) + require.Contains(t, strings.ToLower(err.Error()), "clipboard") +} + +func TestClipboardSource_EmptyClipboard(t *testing.T) { + src := &ClipboardSource{ + Reader: func() (string, error) { return "", nil }, + } + out := make(chan types.Chunk, 1) + err := src.Chunks(context.Background(), out) + require.NoError(t, err) + require.Len(t, out, 0) +} diff --git a/pkg/engine/sources/stdin.go b/pkg/engine/sources/stdin.go new file mode 100644 index 0000000..b05fcbd --- /dev/null +++ b/pkg/engine/sources/stdin.go @@ -0,0 +1,85 @@ +package sources + +import ( + "context" + "io" + "os" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// StdinSource reads content from an io.Reader (defaults to os.Stdin) and +// emits overlapping chunks. Used when a user runs `keyhunter scan stdin` +// or `keyhunter scan -`. +type StdinSource struct { + Reader io.Reader + ChunkSize int +} + +// NewStdinSource returns a StdinSource bound to os.Stdin. +func NewStdinSource() *StdinSource { + return &StdinSource{Reader: os.Stdin, ChunkSize: defaultChunkSize} +} + +// NewStdinSourceFrom returns a StdinSource bound to the given reader +// (used primarily by tests). +func NewStdinSourceFrom(r io.Reader) *StdinSource { + return &StdinSource{Reader: r, ChunkSize: defaultChunkSize} +} + +// Chunks reads the entire input, then emits it as overlapping chunks. +func (s *StdinSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + if s.Reader == nil { + s.Reader = os.Stdin + } + data, err := io.ReadAll(s.Reader) + if err != nil { + return err + } + if len(data) == 0 { + return nil + } + return emitByteChunks(ctx, data, "stdin", s.ChunkSize, out) +} + +// emitByteChunks emits overlapping chunks from an in-memory byte slice. +// Local helper for stdin/url/clipboard sources; mirrors the chunk-windowing +// logic in file.go so this plan does not depend on sibling plans' helpers. +func emitByteChunks(ctx context.Context, data []byte, source string, size int, out chan<- types.Chunk) error { + if size <= 0 { + size = defaultChunkSize + } + if len(data) == 0 { + return nil + } + if len(data) <= size { + select { + case <-ctx.Done(): + return ctx.Err() + case out <- types.Chunk{Data: data, Source: source, Offset: 0}: + } + return nil + } + var offset int64 + for start := 0; start < len(data); start += size - chunkOverlap { + end := start + size + if end > len(data) { + end = len(data) + } + chunk := types.Chunk{ + Data: data[start:end], + Source: source, + Offset: offset, + } + select { + case <-ctx.Done(): + return ctx.Err() + case out <- chunk: + } + offset += int64(end - start) + if end == len(data) { + break + } + } + return nil +} diff --git a/pkg/engine/sources/stdin_test.go b/pkg/engine/sources/stdin_test.go new file mode 100644 index 0000000..290e31b --- /dev/null +++ b/pkg/engine/sources/stdin_test.go @@ -0,0 +1,50 @@ +package sources + +import ( + "bytes" + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func TestStdinSource_Basic(t *testing.T) { + src := NewStdinSourceFrom(bytes.NewBufferString("API_KEY=sk-test-xyz")) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + out := make(chan types.Chunk, 8) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + + var got []types.Chunk + for c := range out { + got = append(got, c) + } + require.NoError(t, <-errCh) + require.Len(t, got, 1) + require.Equal(t, "stdin", got[0].Source) + require.Equal(t, "API_KEY=sk-test-xyz", string(got[0].Data)) +} + +func TestStdinSource_Empty(t *testing.T) { + src := NewStdinSourceFrom(bytes.NewBuffer(nil)) + out := make(chan types.Chunk, 1) + err := src.Chunks(context.Background(), out) + close(out) + require.NoError(t, err) + require.Len(t, out, 0) +} + +func TestStdinSource_CtxCancel(t *testing.T) { + // Large buffer so emitByteChunks iterates and can observe cancellation. + data := make([]byte, 1<<20) + src := NewStdinSourceFrom(bytes.NewReader(data)) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + out := make(chan types.Chunk) // unbuffered forces select on ctx + err := src.Chunks(ctx, out) + require.ErrorIs(t, err, context.Canceled) +} diff --git a/pkg/engine/sources/url.go b/pkg/engine/sources/url.go new file mode 100644 index 0000000..bd4c69d --- /dev/null +++ b/pkg/engine/sources/url.go @@ -0,0 +1,135 @@ +package sources + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +// MaxURLContentLength is the hard cap on URLSource response bodies. +const MaxURLContentLength int64 = 50 * 1024 * 1024 // 50 MB + +// DefaultURLTimeout is the overall request timeout (connect + read + body). +const DefaultURLTimeout = 30 * time.Second + +// allowedContentTypes is the whitelist of Content-Type prefixes URLSource +// will accept. Binary types (images, archives, executables) are rejected. +var allowedContentTypes = []string{ + "text/", + "application/json", + "application/javascript", + "application/xml", + "application/x-yaml", + "application/yaml", +} + +// URLSource fetches a remote resource over HTTP(S) and emits its body as chunks. +type URLSource struct { + URL string + Client *http.Client + UserAgent string + Insecure bool // skip TLS verification (default false) + ChunkSize int +} + +// NewURLSource creates a URLSource with sane defaults. +func NewURLSource(rawURL string) *URLSource { + return &URLSource{ + URL: rawURL, + Client: defaultHTTPClient(), + UserAgent: "keyhunter/dev", + ChunkSize: defaultChunkSize, + } +} + +func defaultHTTPClient() *http.Client { + return &http.Client{ + Timeout: DefaultURLTimeout, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + if len(via) >= 5 { + return errors.New("stopped after 5 redirects") + } + return nil + }, + } +} + +// Chunks validates the URL, issues a GET, and emits the response body as chunks. +func (u *URLSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { + parsed, err := url.Parse(u.URL) + if err != nil { + return fmt.Errorf("URLSource: parse %q: %w", u.URL, err) + } + if parsed.Scheme != "http" && parsed.Scheme != "https" { + return fmt.Errorf("URLSource: unsupported scheme %q (only http/https)", parsed.Scheme) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.URL, nil) + if err != nil { + return fmt.Errorf("URLSource: new request: %w", err) + } + req.Header.Set("User-Agent", u.UserAgent) + + client := u.Client + if client == nil { + client = defaultHTTPClient() + } + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("URLSource: fetch: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("URLSource: non-2xx status %d from %s", resp.StatusCode, u.URL) + } + + ct := resp.Header.Get("Content-Type") + if !isAllowedContentType(ct) { + return fmt.Errorf("URLSource: disallowed Content-Type %q", ct) + } + + if resp.ContentLength > MaxURLContentLength { + return fmt.Errorf("URLSource: Content-Length %d exceeds cap %d", resp.ContentLength, MaxURLContentLength) + } + + // LimitReader cap + 1 to detect overflow even if ContentLength was missing/wrong. + limited := io.LimitReader(resp.Body, MaxURLContentLength+1) + data, err := io.ReadAll(limited) + if err != nil { + return fmt.Errorf("URLSource: read body: %w", err) + } + if int64(len(data)) > MaxURLContentLength { + return fmt.Errorf("URLSource: body exceeds %d bytes", MaxURLContentLength) + } + if len(data) == 0 { + return nil + } + + source := "url:" + u.URL + return emitByteChunks(ctx, data, source, u.ChunkSize, out) +} + +func isAllowedContentType(ct string) bool { + if ct == "" { + return true // some servers omit; trust and scan + } + // Strip parameters like "; charset=utf-8". + if idx := strings.Index(ct, ";"); idx >= 0 { + ct = ct[:idx] + } + ct = strings.TrimSpace(strings.ToLower(ct)) + for _, prefix := range allowedContentTypes { + if strings.HasPrefix(ct, prefix) { + return true + } + } + return false +} diff --git a/pkg/engine/sources/url_test.go b/pkg/engine/sources/url_test.go new file mode 100644 index 0000000..fed3b73 --- /dev/null +++ b/pkg/engine/sources/url_test.go @@ -0,0 +1,102 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/salvacybersec/keyhunter/pkg/types" +) + +func drainURL(t *testing.T, src Source) ([]types.Chunk, error) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + out := make(chan types.Chunk, 65536) + errCh := make(chan error, 1) + go func() { errCh <- src.Chunks(ctx, out); close(out) }() + var got []types.Chunk + for c := range out { + got = append(got, c) + } + return got, <-errCh +} + +func TestURLSource_Fetches(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte("API_KEY=sk-live-xyz")) + })) + defer srv.Close() + + chunks, err := drainURL(t, NewURLSource(srv.URL)) + require.NoError(t, err) + require.Len(t, chunks, 1) + require.Equal(t, "url:"+srv.URL, chunks[0].Source) + require.Equal(t, "API_KEY=sk-live-xyz", string(chunks[0].Data)) +} + +func TestURLSource_RejectsBinaryContentType(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "image/png") + _, _ = w.Write([]byte{0x89, 0x50, 0x4e, 0x47}) + })) + defer srv.Close() + + _, err := drainURL(t, NewURLSource(srv.URL)) + require.Error(t, err) + require.Contains(t, err.Error(), "Content-Type") +} + +func TestURLSource_RejectsNonHTTPScheme(t *testing.T) { + _, err := drainURL(t, NewURLSource("file:///etc/passwd")) + require.Error(t, err) + require.Contains(t, err.Error(), "unsupported scheme") +} + +func TestURLSource_Rejects500(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer srv.Close() + + _, err := drainURL(t, NewURLSource(srv.URL)) + require.Error(t, err) + require.Contains(t, err.Error(), "500") +} + +func TestURLSource_RejectsOversizeBody(t *testing.T) { + // Serve body just over the cap. + big := strings.Repeat("a", int(MaxURLContentLength)+10) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(big)) + })) + defer srv.Close() + + _, err := drainURL(t, NewURLSource(srv.URL)) + require.Error(t, err) +} + +func TestURLSource_FollowsRedirect(t *testing.T) { + target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte("redirected body")) + })) + defer target.Close() + + redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, target.URL, http.StatusMovedPermanently) + })) + defer redirector.Close() + + chunks, err := drainURL(t, NewURLSource(redirector.URL)) + require.NoError(t, err) + require.NotEmpty(t, chunks) + require.Contains(t, string(chunks[0].Data), "redirected body") +}