feat(04-04): add StdinSource, URLSource, and ClipboardSource

- StdinSource reads from an injectable io.Reader (INPUT-03)
- URLSource fetches http/https with 30s timeout, 50MB cap, scheme whitelist, and Content-Type filter (INPUT-04)
- ClipboardSource wraps atotto/clipboard with graceful fallback for missing tooling (INPUT-05)
- emitByteChunks local helper mirrors file.go windowing to stay independent of sibling wave-1 plans
- Tests cover happy path, cancellation, redirects, oversize bodies, binary content types, scheme rejection, and clipboard error paths
This commit is contained in:
salvacybersec
2026-04-05 15:18:23 +03:00
parent 6f834c9c06
commit 850c3ff8e9
6 changed files with 471 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
package sources
import (
"context"
"errors"
"fmt"
"github.com/atotto/clipboard"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// ClipboardSource reads the current OS clipboard contents and emits them
// as a single chunk stream with Source="clipboard". Requires xclip/xsel/
// wl-clipboard on Linux, pbpaste on macOS, or native API on Windows.
type ClipboardSource struct {
// Reader overrides the clipboard reader; when nil the real clipboard is used.
// Tests inject a func returning a fixture.
Reader func() (string, error)
ChunkSize int
}
// NewClipboardSource returns a ClipboardSource bound to the real OS clipboard.
func NewClipboardSource() *ClipboardSource {
return &ClipboardSource{Reader: clipboard.ReadAll, ChunkSize: defaultChunkSize}
}
// Chunks reads the clipboard and emits its contents.
func (c *ClipboardSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
if clipboard.Unsupported && c.Reader == nil {
return errors.New("ClipboardSource: clipboard tooling unavailable (install xclip/xsel/wl-clipboard on Linux)")
}
reader := c.Reader
if reader == nil {
reader = clipboard.ReadAll
}
text, err := reader()
if err != nil {
return fmt.Errorf("ClipboardSource: read: %w", err)
}
if text == "" {
return nil
}
return emitByteChunks(ctx, []byte(text), "clipboard", c.ChunkSize, out)
}

View File

@@ -0,0 +1,54 @@
package sources
import (
"context"
"errors"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/salvacybersec/keyhunter/pkg/types"
)
func TestClipboardSource_FixtureReader(t *testing.T) {
src := &ClipboardSource{
Reader: func() (string, error) { return "sk-live-xxxxxx", nil },
ChunkSize: defaultChunkSize,
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
out := make(chan types.Chunk, 4)
errCh := make(chan error, 1)
go func() { errCh <- src.Chunks(ctx, out); close(out) }()
var got []types.Chunk
for c := range out {
got = append(got, c)
}
require.NoError(t, <-errCh)
require.Len(t, got, 1)
require.Equal(t, "clipboard", got[0].Source)
require.Equal(t, "sk-live-xxxxxx", string(got[0].Data))
}
func TestClipboardSource_ReaderError(t *testing.T) {
src := &ClipboardSource{
Reader: func() (string, error) { return "", errors.New("no xclip installed") },
}
out := make(chan types.Chunk, 1)
err := src.Chunks(context.Background(), out)
require.Error(t, err)
require.Contains(t, strings.ToLower(err.Error()), "clipboard")
}
func TestClipboardSource_EmptyClipboard(t *testing.T) {
src := &ClipboardSource{
Reader: func() (string, error) { return "", nil },
}
out := make(chan types.Chunk, 1)
err := src.Chunks(context.Background(), out)
require.NoError(t, err)
require.Len(t, out, 0)
}

View File

@@ -0,0 +1,85 @@
package sources
import (
"context"
"io"
"os"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// StdinSource reads content from an io.Reader (defaults to os.Stdin) and
// emits overlapping chunks. Used when a user runs `keyhunter scan stdin`
// or `keyhunter scan -`.
type StdinSource struct {
Reader io.Reader
ChunkSize int
}
// NewStdinSource returns a StdinSource bound to os.Stdin.
func NewStdinSource() *StdinSource {
return &StdinSource{Reader: os.Stdin, ChunkSize: defaultChunkSize}
}
// NewStdinSourceFrom returns a StdinSource bound to the given reader
// (used primarily by tests).
func NewStdinSourceFrom(r io.Reader) *StdinSource {
return &StdinSource{Reader: r, ChunkSize: defaultChunkSize}
}
// Chunks reads the entire input, then emits it as overlapping chunks.
func (s *StdinSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
if s.Reader == nil {
s.Reader = os.Stdin
}
data, err := io.ReadAll(s.Reader)
if err != nil {
return err
}
if len(data) == 0 {
return nil
}
return emitByteChunks(ctx, data, "stdin", s.ChunkSize, out)
}
// emitByteChunks emits overlapping chunks from an in-memory byte slice.
// Local helper for stdin/url/clipboard sources; mirrors the chunk-windowing
// logic in file.go so this plan does not depend on sibling plans' helpers.
func emitByteChunks(ctx context.Context, data []byte, source string, size int, out chan<- types.Chunk) error {
if size <= 0 {
size = defaultChunkSize
}
if len(data) == 0 {
return nil
}
if len(data) <= size {
select {
case <-ctx.Done():
return ctx.Err()
case out <- types.Chunk{Data: data, Source: source, Offset: 0}:
}
return nil
}
var offset int64
for start := 0; start < len(data); start += size - chunkOverlap {
end := start + size
if end > len(data) {
end = len(data)
}
chunk := types.Chunk{
Data: data[start:end],
Source: source,
Offset: offset,
}
select {
case <-ctx.Done():
return ctx.Err()
case out <- chunk:
}
offset += int64(end - start)
if end == len(data) {
break
}
}
return nil
}

View File

@@ -0,0 +1,50 @@
package sources
import (
"bytes"
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/salvacybersec/keyhunter/pkg/types"
)
func TestStdinSource_Basic(t *testing.T) {
src := NewStdinSourceFrom(bytes.NewBufferString("API_KEY=sk-test-xyz"))
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
out := make(chan types.Chunk, 8)
errCh := make(chan error, 1)
go func() { errCh <- src.Chunks(ctx, out); close(out) }()
var got []types.Chunk
for c := range out {
got = append(got, c)
}
require.NoError(t, <-errCh)
require.Len(t, got, 1)
require.Equal(t, "stdin", got[0].Source)
require.Equal(t, "API_KEY=sk-test-xyz", string(got[0].Data))
}
func TestStdinSource_Empty(t *testing.T) {
src := NewStdinSourceFrom(bytes.NewBuffer(nil))
out := make(chan types.Chunk, 1)
err := src.Chunks(context.Background(), out)
close(out)
require.NoError(t, err)
require.Len(t, out, 0)
}
func TestStdinSource_CtxCancel(t *testing.T) {
// Large buffer so emitByteChunks iterates and can observe cancellation.
data := make([]byte, 1<<20)
src := NewStdinSourceFrom(bytes.NewReader(data))
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan types.Chunk) // unbuffered forces select on ctx
err := src.Chunks(ctx, out)
require.ErrorIs(t, err, context.Canceled)
}

135
pkg/engine/sources/url.go Normal file
View File

@@ -0,0 +1,135 @@
package sources
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// MaxURLContentLength is the hard cap on URLSource response bodies.
const MaxURLContentLength int64 = 50 * 1024 * 1024 // 50 MB
// DefaultURLTimeout is the overall request timeout (connect + read + body).
const DefaultURLTimeout = 30 * time.Second
// allowedContentTypes is the whitelist of Content-Type prefixes URLSource
// will accept. Binary types (images, archives, executables) are rejected.
var allowedContentTypes = []string{
"text/",
"application/json",
"application/javascript",
"application/xml",
"application/x-yaml",
"application/yaml",
}
// URLSource fetches a remote resource over HTTP(S) and emits its body as chunks.
type URLSource struct {
URL string
Client *http.Client
UserAgent string
Insecure bool // skip TLS verification (default false)
ChunkSize int
}
// NewURLSource creates a URLSource with sane defaults.
func NewURLSource(rawURL string) *URLSource {
return &URLSource{
URL: rawURL,
Client: defaultHTTPClient(),
UserAgent: "keyhunter/dev",
ChunkSize: defaultChunkSize,
}
}
func defaultHTTPClient() *http.Client {
return &http.Client{
Timeout: DefaultURLTimeout,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 5 {
return errors.New("stopped after 5 redirects")
}
return nil
},
}
}
// Chunks validates the URL, issues a GET, and emits the response body as chunks.
func (u *URLSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
parsed, err := url.Parse(u.URL)
if err != nil {
return fmt.Errorf("URLSource: parse %q: %w", u.URL, err)
}
if parsed.Scheme != "http" && parsed.Scheme != "https" {
return fmt.Errorf("URLSource: unsupported scheme %q (only http/https)", parsed.Scheme)
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.URL, nil)
if err != nil {
return fmt.Errorf("URLSource: new request: %w", err)
}
req.Header.Set("User-Agent", u.UserAgent)
client := u.Client
if client == nil {
client = defaultHTTPClient()
}
resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("URLSource: fetch: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return fmt.Errorf("URLSource: non-2xx status %d from %s", resp.StatusCode, u.URL)
}
ct := resp.Header.Get("Content-Type")
if !isAllowedContentType(ct) {
return fmt.Errorf("URLSource: disallowed Content-Type %q", ct)
}
if resp.ContentLength > MaxURLContentLength {
return fmt.Errorf("URLSource: Content-Length %d exceeds cap %d", resp.ContentLength, MaxURLContentLength)
}
// LimitReader cap + 1 to detect overflow even if ContentLength was missing/wrong.
limited := io.LimitReader(resp.Body, MaxURLContentLength+1)
data, err := io.ReadAll(limited)
if err != nil {
return fmt.Errorf("URLSource: read body: %w", err)
}
if int64(len(data)) > MaxURLContentLength {
return fmt.Errorf("URLSource: body exceeds %d bytes", MaxURLContentLength)
}
if len(data) == 0 {
return nil
}
source := "url:" + u.URL
return emitByteChunks(ctx, data, source, u.ChunkSize, out)
}
func isAllowedContentType(ct string) bool {
if ct == "" {
return true // some servers omit; trust and scan
}
// Strip parameters like "; charset=utf-8".
if idx := strings.Index(ct, ";"); idx >= 0 {
ct = ct[:idx]
}
ct = strings.TrimSpace(strings.ToLower(ct))
for _, prefix := range allowedContentTypes {
if strings.HasPrefix(ct, prefix) {
return true
}
}
return false
}

View File

@@ -0,0 +1,102 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/salvacybersec/keyhunter/pkg/types"
)
func drainURL(t *testing.T, src Source) ([]types.Chunk, error) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
out := make(chan types.Chunk, 65536)
errCh := make(chan error, 1)
go func() { errCh <- src.Chunks(ctx, out); close(out) }()
var got []types.Chunk
for c := range out {
got = append(got, c)
}
return got, <-errCh
}
func TestURLSource_Fetches(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
_, _ = w.Write([]byte("API_KEY=sk-live-xyz"))
}))
defer srv.Close()
chunks, err := drainURL(t, NewURLSource(srv.URL))
require.NoError(t, err)
require.Len(t, chunks, 1)
require.Equal(t, "url:"+srv.URL, chunks[0].Source)
require.Equal(t, "API_KEY=sk-live-xyz", string(chunks[0].Data))
}
func TestURLSource_RejectsBinaryContentType(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "image/png")
_, _ = w.Write([]byte{0x89, 0x50, 0x4e, 0x47})
}))
defer srv.Close()
_, err := drainURL(t, NewURLSource(srv.URL))
require.Error(t, err)
require.Contains(t, err.Error(), "Content-Type")
}
func TestURLSource_RejectsNonHTTPScheme(t *testing.T) {
_, err := drainURL(t, NewURLSource("file:///etc/passwd"))
require.Error(t, err)
require.Contains(t, err.Error(), "unsupported scheme")
}
func TestURLSource_Rejects500(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "boom", http.StatusInternalServerError)
}))
defer srv.Close()
_, err := drainURL(t, NewURLSource(srv.URL))
require.Error(t, err)
require.Contains(t, err.Error(), "500")
}
func TestURLSource_RejectsOversizeBody(t *testing.T) {
// Serve body just over the cap.
big := strings.Repeat("a", int(MaxURLContentLength)+10)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
_, _ = w.Write([]byte(big))
}))
defer srv.Close()
_, err := drainURL(t, NewURLSource(srv.URL))
require.Error(t, err)
}
func TestURLSource_FollowsRedirect(t *testing.T) {
target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
_, _ = w.Write([]byte("redirected body"))
}))
defer target.Close()
redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Redirect(w, r, target.URL, http.StatusMovedPermanently)
}))
defer redirector.Close()
chunks, err := drainURL(t, NewURLSource(redirector.URL))
require.NoError(t, err)
require.NotEmpty(t, chunks)
require.Contains(t, string(chunks[0].Data), "redirected body")
}