diff --git a/.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md b/.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md new file mode 100644 index 0000000..a5b0c02 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md @@ -0,0 +1,117 @@ +--- +phase: 10-osint-code-hosting +plan: 08 +subsystem: recon +tags: [kaggle, osint, http-basic-auth, httptest] + +requires: + - phase: 10-osint-code-hosting + provides: "recon.ReconSource interface, sources.Client, BuildQueries, LimiterRegistry (Plan 10-01)" +provides: + - "KaggleSource implementing recon.ReconSource against Kaggle /api/v1/kernels/list" + - "HTTP Basic auth wiring via req.SetBasicAuth(user, key)" + - "Finding normalization to Source = web + /code/ + ref, SourceType=recon:kaggle" +affects: [10-09-register, 10-full-integration] + +tech-stack: + added: [] + patterns: + - "Basic-auth recon source pattern (user + key) as counterpart to bearer-token sources" + - "Credential-gated Sweep: return nil without HTTP when either credential missing" + +key-files: + created: + - pkg/recon/sources/kaggle.go + - pkg/recon/sources/kaggle_test.go + modified: [] + +key-decisions: + - "Short-circuit Sweep with nil error when User or Key is empty — no HTTP, no log spam" + - "kaggleKernel decoder ignores non-ref fields so API additions don't break decode" + - "Ignore decode errors and continue to next query (downgrade, not abort) — matches GitHubSource pattern" + +patterns-established: + - "Basic auth: req.SetBasicAuth(s.User, s.Key) after NewRequestWithContext" + - "Web URL derivation from API ref: web + /code/ + ref" + +requirements-completed: [RECON-CODE-09] + +duration: 8min +completed: 2026-04-05 +--- + +# Phase 10 Plan 08: KaggleSource Summary + +**KaggleSource emits Findings from Kaggle public notebook search via HTTP Basic auth against /api/v1/kernels/list** + +## Performance + +- **Duration:** ~8 min +- **Tasks:** 1 (TDD) +- **Files created:** 2 + +## Accomplishments + +- KaggleSource type implementing 
recon.ReconSource (Name, RateLimit, Burst, RespectsRobots, Enabled, Sweep) +- Credentials-gated: both User AND Key required; missing either returns nil with zero HTTP calls +- HTTP Basic auth wired via req.SetBasicAuth to Kaggle's /api/v1/kernels/list endpoint +- Findings normalized with SourceType "recon:kaggle" and Source = WebBaseURL + "/code/" + ref +- 60 req/min rate limit via rate.Every(1*time.Second), burst 1, honoring per-source LimiterRegistry +- Compile-time interface assertion: `var _ recon.ReconSource = (*KaggleSource)(nil)` + +## Task Commits + +1. **Task 1: KaggleSource + tests (TDD)** — `243b740` (feat) + +## Files Created + +- `pkg/recon/sources/kaggle.go` — KaggleSource implementation, kaggleKernel decoder, interface assertion +- `pkg/recon/sources/kaggle_test.go` — 6 httptest-driven tests + +## Test Coverage + +| Test | Covers | +|------|--------| +| TestKaggle_Enabled | All 4 credential combinations (empty/empty, user-only, key-only, both) | +| TestKaggle_Sweep_BasicAuthAndFindings | Authorization header decoded as testuser:testkey, 2 refs → 2 Findings with correct Source URLs and recon:kaggle SourceType | +| TestKaggle_Sweep_MissingCredentials_NoHTTP | Atomic counter verifies zero HTTP calls when either User or Key empty | +| TestKaggle_Sweep_Unauthorized | 401 response wrapped as ErrUnauthorized | +| TestKaggle_Sweep_CtxCancellation | Pre-cancelled ctx returns context.Canceled promptly | +| TestKaggle_ReconSourceInterface | Compile + runtime assertions on Name, Burst, RespectsRobots, RateLimit | + +All 6 tests pass in isolation: `go test ./pkg/recon/sources/ -run TestKaggle -v` + +## Decisions Made + +- **Missing-cred behavior:** Sweep returns nil (no error) when either credential absent. Matches GitHubSource pattern — disabled sources log-and-skip at the Engine level, not error out. +- **Decode tolerance:** kaggleKernel struct only declares `Ref string`. 
Other fields (title, author, language) are silently discarded so upstream API changes don't break the source. +- **Error downgrade:** Non-401 HTTP errors skip to next query rather than aborting the whole sweep. 401 is the only hard-fail case because it means credentials are actually invalid, not transient. +- **Dual BaseURL fields:** BaseURL (API) and WebBaseURL (Finding URL stem) are separate struct fields so tests can point BaseURL at httptest.NewServer while WebBaseURL stays at the production kaggle.com domain for assertion stability. + +## Deviations from Plan + +None — plan executed exactly as written. All truths from frontmatter (`must_haves`) satisfied: +- KaggleSource queries `/api/v1/kernels/list` with Basic auth → TestKaggle_Sweep_BasicAuthAndFindings +- Disabled when either credential empty → TestKaggle_Enabled + TestKaggle_Sweep_MissingCredentials_NoHTTP +- Findings tagged recon:kaggle with Source = web + /code/ + ref → TestKaggle_Sweep_BasicAuthAndFindings + +## Issues Encountered + +- **Sibling-wave file churn:** During testing, sibling Wave 2 plans (10-02 GitHub, 10-05 Replit, 10-07 CodeSandbox, 10-03 GitLab) had already dropped partial files into `pkg/recon/sources/` in the main repo. A stray `github_test.go` with no `github.go` broke package compilation. Resolved by running tests in this plan's git worktree where only kaggle.go and kaggle_test.go are present alongside the Plan 10-01 scaffolding. No cross-plan changes made — scope boundary respected. Final wave merge will resolve all sibling files together. + +## Next Phase Readiness + +- KaggleSource is ready for registration in Plan 10-09 (`RegisterAll` wiring). +- No blockers for downstream plans. RECON-CODE-09 satisfied. 
+
+## Self-Check: PASSED
+
+- File exists: `pkg/recon/sources/kaggle.go` — FOUND
+- File exists: `pkg/recon/sources/kaggle_test.go` — FOUND
+- Commit exists: `243b740` — FOUND (feat(10-08): add KaggleSource with HTTP Basic auth)
+- Tests pass: 6/6 TestKaggle_* (verified with sibling files stashed to isolate package build)
+
+---
+*Phase: 10-osint-code-hosting*
+*Plan: 08*
+*Completed: 2026-04-05*
diff --git a/pkg/recon/sources/kaggle.go b/pkg/recon/sources/kaggle.go
new file mode 100644
index 0000000..de1a8ba
--- /dev/null
+++ b/pkg/recon/sources/kaggle.go
@@ -0,0 +1,168 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// KaggleSource implements recon.ReconSource against the Kaggle public REST API.
+//
+// RECON-CODE-09: queries GET /api/v1/kernels/list?search={query}&pageSize=50
+// with HTTP Basic authentication (username + API key from kaggle.json). Emits a
+// recon.Finding for every returned kernel ref, with Source pointing to
+// https://www.kaggle.com/code/{ref}.
+type KaggleSource struct {
+	User       string
+	Key        string
+	BaseURL    string // API base, default https://www.kaggle.com
+	WebBaseURL string // Web UI base for Finding URLs, default https://www.kaggle.com
+	Registry   *providers.Registry
+	Limiters   *recon.LimiterRegistry
+	client     *Client // HTTP client; Sweep falls back to NewClient() when nil
+}
+
+// NewKaggleSource constructs a KaggleSource with default URLs and a shared Client.
+func NewKaggleSource(user, key string, reg *providers.Registry, lim *recon.LimiterRegistry) *KaggleSource {
+	return &KaggleSource{
+		User:       user,
+		Key:        key,
+		BaseURL:    "https://www.kaggle.com",
+		WebBaseURL: "https://www.kaggle.com",
+		Registry:   reg,
+		Limiters:   lim,
+		client:     NewClient(),
+	}
+}
+
+// Name returns the stable source identifier.
+func (s *KaggleSource) Name() string { return "kaggle" }
+
+// RateLimit enforces Kaggle's documented 60 requests/minute ceiling.
+func (s *KaggleSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) }
+
+// Burst returns the per-source burst capacity.
+func (s *KaggleSource) Burst() int { return 1 }
+
+// RespectsRobots is false — Kaggle exposes a public REST API, not scraped HTML.
+func (s *KaggleSource) RespectsRobots() bool { return false }
+
+// Enabled reports whether both User and Key credentials are present.
+func (s *KaggleSource) Enabled(_ recon.Config) bool {
+	return s.User != "" && s.Key != ""
+}
+
+// Sweep iterates provider keyword queries, calling the Kaggle kernels/list API
+// with Basic auth for each. For every returned kernel ref, a Finding is emitted
+// on out with SourceType "recon:kaggle" and Source pointing at the web UI URL.
+//
+// Missing credentials short-circuit to nil without issuing any HTTP calls.
+// Limiter, request-construction, ErrUnauthorized, and context-cancellation
+// errors abort the sweep and are returned; any other per-query request or
+// decode failure is skipped so the remaining queries still run.
+func (s *KaggleSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	if s.User == "" || s.Key == "" {
+		return nil
+	}
+
+	base := s.BaseURL
+	if base == "" {
+		base = "https://www.kaggle.com"
+	}
+	web := s.WebBaseURL
+	if web == "" {
+		web = "https://www.kaggle.com"
+	}
+	// A KaggleSource built as a struct literal has no client; default it the
+	// same way the empty URL fields are defaulted instead of dereferencing nil.
+	client := s.client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "kaggle")
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		endpoint := fmt.Sprintf("%s/api/v1/kernels/list?search=%s&pageSize=50", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return err
+		}
+		req.SetBasicAuth(s.User, s.Key)
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			if errors.Is(err, ErrUnauthorized) {
+				return err
+			}
+			// Cancellation is not a transient failure: report it even when
+			// this was the last query and the loop would otherwise fall
+			// through to the nil return.
+			if ctxErr := ctx.Err(); ctxErr != nil {
+				return ctxErr
+			}
+			// Sources downgrade on transient errors rather than aborting
+			// the whole sweep — skip to the next query.
+			continue
+		}
+
+		var kernels []kaggleKernel
+		decodeErr := json.NewDecoder(resp.Body).Decode(&kernels)
+		resp.Body.Close()
+		if decodeErr != nil {
+			// A decode aborted by cancellation is reported, not skipped.
+			if ctxErr := ctx.Err(); ctxErr != nil {
+				return ctxErr
+			}
+			continue
+		}
+
+		for _, k := range kernels {
+			if k.Ref == "" {
+				continue
+			}
+			f := recon.Finding{
+				Confidence: "low",
+				Source:     web + "/code/" + k.Ref,
+				SourceType: "recon:kaggle",
+				DetectedAt: time.Now(),
+			}
+			select {
+			case out <- f:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+
+	return nil
+}
+
+// kaggleKernel mirrors the subset of fields returned by /api/v1/kernels/list
+// that this source consumes. Additional fields (title, author, language) are
+// ignored on purpose to keep the decoder tolerant of future API changes.
+type kaggleKernel struct {
+	Ref string `json:"ref"`
+}
+
+// Compile-time assertion that KaggleSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*KaggleSource)(nil)
diff --git a/pkg/recon/sources/kaggle_test.go b/pkg/recon/sources/kaggle_test.go
new file mode 100644
index 0000000..90dac47
--- /dev/null
+++ b/pkg/recon/sources/kaggle_test.go
@@ -0,0 +1,204 @@
+package sources
+
+import (
+	"context"
+	"encoding/base64"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func kaggleTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+func newKaggleSource(t *testing.T, user, key, baseURL string) *KaggleSource {
+	t.Helper()
+	s := NewKaggleSource(user, key, kaggleTestRegistry(), recon.NewLimiterRegistry())
+	s.BaseURL = baseURL
+	s.WebBaseURL = "https://www.kaggle.com"
+	return s
+}
+
+func TestKaggle_Enabled(t *testing.T) {
+	reg := kaggleTestRegistry()
+	lim := recon.NewLimiterRegistry()
+
+	cases := []struct {
+		user, key string
+		want      bool
+	}{
+		{"", "", false},
+		{"user", "", false},
+		{"", "key", false},
+		{"user", "key", true},
+	}
+	for _, c := range cases {
+		s := NewKaggleSource(c.user, c.key, reg, lim)
+		if got := s.Enabled(recon.Config{}); got != c.want {
+			t.Errorf("Enabled(user=%q,key=%q) = %v, want %v", c.user, c.key, got, c.want)
+		}
+	}
+}
+
+func TestKaggle_Sweep_BasicAuthAndFindings(t *testing.T) {
+	var gotAuth string
+	var gotQuery string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gotAuth = r.Header.Get("Authorization")
+		gotQuery = r.URL.Query().Get("search")
+		if r.URL.Query().Get("pageSize") != "50" {
+			t.Errorf("expected pageSize=50, got %q", r.URL.Query().Get("pageSize"))
+		}
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(200)
+		_, _ = w.Write([]byte(`[{"ref":"alice/notebook-one","title":"one"},{"ref":"bob/notebook-two","title":"two"}]`))
+	}))
+	defer srv.Close()
+
+	s := newKaggleSource(t, "testuser", "testkey", srv.URL)
+
+	out := make(chan recon.Finding, 8)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	if err := s.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep returned error: %v", err)
+	}
+	close(out)
+
+	if !strings.HasPrefix(gotAuth, "Basic ") {
+		t.Fatalf("expected Basic auth header, got %q", gotAuth)
+	}
+	decoded, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(gotAuth, "Basic "))
+	if err != nil {
+		t.Fatalf("failed to decode Basic auth: %v", err)
+	}
+	if string(decoded) != "testuser:testkey" {
+		t.Fatalf("expected credentials testuser:testkey, got %q", string(decoded))
+	}
+
+	if gotQuery != "sk-proj-" {
+		t.Errorf("expected search=sk-proj-, got %q", gotQuery)
+	}
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) != 2 {
+		t.Fatalf("expected 2 findings, got %d", len(findings))
+	}
+	wantSources := map[string]bool{
+		"https://www.kaggle.com/code/alice/notebook-one": false,
+		"https://www.kaggle.com/code/bob/notebook-two":   false,
+	}
+	for _, f := range findings {
+		if f.SourceType != "recon:kaggle" {
+			t.Errorf("expected SourceType recon:kaggle, got %q", f.SourceType)
+		}
+		if _, ok := wantSources[f.Source]; !ok {
+			t.Errorf("unexpected Source: %q", f.Source)
+		}
+		wantSources[f.Source] = true
+	}
+	for src, seen := range wantSources {
+		if !seen {
+			t.Errorf("missing expected finding source: %s", src)
+		}
+	}
+}
+
+func TestKaggle_Sweep_MissingCredentials_NoHTTP(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(200)
+		_, _ = w.Write([]byte("[]"))
+	}))
+	defer srv.Close()
+
+	s := newKaggleSource(t, "testuser", "", srv.URL)
+	out := make(chan recon.Finding, 1)
+	if err := s.Sweep(context.Background(), "", out); err != nil {
+		t.Fatalf("expected nil error for missing key, got %v", err)
+	}
+	close(out)
+
+	s2 := newKaggleSource(t, "", "testkey", srv.URL)
+	out2 := make(chan recon.Finding, 1)
+	if err := s2.Sweep(context.Background(), "", out2); err != nil {
+		t.Fatalf("expected nil error for missing user, got %v", err)
+	}
+	close(out2)
+
+	if n := atomic.LoadInt32(&calls); n != 0 {
+		t.Fatalf("expected 0 HTTP calls when credentials missing, got %d", n)
+	}
+}
+
+func TestKaggle_Sweep_Unauthorized(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(401)
+		_, _ = w.Write([]byte("bad creds"))
+	}))
+	defer srv.Close()
+
+	s := newKaggleSource(t, "testuser", "testkey", srv.URL)
+	out := make(chan recon.Finding, 1)
+	err := s.Sweep(context.Background(), "", out)
+	if err == nil {
+		t.Fatal("expected error on 401")
+	}
+	if !errors.Is(err, ErrUnauthorized) {
+		t.Fatalf("expected ErrUnauthorized, got %v", err)
+	}
+}
+
+func TestKaggle_Sweep_CtxCancellation(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(2 * time.Second)
+		w.WriteHeader(200)
+		_, _ = w.Write([]byte("[]"))
+	}))
+	defer srv.Close()
+
+	s := newKaggleSource(t, "testuser", "testkey", srv.URL)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 1)
+	err := s.Sweep(ctx, "", out)
+	if err == nil {
+		t.Fatal("expected error from cancelled context")
+	}
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got %v", err)
+	}
+}
+
+func TestKaggle_ReconSourceInterface(t *testing.T) {
+	var _ recon.ReconSource = (*KaggleSource)(nil)
+	s := NewKaggleSource("u", "k", nil, nil)
+	if s.Name() != "kaggle" {
+		t.Errorf("Name = %q, want kaggle", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("Burst = %d, want 1", s.Burst())
+	}
+	if s.RespectsRobots() {
+		t.Error("RespectsRobots should be false")
+	}
+	if s.RateLimit() <= 0 {
+		t.Error("RateLimit should be > 0")
+	}
+}