package sources import ( "context" "encoding/json" "errors" "fmt" "net/http" "net/url" "time" "golang.org/x/time/rate" "github.com/salvacybersec/keyhunter/pkg/providers" "github.com/salvacybersec/keyhunter/pkg/recon" ) // KaggleSource implements recon.ReconSource against the Kaggle public REST API. // // RECON-CODE-09: queries GET /api/v1/kernels/list?search=&pageSize=50 with // HTTP Basic authentication (username + API key from kaggle.json). Emits // engine.Finding entries for every returned kernel ref, with Source pointing // to https://www.kaggle.com/code/. type KaggleSource struct { User string Key string BaseURL string // API base, default https://www.kaggle.com WebBaseURL string // Web UI base for Finding URLs, default https://www.kaggle.com Registry *providers.Registry Limiters *recon.LimiterRegistry client *Client } // NewKaggleSource constructs a KaggleSource with default URLs and a shared Client. func NewKaggleSource(user, key string, reg *providers.Registry, lim *recon.LimiterRegistry) *KaggleSource { return &KaggleSource{ User: user, Key: key, BaseURL: "https://www.kaggle.com", WebBaseURL: "https://www.kaggle.com", Registry: reg, Limiters: lim, client: NewClient(), } } // Name returns the stable source identifier. func (s *KaggleSource) Name() string { return "kaggle" } // RateLimit enforces Kaggle's documented 60 requests/minute ceiling. func (s *KaggleSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) } // Burst returns the per-source burst capacity. func (s *KaggleSource) Burst() int { return 1 } // RespectsRobots is false — Kaggle exposes a public REST API, not scraped HTML. func (s *KaggleSource) RespectsRobots() bool { return false } // Enabled reports whether both User and Key credentials are present. func (s *KaggleSource) Enabled(_ recon.Config) bool { return s.User != "" && s.Key != "" } // Sweep iterates provider keyword queries, calling the Kaggle kernels/list API // with Basic auth for each. For every returned kernel ref, a Finding is emitted // on out with SourceType "recon:kaggle" and Source pointing at the web UI URL. // // Missing credentials short-circuit to nil without issuing any HTTP calls. func (s *KaggleSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { if s.User == "" || s.Key == "" { return nil } base := s.BaseURL if base == "" { base = "https://www.kaggle.com" } web := s.WebBaseURL if web == "" { web = "https://www.kaggle.com" } queries := BuildQueries(s.Registry, "kaggle") for _, q := range queries { if err := ctx.Err(); err != nil { return err } if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } endpoint := fmt.Sprintf("%s/api/v1/kernels/list?search=%s&pageSize=50", base, url.QueryEscape(q)) req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) if err != nil { return err } req.SetBasicAuth(s.User, s.Key) req.Header.Set("Accept", "application/json") resp, err := s.client.Do(ctx, req) if err != nil { if errors.Is(err, ErrUnauthorized) { return err } // Sources downgrade on transient errors rather than aborting // the whole sweep — skip to the next query. continue } var kernels []kaggleKernel decodeErr := json.NewDecoder(resp.Body).Decode(&kernels) resp.Body.Close() if decodeErr != nil { continue } for _, k := range kernels { if k.Ref == "" { continue } f := recon.Finding{ Confidence: "low", Source: web + "/code/" + k.Ref, SourceType: "recon:kaggle", DetectedAt: time.Now(), } select { case out <- f: case <-ctx.Done(): return ctx.Err() } } } return nil } // kaggleKernel mirrors the subset of fields returned by /api/v1/kernels/list // that this source consumes. Additional fields (title, author, language) are // ignored on purpose to keep the decoder tolerant of future API changes. type kaggleKernel struct { Ref string `json:"ref"` } // Compile-time assertion that KaggleSource satisfies recon.ReconSource. var _ recon.ReconSource = (*KaggleSource)(nil)