- KaggleSource queries /api/v1/kernels/list with SetBasicAuth(user, key)
- Disabled when either KaggleUser or KaggleKey is empty (no HTTP calls)
- Emits Findings tagged recon:kaggle with Source = <web>/code/<ref>
- 60/min rate limit via rate.Every(1s), burst 1
- httptest-driven tests cover enabled, auth header, missing creds, 401 unauthorized, and ctx cancellation
- RECON-CODE-09
150 lines
4.2 KiB
Go
150 lines
4.2 KiB
Go
package sources
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net/http"
|
|
"net/url"
|
|
"time"
|
|
|
|
"golang.org/x/time/rate"
|
|
|
|
"github.com/salvacybersec/keyhunter/pkg/providers"
|
|
"github.com/salvacybersec/keyhunter/pkg/recon"
|
|
)
|
|
|
|
// KaggleSource implements recon.ReconSource against the Kaggle public REST API.
|
|
//
|
|
// RECON-CODE-09: queries GET /api/v1/kernels/list?search=<q>&pageSize=50 with
|
|
// HTTP Basic authentication (username + API key from kaggle.json). Emits
|
|
// engine.Finding entries for every returned kernel ref, with Source pointing
|
|
// to https://www.kaggle.com/code/<ref>.
|
|
type KaggleSource struct {
|
|
User string
|
|
Key string
|
|
BaseURL string // API base, default https://www.kaggle.com
|
|
WebBaseURL string // Web UI base for Finding URLs, default https://www.kaggle.com
|
|
Registry *providers.Registry
|
|
Limiters *recon.LimiterRegistry
|
|
client *Client
|
|
}
|
|
|
|
// NewKaggleSource constructs a KaggleSource with default URLs and a shared Client.
|
|
func NewKaggleSource(user, key string, reg *providers.Registry, lim *recon.LimiterRegistry) *KaggleSource {
|
|
return &KaggleSource{
|
|
User: user,
|
|
Key: key,
|
|
BaseURL: "https://www.kaggle.com",
|
|
WebBaseURL: "https://www.kaggle.com",
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
client: NewClient(),
|
|
}
|
|
}
|
|
|
|
// Name returns the stable source identifier.
|
|
func (s *KaggleSource) Name() string { return "kaggle" }
|
|
|
|
// RateLimit enforces Kaggle's documented 60 requests/minute ceiling.
|
|
func (s *KaggleSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) }
|
|
|
|
// Burst returns the per-source burst capacity.
|
|
func (s *KaggleSource) Burst() int { return 1 }
|
|
|
|
// RespectsRobots is false — Kaggle exposes a public REST API, not scraped HTML.
|
|
func (s *KaggleSource) RespectsRobots() bool { return false }
|
|
|
|
// Enabled reports whether both User and Key credentials are present.
|
|
func (s *KaggleSource) Enabled(_ recon.Config) bool {
|
|
return s.User != "" && s.Key != ""
|
|
}
|
|
|
|
// Sweep iterates provider keyword queries, calling the Kaggle kernels/list API
|
|
// with Basic auth for each. For every returned kernel ref, a Finding is emitted
|
|
// on out with SourceType "recon:kaggle" and Source pointing at the web UI URL.
|
|
//
|
|
// Missing credentials short-circuit to nil without issuing any HTTP calls.
|
|
func (s *KaggleSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
|
if s.User == "" || s.Key == "" {
|
|
return nil
|
|
}
|
|
|
|
base := s.BaseURL
|
|
if base == "" {
|
|
base = "https://www.kaggle.com"
|
|
}
|
|
web := s.WebBaseURL
|
|
if web == "" {
|
|
web = "https://www.kaggle.com"
|
|
}
|
|
|
|
queries := BuildQueries(s.Registry, "kaggle")
|
|
|
|
for _, q := range queries {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
if s.Limiters != nil {
|
|
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
endpoint := fmt.Sprintf("%s/api/v1/kernels/list?search=%s&pageSize=50", base, url.QueryEscape(q))
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
req.SetBasicAuth(s.User, s.Key)
|
|
req.Header.Set("Accept", "application/json")
|
|
|
|
resp, err := s.client.Do(ctx, req)
|
|
if err != nil {
|
|
if errors.Is(err, ErrUnauthorized) {
|
|
return err
|
|
}
|
|
// Sources downgrade on transient errors rather than aborting
|
|
// the whole sweep — skip to the next query.
|
|
continue
|
|
}
|
|
|
|
var kernels []kaggleKernel
|
|
decodeErr := json.NewDecoder(resp.Body).Decode(&kernels)
|
|
resp.Body.Close()
|
|
if decodeErr != nil {
|
|
continue
|
|
}
|
|
|
|
for _, k := range kernels {
|
|
if k.Ref == "" {
|
|
continue
|
|
}
|
|
f := recon.Finding{
|
|
Confidence: "low",
|
|
Source: web + "/code/" + k.Ref,
|
|
SourceType: "recon:kaggle",
|
|
DetectedAt: time.Now(),
|
|
}
|
|
select {
|
|
case out <- f:
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// kaggleKernel is the decoding target for one element of the
// /api/v1/kernels/list response. Only the ref is captured; any other fields
// in the payload (title, author, language, ...) are deliberately left
// undeclared so the decoder stays tolerant of future API changes.
type kaggleKernel struct {
	// Ref is the kernel reference taken from the "ref" JSON key.
	Ref string `json:"ref"`
}
|
|
|
|
// Compile-time assertion that KaggleSource satisfies recon.ReconSource.
|
|
var _ recon.ReconSource = (*KaggleSource)(nil)
|