diff --git a/pkg/recon/sources/huggingface.go b/pkg/recon/sources/huggingface.go new file mode 100644 index 0000000..0d1c11f --- /dev/null +++ b/pkg/recon/sources/huggingface.go @@ -0,0 +1,181 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// defaultHuggingFaceBaseURL is the public HF Hub API root. +const defaultHuggingFaceBaseURL = "https://huggingface.co" + +// HuggingFaceConfig configures a HuggingFaceSource. +type HuggingFaceConfig struct { + // Token is the Hugging Face access token. Optional — anonymous requests + // are accepted but rate-limited more aggressively. + Token string + // BaseURL overrides the API root for tests. Defaults to + // https://huggingface.co when empty. + BaseURL string + // Registry drives keyword generation via BuildQueries. + Registry *providers.Registry + // Limiters is the shared per-source limiter registry. + Limiters *recon.LimiterRegistry +} + +// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub +// API, sweeping both Spaces and model repositories for provider keywords. +// +// RECON-CODE-08: token optional; when empty the source still runs but applies +// a slower RateLimit to stay within anonymous quotas. +type HuggingFaceSource struct { + Token string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + + client *Client +} + +// NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults. +func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource { + base := cfg.BaseURL + if base == "" { + base = defaultHuggingFaceBaseURL + } + return &HuggingFaceSource{ + Token: cfg.Token, + BaseURL: base, + Registry: cfg.Registry, + Limiters: cfg.Limiters, + client: NewClient(), + } +} + +// Name returns the stable source identifier. +func (s *HuggingFaceSource) Name() string { return "huggingface" } + +// RateLimit returns the per-source token bucket rate. Authenticated requests +// get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to +// one every 10 seconds to stay conservative against the public quota. +func (s *HuggingFaceSource) RateLimit() rate.Limit { + if s.Token != "" { + return rate.Every(3600 * time.Millisecond) + } + return rate.Every(10 * time.Second) +} + +// Burst returns the limiter burst capacity. +func (s *HuggingFaceSource) Burst() int { return 1 } + +// RespectsRobots reports whether this source should honor robots.txt. +// The Hub API is a JSON endpoint, so robots.txt does not apply. +func (s *HuggingFaceSource) RespectsRobots() bool { return false } + +// Enabled reports whether this source should run. HuggingFace runs even +// without a token — anonymous requests are permitted at a lower rate limit. +func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true } + +// hfItem is the minimal shape returned by /api/spaces and /api/models list +// endpoints. Both expose an `id` of the form "owner/name". +type hfItem struct { + ID string `json:"id"` +} + +// Sweep iterates provider keywords and queries both the Spaces and Models +// search endpoints, emitting one Finding per result. +func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if s.client == nil { + s.client = NewClient() + } + base := s.BaseURL + if base == "" { + base = defaultHuggingFaceBaseURL + } + + queries := BuildQueries(s.Registry, s.Name()) + if len(queries) == 0 { + return nil + } + + endpoints := []struct { + path string + urlPrefix string // prefix applied to item.ID to form Finding.Source + }{ + {"/api/spaces", "https://huggingface.co/spaces/"}, + {"/api/models", "https://huggingface.co/"}, + } + + for _, q := range queries { + for _, ep := range endpoints { + if err := ctx.Err(); err != nil { + return err + } + if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil { + return err + } + } + } + return nil +} + +func (s *HuggingFaceSource) sweepEndpoint( + ctx context.Context, + base, path, urlPrefix, query string, + out chan<- recon.Finding, +) error { + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return fmt.Errorf("huggingface: build request: %w", err) + } + req.Header.Set("Accept", "application/json") + if s.Token != "" { + req.Header.Set("Authorization", "Bearer "+s.Token) + } + + resp, err := s.client.Do(ctx, req) + if err != nil { + return fmt.Errorf("huggingface %s: %w", path, err) + } + defer resp.Body.Close() + + var items []hfItem + if err := json.NewDecoder(resp.Body).Decode(&items); err != nil { + return fmt.Errorf("huggingface %s: decode: %w", path, err) + } + + for _, item := range items { + if item.ID == "" { + continue + } + finding := recon.Finding{ + Source: urlPrefix + item.ID, + SourceType: "recon:huggingface", + DetectedAt: time.Now().UTC(), + } + select { + case out <- finding: + case <-ctx.Done(): + return ctx.Err() + } + } + return nil +} + +// Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource. +var _ recon.ReconSource = (*HuggingFaceSource)(nil)