feat(10-06): implement HuggingFaceSource scanning Spaces and Models

- queries /api/spaces and /api/models via Hub API
- token optional: slower rate when absent (10s vs 3.6s)
- emits Findings with SourceType=recon:huggingface and prefixed Source URLs
- compile-time assert implements recon.ReconSource
This commit is contained in:
salvacybersec
2026-04-06 01:15:49 +03:00
parent 45f8782464
commit 39001f208c

View File

@@ -0,0 +1,181 @@
package sources
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// defaultHuggingFaceBaseURL is the public Hugging Face Hub root URL. It is
// used as the API base whenever a source's BaseURL is left empty.
const defaultHuggingFaceBaseURL = "https://huggingface.co"
// HuggingFaceConfig carries the settings that NewHuggingFaceSource copies
// into a HuggingFaceSource.
type HuggingFaceConfig struct {
// Token is the Hugging Face access token. Optional: when empty, no
// Authorization header is sent and the source applies its slower
// anonymous rate limit.
Token string
// BaseURL overrides the API root (intended for tests). Defaults to
// https://huggingface.co when empty.
BaseURL string
// Registry is handed to BuildQueries to produce the search keywords.
Registry *providers.Registry
// Limiters is the limiter registry the source waits on before each
// request. When nil, requests are issued without rate limiting.
Limiters *recon.LimiterRegistry
}
// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub
// API, searching both the Spaces and Models list endpoints for provider
// keywords.
//
// RECON-CODE-08: the token is optional; when it is empty the source still
// runs but reports a slower RateLimit.
//
// The exported fields mirror HuggingFaceConfig. A value built without
// NewHuggingFaceSource is still usable: Sweep fills in a missing client and
// falls back to the default base URL when BaseURL is empty.
type HuggingFaceSource struct {
Token string
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
// client performs the HTTP requests; Sweep creates one lazily when nil.
client *Client
}
// NewHuggingFaceSource builds a HuggingFaceSource from cfg. Token, Registry
// and Limiters are copied verbatim, a fresh client is attached, and an empty
// BaseURL is replaced with defaultHuggingFaceBaseURL.
func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
	src := &HuggingFaceSource{
		Token:    cfg.Token,
		BaseURL:  cfg.BaseURL,
		Registry: cfg.Registry,
		Limiters: cfg.Limiters,
		client:   NewClient(),
	}
	if src.BaseURL == "" {
		src.BaseURL = defaultHuggingFaceBaseURL
	}
	return src
}
// Name returns the fixed identifier of this source, "huggingface". The same
// value is used as the limiter key and is passed to BuildQueries.
func (s *HuggingFaceSource) Name() string {
	return "huggingface"
}
// RateLimit returns the request rate for this source. With a token the
// spacing is one request every 3.6 seconds (about 1000 per hour); without a
// token it is one request every 10 seconds, a deliberately conservative pace
// for anonymous access.
func (s *HuggingFaceSource) RateLimit() rate.Limit {
	// Start from the anonymous spacing and tighten it only when a token is set.
	interval := 10 * time.Second
	if s.Token != "" {
		interval = 3600 * time.Millisecond
	}
	return rate.Every(interval)
}
// Burst returns the limiter burst size. It is always 1, so requests are
// never issued back-to-back ahead of the configured rate.
func (s *HuggingFaceSource) Burst() int {
	return 1
}
// RespectsRobots reports whether robots.txt should be honored for this
// source. It always returns false: the source talks to the Hub's JSON API
// rather than crawling pages.
func (s *HuggingFaceSource) RespectsRobots() bool {
	return false
}
// Enabled reports whether the source should run. It ignores the supplied
// config and always returns true, because the source also works without a
// token (at the slower anonymous rate).
func (s *HuggingFaceSource) Enabled(_ recon.Config) bool {
	return true
}
// hfItem is the minimal element shape decoded from the /api/spaces and
// /api/models list responses. Only the `id` field is read; per the original
// author's note it has the form "owner/name". Any other fields in the
// response are ignored by the decoder.
type hfItem struct {
ID string `json:"id"`
}
// Sweep searches the Hub for every keyword produced by BuildQueries, hitting
// the Spaces endpoint and then the Models endpoint for each keyword, and
// sends one Finding per returned item on out.
//
// The second parameter is unused. Sweep returns nil when there are no
// keywords, stops with ctx.Err() once the context is cancelled, and aborts on
// the first error reported by an endpoint sweep.
func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	// A source built without the constructor has no client yet.
	if s.client == nil {
		s.client = NewClient()
	}

	apiRoot := defaultHuggingFaceBaseURL
	if s.BaseURL != "" {
		apiRoot = s.BaseURL
	}

	keywords := BuildQueries(s.Registry, s.Name())
	if len(keywords) == 0 {
		return nil
	}

	// target pairs an API list path with the public URL prefix that is
	// prepended to an item ID to form Finding.Source. The prefixes always
	// point at huggingface.co, independent of the API root in use.
	type target struct {
		path      string
		urlPrefix string
	}
	targets := [...]target{
		{path: "/api/spaces", urlPrefix: "https://huggingface.co/spaces/"},
		{path: "/api/models", urlPrefix: "https://huggingface.co/"},
	}

	for _, keyword := range keywords {
		for i := range targets {
			// Check for cancellation before starting each request.
			cancelErr := ctx.Err()
			if cancelErr != nil {
				return cancelErr
			}
			tgt := targets[i]
			sweepErr := s.sweepEndpoint(ctx, apiRoot, tgt.path, tgt.urlPrefix, keyword, out)
			if sweepErr != nil {
				return sweepErr
			}
		}
	}
	return nil
}
// sweepEndpoint runs one search against a single Hub list endpoint and emits
// a Finding for every returned item that has a non-empty ID.
//
// base is the API root, path the endpoint path (for example "/api/spaces"),
// urlPrefix the string prepended to each item ID to form Finding.Source, and
// query the search keyword. When s.Limiters is set, the call first waits on
// the limiter registered under s.Name(). The request asks for at most 50
// results and carries a Bearer token only when s.Token is non-empty.
//
// It returns the limiter error unchanged; request-construction, transport and
// JSON-decoding errors wrapped with a "huggingface" prefix; or ctx.Err() if
// the context is cancelled while a send on out is blocked.
func (s *HuggingFaceSource) sweepEndpoint(
	ctx context.Context,
	base, path, urlPrefix, query string,
	out chan<- recon.Finding,
) error {
	if limiters := s.Limiters; limiters != nil {
		// NOTE(review): the meaning of the trailing false argument is defined
		// by LimiterRegistry.Wait, which is not visible here.
		waitErr := limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false)
		if waitErr != nil {
			return waitErr
		}
	}

	endpoint := base + path + "?search=" + url.QueryEscape(query) + "&limit=50"
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return fmt.Errorf("huggingface: build request: %w", err)
	}
	req.Header.Set("Accept", "application/json")
	if token := s.Token; token != "" {
		req.Header.Set("Authorization", "Bearer "+token)
	}

	resp, err := s.client.Do(ctx, req)
	if err != nil {
		return fmt.Errorf("huggingface %s: %w", path, err)
	}
	defer resp.Body.Close()

	// The list endpoints answer with a JSON array of items.
	var results []hfItem
	decoder := json.NewDecoder(resp.Body)
	if decodeErr := decoder.Decode(&results); decodeErr != nil {
		return fmt.Errorf("huggingface %s: decode: %w", path, decodeErr)
	}

	for i := range results {
		id := results[i].ID
		if id == "" {
			// Nothing to link to without an ID.
			continue
		}
		hit := recon.Finding{
			Source:     urlPrefix + id,
			SourceType: "recon:huggingface",
			DetectedAt: time.Now().UTC(),
		}
		// Never block forever on a slow consumer: give up on cancellation.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case out <- hit:
		}
	}
	return nil
}
// Compile-time assertion that *HuggingFaceSource satisfies recon.ReconSource.
// The blank identifier means no variable is kept; the build simply fails if
// a required method is missing or has the wrong signature.
var _ recon.ReconSource = (*HuggingFaceSource)(nil)