package sources import ( "context" "encoding/json" "fmt" "net/http" "net/url" "time" "golang.org/x/time/rate" "github.com/salvacybersec/keyhunter/pkg/providers" "github.com/salvacybersec/keyhunter/pkg/recon" ) // defaultHuggingFaceBaseURL is the public HF Hub API root. const defaultHuggingFaceBaseURL = "https://huggingface.co" // HuggingFaceConfig configures a HuggingFaceSource. type HuggingFaceConfig struct { // Token is the Hugging Face access token. Optional — anonymous requests // are accepted but rate-limited more aggressively. Token string // BaseURL overrides the API root for tests. Defaults to // https://huggingface.co when empty. BaseURL string // Registry drives keyword generation via BuildQueries. Registry *providers.Registry // Limiters is the shared per-source limiter registry. Limiters *recon.LimiterRegistry } // HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub // API, sweeping both Spaces and model repositories for provider keywords. // // RECON-CODE-08: token optional; when empty the source still runs but applies // a slower RateLimit to stay within anonymous quotas. type HuggingFaceSource struct { Token string BaseURL string Registry *providers.Registry Limiters *recon.LimiterRegistry client *Client } // NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults. func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource { base := cfg.BaseURL if base == "" { base = defaultHuggingFaceBaseURL } return &HuggingFaceSource{ Token: cfg.Token, BaseURL: base, Registry: cfg.Registry, Limiters: cfg.Limiters, client: NewClient(), } } // Name returns the stable source identifier. func (s *HuggingFaceSource) Name() string { return "huggingface" } // RateLimit returns the per-source token bucket rate. Authenticated requests // get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to // one every 10 seconds to stay conservative against the public quota. func (s *HuggingFaceSource) RateLimit() rate.Limit { if s.Token != "" { return rate.Every(3600 * time.Millisecond) } return rate.Every(10 * time.Second) } // Burst returns the limiter burst capacity. func (s *HuggingFaceSource) Burst() int { return 1 } // RespectsRobots reports whether this source should honor robots.txt. // The Hub API is a JSON endpoint, so robots.txt does not apply. func (s *HuggingFaceSource) RespectsRobots() bool { return false } // Enabled reports whether this source should run. HuggingFace runs even // without a token — anonymous requests are permitted at a lower rate limit. func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true } // hfItem is the minimal shape returned by /api/spaces and /api/models list // endpoints. Both expose an `id` of the form "owner/name". type hfItem struct { ID string `json:"id"` } // Sweep iterates provider keywords and queries both the Spaces and Models // search endpoints, emitting one Finding per result. func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { if s.client == nil { s.client = NewClient() } base := s.BaseURL if base == "" { base = defaultHuggingFaceBaseURL } queries := BuildQueries(s.Registry, s.Name()) if len(queries) == 0 { return nil } endpoints := []struct { path string urlPrefix string // prefix applied to item.ID to form Finding.Source }{ {"/api/spaces", "https://huggingface.co/spaces/"}, {"/api/models", "https://huggingface.co/"}, } for _, q := range queries { for _, ep := range endpoints { if err := ctx.Err(); err != nil { return err } if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil { return err } } } return nil } func (s *HuggingFaceSource) sweepEndpoint( ctx context.Context, base, path, urlPrefix, query string, out chan<- recon.Finding, ) error { if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query)) req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) if err != nil { return fmt.Errorf("huggingface: build request: %w", err) } req.Header.Set("Accept", "application/json") if s.Token != "" { req.Header.Set("Authorization", "Bearer "+s.Token) } resp, err := s.client.Do(ctx, req) if err != nil { return fmt.Errorf("huggingface %s: %w", path, err) } defer resp.Body.Close() var items []hfItem if err := json.NewDecoder(resp.Body).Decode(&items); err != nil { return fmt.Errorf("huggingface %s: decode: %w", path, err) } for _, item := range items { if item.ID == "" { continue } finding := recon.Finding{ Source: urlPrefix + item.ID, SourceType: "recon:huggingface", DetectedAt: time.Now().UTC(), } select { case out <- finding: case <-ctx.Done(): return ctx.Err() } } return nil } // Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource. var _ recon.ReconSource = (*HuggingFaceSource)(nil)