feat(10-06): implement HuggingFaceSource scanning Spaces and Models
- queries /api/spaces and /api/models via the Hub API
- token is optional: a slower rate limit applies when it is absent (one request per 10s vs. one per 3.6s)
- emits Findings with SourceType=recon:huggingface and prefixed Source URLs
- adds a compile-time assertion that the type implements recon.ReconSource
This commit is contained in:
181
pkg/recon/sources/huggingface.go
Normal file
181
pkg/recon/sources/huggingface.go
Normal file
@@ -0,0 +1,181 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
const (
	// defaultHuggingFaceBaseURL is the root of the public Hugging Face Hub,
	// used whenever no BaseURL override is configured.
	defaultHuggingFaceBaseURL = "https://huggingface.co"
)
|
||||
|
||||
// HuggingFaceConfig configures a HuggingFaceSource.
|
||||
type HuggingFaceConfig struct {
|
||||
// Token is the Hugging Face access token. Optional — anonymous requests
|
||||
// are accepted but rate-limited more aggressively.
|
||||
Token string
|
||||
// BaseURL overrides the API root for tests. Defaults to
|
||||
// https://huggingface.co when empty.
|
||||
BaseURL string
|
||||
// Registry drives keyword generation via BuildQueries.
|
||||
Registry *providers.Registry
|
||||
// Limiters is the shared per-source limiter registry.
|
||||
Limiters *recon.LimiterRegistry
|
||||
}
|
||||
|
||||
// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub
|
||||
// API, sweeping both Spaces and model repositories for provider keywords.
|
||||
//
|
||||
// RECON-CODE-08: token optional; when empty the source still runs but applies
|
||||
// a slower RateLimit to stay within anonymous quotas.
|
||||
type HuggingFaceSource struct {
|
||||
Token string
|
||||
BaseURL string
|
||||
Registry *providers.Registry
|
||||
Limiters *recon.LimiterRegistry
|
||||
|
||||
client *Client
|
||||
}
|
||||
|
||||
// NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults.
|
||||
func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
|
||||
base := cfg.BaseURL
|
||||
if base == "" {
|
||||
base = defaultHuggingFaceBaseURL
|
||||
}
|
||||
return &HuggingFaceSource{
|
||||
Token: cfg.Token,
|
||||
BaseURL: base,
|
||||
Registry: cfg.Registry,
|
||||
Limiters: cfg.Limiters,
|
||||
client: NewClient(),
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the stable source identifier.
|
||||
func (s *HuggingFaceSource) Name() string { return "huggingface" }
|
||||
|
||||
// RateLimit returns the per-source token bucket rate. Authenticated requests
|
||||
// get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to
|
||||
// one every 10 seconds to stay conservative against the public quota.
|
||||
func (s *HuggingFaceSource) RateLimit() rate.Limit {
|
||||
if s.Token != "" {
|
||||
return rate.Every(3600 * time.Millisecond)
|
||||
}
|
||||
return rate.Every(10 * time.Second)
|
||||
}
|
||||
|
||||
// Burst returns the limiter burst capacity.
|
||||
func (s *HuggingFaceSource) Burst() int { return 1 }
|
||||
|
||||
// RespectsRobots reports whether this source should honor robots.txt.
|
||||
// The Hub API is a JSON endpoint, so robots.txt does not apply.
|
||||
func (s *HuggingFaceSource) RespectsRobots() bool { return false }
|
||||
|
||||
// Enabled reports whether this source should run. HuggingFace runs even
|
||||
// without a token — anonymous requests are permitted at a lower rate limit.
|
||||
func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// hfItem captures the only field this source needs from an entry in the
// /api/spaces and /api/models list responses: the `id`, which has the form
// "owner/name". Any other JSON fields are ignored when decoding.
type hfItem struct {
	ID string `json:"id"`
}
|
||||
|
||||
// Sweep iterates provider keywords and queries both the Spaces and Models
|
||||
// search endpoints, emitting one Finding per result.
|
||||
func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if s.client == nil {
|
||||
s.client = NewClient()
|
||||
}
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = defaultHuggingFaceBaseURL
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, s.Name())
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
endpoints := []struct {
|
||||
path string
|
||||
urlPrefix string // prefix applied to item.ID to form Finding.Source
|
||||
}{
|
||||
{"/api/spaces", "https://huggingface.co/spaces/"},
|
||||
{"/api/models", "https://huggingface.co/"},
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
for _, ep := range endpoints {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *HuggingFaceSource) sweepEndpoint(
|
||||
ctx context.Context,
|
||||
base, path, urlPrefix, query string,
|
||||
out chan<- recon.Finding,
|
||||
) error {
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("huggingface: build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
if s.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+s.Token)
|
||||
}
|
||||
|
||||
resp, err := s.client.Do(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("huggingface %s: %w", path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var items []hfItem
|
||||
if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
|
||||
return fmt.Errorf("huggingface %s: decode: %w", path, err)
|
||||
}
|
||||
|
||||
for _, item := range items {
|
||||
if item.ID == "" {
|
||||
continue
|
||||
}
|
||||
finding := recon.Finding{
|
||||
Source: urlPrefix + item.ID,
|
||||
SourceType: "recon:huggingface",
|
||||
DetectedAt: time.Now().UTC(),
|
||||
}
|
||||
select {
|
||||
case out <- finding:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource.
|
||||
var _ recon.ReconSource = (*HuggingFaceSource)(nil)
|
||||
Reference in New Issue
Block a user