feat(10-06): implement HuggingFaceSource scanning Spaces and Models
- queries /api/spaces and /api/models via Hub API - token optional: slower rate when absent (10s vs 3.6s) - emits Findings with SourceType=recon:huggingface and prefixed Source URLs - compile-time assert implements recon.ReconSource
This commit is contained in:
181
pkg/recon/sources/huggingface.go
Normal file
181
pkg/recon/sources/huggingface.go
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
package sources
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"net/url"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"golang.org/x/time/rate"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||||
|
)
|
||||||
|
|
||||||
|
// defaultHuggingFaceBaseURL is the public HF Hub API root.
|
||||||
|
const defaultHuggingFaceBaseURL = "https://huggingface.co" // no trailing slash: endpoint paths such as "/api/spaces" are appended directly to it
|
||||||
|
|
||||||
|
// HuggingFaceConfig configures a HuggingFaceSource.
|
||||||
|
type HuggingFaceConfig struct {
|
||||||
|
// Token is the Hugging Face access token. Optional — anonymous requests
|
||||||
|
// are accepted but rate-limited more aggressively.
|
||||||
|
Token string
|
||||||
|
// BaseURL overrides the API root for tests. Defaults to
|
||||||
|
// https://huggingface.co when empty.
|
||||||
|
BaseURL string
|
||||||
|
// Registry drives keyword generation via BuildQueries.
|
||||||
|
Registry *providers.Registry
|
||||||
|
// Limiters is the shared per-source limiter registry.
|
||||||
|
Limiters *recon.LimiterRegistry
|
||||||
|
}
|
||||||
|
|
||||||
|
// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub
|
||||||
|
// API, sweeping both Spaces and model repositories for provider keywords.
|
||||||
|
//
|
||||||
|
// RECON-CODE-08: token optional; when empty the source still runs but applies
|
||||||
|
// a slower RateLimit to stay within anonymous quotas.
|
||||||
|
type HuggingFaceSource struct {
|
||||||
|
Token string
|
||||||
|
BaseURL string
|
||||||
|
Registry *providers.Registry
|
||||||
|
Limiters *recon.LimiterRegistry
|
||||||
|
|
||||||
|
client *Client
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults.
|
||||||
|
func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
|
||||||
|
base := cfg.BaseURL
|
||||||
|
if base == "" {
|
||||||
|
base = defaultHuggingFaceBaseURL
|
||||||
|
}
|
||||||
|
return &HuggingFaceSource{
|
||||||
|
Token: cfg.Token,
|
||||||
|
BaseURL: base,
|
||||||
|
Registry: cfg.Registry,
|
||||||
|
Limiters: cfg.Limiters,
|
||||||
|
client: NewClient(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Name returns the stable source identifier.
|
||||||
|
func (s *HuggingFaceSource) Name() string { return "huggingface" }
|
||||||
|
|
||||||
|
// RateLimit returns the per-source token bucket rate. Authenticated requests
|
||||||
|
// get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to
|
||||||
|
// one every 10 seconds to stay conservative against the public quota.
|
||||||
|
func (s *HuggingFaceSource) RateLimit() rate.Limit {
|
||||||
|
if s.Token != "" {
|
||||||
|
return rate.Every(3600 * time.Millisecond)
|
||||||
|
}
|
||||||
|
return rate.Every(10 * time.Second)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Burst returns the limiter burst capacity.
|
||||||
|
func (s *HuggingFaceSource) Burst() int { return 1 }
|
||||||
|
|
||||||
|
// RespectsRobots reports whether this source should honor robots.txt.
|
||||||
|
// The Hub API is a JSON endpoint, so robots.txt does not apply.
|
||||||
|
func (s *HuggingFaceSource) RespectsRobots() bool { return false }
|
||||||
|
|
||||||
|
// Enabled reports whether this source should run. HuggingFace runs even
|
||||||
|
// without a token — anonymous requests are permitted at a lower rate limit.
|
||||||
|
func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true }
|
||||||
|
|
||||||
|
// hfItem holds the single field this package reads from each entry of the
// /api/spaces and /api/models list responses: the repository id, which has
// the form "owner/name". All other JSON fields are ignored when decoding.
type hfItem struct {
	// ID is decoded from the "id" JSON key.
	ID string `json:"id"`
}
|
||||||
|
|
||||||
|
// Sweep iterates provider keywords and queries both the Spaces and Models
|
||||||
|
// search endpoints, emitting one Finding per result.
|
||||||
|
func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||||
|
if s.client == nil {
|
||||||
|
s.client = NewClient()
|
||||||
|
}
|
||||||
|
base := s.BaseURL
|
||||||
|
if base == "" {
|
||||||
|
base = defaultHuggingFaceBaseURL
|
||||||
|
}
|
||||||
|
|
||||||
|
queries := BuildQueries(s.Registry, s.Name())
|
||||||
|
if len(queries) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
endpoints := []struct {
|
||||||
|
path string
|
||||||
|
urlPrefix string // prefix applied to item.ID to form Finding.Source
|
||||||
|
}{
|
||||||
|
{"/api/spaces", "https://huggingface.co/spaces/"},
|
||||||
|
{"/api/models", "https://huggingface.co/"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, q := range queries {
|
||||||
|
for _, ep := range endpoints {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *HuggingFaceSource) sweepEndpoint(
|
||||||
|
ctx context.Context,
|
||||||
|
base, path, urlPrefix, query string,
|
||||||
|
out chan<- recon.Finding,
|
||||||
|
) error {
|
||||||
|
if s.Limiters != nil {
|
||||||
|
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query))
|
||||||
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("huggingface: build request: %w", err)
|
||||||
|
}
|
||||||
|
req.Header.Set("Accept", "application/json")
|
||||||
|
if s.Token != "" {
|
||||||
|
req.Header.Set("Authorization", "Bearer "+s.Token)
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := s.client.Do(ctx, req)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("huggingface %s: %w", path, err)
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
var items []hfItem
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
|
||||||
|
return fmt.Errorf("huggingface %s: decode: %w", path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, item := range items {
|
||||||
|
if item.ID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
finding := recon.Finding{
|
||||||
|
Source: urlPrefix + item.ID,
|
||||||
|
SourceType: "recon:huggingface",
|
||||||
|
DetectedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case out <- finding:
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource.
|
||||||
|
var _ recon.ReconSource = (*HuggingFaceSource)(nil) // assigned to the blank identifier, so the value is discarded; only the compiler's type check matters
|
||||||
Reference in New Issue
Block a user