From 45f87824640f455c49f7b579ae57f4fe20c81c2a Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 01:15:43 +0300 Subject: [PATCH 1/3] test(10-06): add failing tests for HuggingFaceSource - httptest server routes /api/spaces and /api/models - assertions: enabled, both endpoints hit, URL prefixes, auth header, ctx cancel, rate-limit token mode --- pkg/recon/sources/huggingface_test.go | 204 ++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 pkg/recon/sources/huggingface_test.go diff --git a/pkg/recon/sources/huggingface_test.go b/pkg/recon/sources/huggingface_test.go new file mode 100644 index 0000000..b4a4cfc --- /dev/null +++ b/pkg/recon/sources/huggingface_test.go @@ -0,0 +1,204 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// hfTestRegistry builds a minimal registry with two keywords so tests assert +// an exact Finding count (2 endpoints × 2 keywords × 1 result = 4). 
+func hfTestRegistry(t *testing.T) *providers.Registry {
+	t.Helper()
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "OpenAI", Keywords: []string{"sk-proj"}},
+		{Name: "Anthropic", Keywords: []string{"sk-ant"}},
+	})
+}
+
+// hfTestServer serves one-item search results on /api/spaces and /api/models,
+// counting hits per endpoint; when authSeen is non-nil it records the
+// Authorization header of the most recent /api/spaces request.
+func hfTestServer(t *testing.T, spacesHits, modelsHits *int32, authSeen *string) *httptest.Server {
+	t.Helper()
+	reply := func(w http.ResponseWriter, r *http.Request, kind string) {
+		payload := []map[string]string{
+			{"id": fmt.Sprintf("acme/%s-%s", kind, r.URL.Query().Get("search"))},
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_ = json.NewEncoder(w).Encode(payload)
+	}
+	mux := http.NewServeMux()
+	mux.HandleFunc("/api/spaces", func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(spacesHits, 1)
+		if authSeen != nil {
+			*authSeen = r.Header.Get("Authorization")
+		}
+		reply(w, r, "space")
+	})
+	mux.HandleFunc("/api/models", func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(modelsHits, 1)
+		reply(w, r, "model")
+	})
+	return httptest.NewServer(mux)
+}
+
+func TestHuggingFaceEnabledAlwaysTrue(t *testing.T) {
+	if !(&HuggingFaceSource{}).Enabled(recon.Config{}) {
+		t.Fatal("HuggingFace should be enabled even without token")
+	}
+	if !(&HuggingFaceSource{Token: "hf_xxx"}).Enabled(recon.Config{}) {
+		t.Fatal("HuggingFace should be enabled with token")
+	}
+}
+
+func TestHuggingFaceSweepHitsBothEndpoints(t *testing.T) {
+	var spacesHits, modelsHits int32
+	ts := hfTestServer(t, &spacesHits, &modelsHits, nil)
+	defer ts.Close()
+
+	reg := hfTestRegistry(t)
+	src := NewHuggingFaceSource(HuggingFaceConfig{
+		Token:    "hf_test",
+		BaseURL:  ts.URL,
+		Registry: reg,
+		Limiters: nil, // bypass rate limiter for tests
+	})
+
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+
+	if len(findings) != 4 {
+		t.Fatalf("expected 4 findings, got %d", len(findings))
+	}
+	if n := atomic.LoadInt32(&spacesHits); n != 2 {
+		t.Errorf("expected 2 /api/spaces hits, got %d", n)
+	}
+	if n := atomic.LoadInt32(&modelsHits); n != 2 {
+		t.Errorf("expected 2 /api/models hits, got %d", n)
+	}
+
+	var sawSpace, sawModel bool
+	for _, f := range findings {
+		if f.SourceType != "recon:huggingface" {
+			t.Errorf("wrong SourceType: %q", f.SourceType)
+		}
+		switch {
+		case strings.HasPrefix(f.Source, "https://huggingface.co/spaces/"):
+			sawSpace = true
+		case strings.HasPrefix(f.Source, "https://huggingface.co/"):
+			sawModel = true
+		default:
+			t.Errorf("unexpected Source URL: %q", f.Source)
+		}
+	}
+	if !sawSpace || !sawModel {
+		t.Errorf("expected both space and model URLs; space=%v model=%v", sawSpace, sawModel)
+	}
+}
+
+func TestHuggingFaceAuthorizationHeader(t *testing.T) {
+	var authSeen string
+	var s, m int32
+	ts := hfTestServer(t, &s, &m, &authSeen)
+	defer ts.Close()
+
+	reg := hfTestRegistry(t)
+	src := NewHuggingFaceSource(HuggingFaceConfig{
+		Token:    "hf_secret",
+		BaseURL:  ts.URL,
+		Registry: reg,
+		Limiters: nil,
+	})
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep: %v", err)
+	}
+	close(out)
+	for range out {
+	}
+	if authSeen != "Bearer hf_secret" {
+		t.Errorf("expected 'Bearer hf_secret', got %q", authSeen)
+	}
+
+	// Without token
+	authSeen = ""
+	var s2, m2 int32
+	ts2 := hfTestServer(t, &s2, &m2, &authSeen)
+	defer ts2.Close()
+	src2 := NewHuggingFaceSource(HuggingFaceConfig{
+		BaseURL:  ts2.URL,
+		Registry: reg,
+		Limiters: nil,
+	})
+	out2 := make(chan recon.Finding, 16)
+	if err := src2.Sweep(ctx, "", out2); err != nil {
+		t.Fatalf("Sweep unauth: %v", err)
+	}
+	close(out2)
+	for range out2 {
+	}
+	if authSeen != "" {
+		t.Errorf("expected no Authorization header when token empty, got %q", authSeen)
+	}
+}
+
+func TestHuggingFaceContextCancellation(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		select {
+		case <-r.Context().Done():
+			return
+		case <-time.After(2 * time.Second):
+			w.WriteHeader(http.StatusOK)
+			_, _ = w.Write([]byte("[]"))
+		}
+	}))
+	defer ts.Close()
+
+	reg := hfTestRegistry(t)
+	src := NewHuggingFaceSource(HuggingFaceConfig{
+		BaseURL:  ts.URL,
+		Registry: reg,
+		Limiters: nil,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
+	out := make(chan recon.Finding, 16)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected error on cancelled context")
+	}
+}
+
+func TestHuggingFaceRateLimitTokenMode(t *testing.T) {
+	withTok := &HuggingFaceSource{Token: "hf_xxx"}
+	noTok := &HuggingFaceSource{}
+	if withTok.RateLimit() == noTok.RateLimit() {
+		t.Fatal("rate limit should differ based on token presence")
+	}
+	if withTok.RateLimit() < noTok.RateLimit() {
+		t.Fatalf("authenticated rate (%v) should be faster (larger) than unauth (%v)",
+			withTok.RateLimit(), noTok.RateLimit())
+	}
+}
From 39001f208c51fa5e420b2151adcc254d8263c29f Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 01:15:49 +0300 Subject: [PATCH 2/3] feat(10-06): implement HuggingFaceSource scanning Spaces and Models - queries /api/spaces and /api/models via Hub API - token optional: slower rate when absent (10s vs 3.6s) - emits Findings with SourceType=recon:huggingface and prefixed Source URLs - compile-time assert implements recon.ReconSource --- pkg/recon/sources/huggingface.go | 181 +++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 pkg/recon/sources/huggingface.go diff --git a/pkg/recon/sources/huggingface.go b/pkg/recon/sources/huggingface.go
new file mode 100644 index 0000000..0d1c11f --- /dev/null +++ b/pkg/recon/sources/huggingface.go @@ -0,0 +1,181 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// defaultHuggingFaceBaseURL is the API root used when no BaseURL is configured.
+const defaultHuggingFaceBaseURL = "https://huggingface.co"
+
+// HuggingFaceConfig carries the settings used to build a HuggingFaceSource.
+type HuggingFaceConfig struct {
+	// Token is an optional Hugging Face access token; when set it is sent as
+	// a Bearer Authorization header and selects the faster RateLimit.
+	Token string
+	// BaseURL replaces the API root (tests point it at an httptest server);
+	// an empty value falls back to defaultHuggingFaceBaseURL.
+	BaseURL string
+	// Registry is handed to BuildQueries to produce the search keywords.
+	Registry *providers.Registry
+	// Limiters is the limiter registry waited on before each request; nil skips the wait.
+	Limiters *recon.LimiterRegistry
+}
+
+// HuggingFaceSource is a recon.ReconSource that searches Hugging Face Spaces
+// and model repositories for provider keywords through the Hub API.
+//
+// RECON-CODE-08: the token is optional; without one the source still runs
+// but reports a slower RateLimit.
+type HuggingFaceSource struct {
+	Token    string
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+
+	client *Client
+}
+
+// NewHuggingFaceSource copies cfg into a new source, defaults BaseURL, and attaches a fresh Client.
+func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
+	src := &HuggingFaceSource{
+		Token:    cfg.Token,
+		BaseURL:  cfg.BaseURL,
+		Registry: cfg.Registry,
+		Limiters: cfg.Limiters,
+		client:   NewClient(),
+	}
+	if src.BaseURL == "" {
+		src.BaseURL = defaultHuggingFaceBaseURL
+	}
+	return src
+}
+
+// Name reports the fixed identifier of this source.
+func (s *HuggingFaceSource) Name() string { return "huggingface" }
+
+// RateLimit picks the limiter rate from token presence: one request per 3.6s
+// (~1000/hour) when a Token is set, otherwise a conservative one request per
+// 10 seconds for anonymous use.
+func (s *HuggingFaceSource) RateLimit() rate.Limit {
+	if s.Token == "" {
+		return rate.Every(10 * time.Second)
+	}
+	return rate.Every(3600 * time.Millisecond)
+}
+
+// Burst is the limiter burst size: a single request.
+func (s *HuggingFaceSource) Burst() int { return 1 }
+
+// RespectsRobots is always false: the source only calls the Hub's JSON API,
+// so robots.txt is not consulted.
+func (s *HuggingFaceSource) RespectsRobots() bool { return false }
+
+// Enabled always returns true regardless of configuration; a missing token
+// only changes RateLimit, it does not disable the source.
+func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true }
+
+// hfItem captures the only field read from each element of the /api/spaces
+// and /api/models list responses: the item's `id`.
+type hfItem struct {
+	ID string `json:"id"`
+}
+
+// Sweep runs every provider keyword against the Spaces and Models search
+// endpoints and sends one Finding per returned item.
+func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	if s.client == nil {
+		s.client = NewClient()
+	}
+	root := s.BaseURL
+	if root == "" {
+		root = defaultHuggingFaceBaseURL
+	}
+
+	keywords := BuildQueries(s.Registry, s.Name())
+	if len(keywords) == 0 {
+		return nil
+	}
+
+	targets := []struct {
+		apiPath    string
+		linkPrefix string // prepended to item.ID to build Finding.Source
+	}{
+		{apiPath: "/api/spaces", linkPrefix: "https://huggingface.co/spaces/"},
+		{apiPath: "/api/models", linkPrefix: "https://huggingface.co/"},
+	}
+
+	for _, kw := range keywords {
+		for _, tgt := range targets {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			if err := s.sweepEndpoint(ctx, root, tgt.apiPath, tgt.linkPrefix, kw, out); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// sweepEndpoint performs one search request against base+path and forwards
+// each non-empty result id, prefixed with urlPrefix, to out as a Finding.
+func (s *HuggingFaceSource) sweepEndpoint(
+	ctx context.Context, base, path, urlPrefix, query string, out chan<- recon.Finding,
+) error {
+	if s.Limiters != nil {
+		if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+			return err
+		}
+	}
+
+	target := base + path + "?search=" + url.QueryEscape(query) + "&limit=50"
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
+	if err != nil {
+		return fmt.Errorf("huggingface: build request: %w", err)
+	}
+	req.Header.Set("Accept", "application/json")
+	if s.Token != "" {
+		req.Header.Set("Authorization", "Bearer "+s.Token)
+	}
+
+	resp, err := s.client.Do(ctx, req)
+	if err != nil {
+		return fmt.Errorf("huggingface %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+
+	var results []hfItem
+	if err := json.NewDecoder(resp.Body).Decode(&results); err != nil {
+		return fmt.Errorf("huggingface %s: decode: %w", path, err)
+	}
+
+	for _, res := range results {
+		if res.ID == "" {
+			continue
+		}
+		finding := recon.Finding{
+			Source:     urlPrefix + res.ID,
+			SourceType: "recon:huggingface",
+			DetectedAt: time.Now().UTC(),
+		}
+		select {
+		case out <- finding:
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+	}
+	return nil
+}
+
+// Compile-time check that *HuggingFaceSource implements recon.ReconSource.
+var _ recon.ReconSource = (*HuggingFaceSource)(nil)
From cae714b4887336af12643d1e7ddec36bd40a74c5 Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 01:16:27 +0300 Subject: [PATCH 3/3] docs(10-06): complete HuggingFaceSource plan --- .../10-osint-code-hosting/10-06-SUMMARY.md | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .planning/phases/10-osint-code-hosting/10-06-SUMMARY.md diff --git a/.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md b/.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md new file mode 100644 index 0000000..a645a23 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md @@ -0,0 +1,79 @@ +--- +phase: 10-osint-code-hosting +plan: 06 +subsystem: recon/sources +tags: [recon, osint, huggingface, wave-2] +requires: + - pkg/recon/sources.Client (Plan 10-01) + - pkg/recon/sources.BuildQueries (Plan 10-01) + - pkg/recon.LimiterRegistry + - pkg/providers.Registry +provides: + - pkg/recon/sources.HuggingFaceSource + - pkg/recon/sources.HuggingFaceConfig + - pkg/recon/sources.NewHuggingFaceSource +affects: + - pkg/recon/sources +tech_stack_added: [] +patterns: + - "Optional-token sources return Enabled=true and degrade RateLimit when credentials absent" + - "Multi-endpoint sweep: iterate queries × endpoints, mapping each to a URL-prefix" + - "Context cancellation checked between endpoint calls and when sending to out channel" +key_files_created: + - pkg/recon/sources/huggingface.go + - pkg/recon/sources/huggingface_test.go +key_files_modified: [] +decisions: + - "Unauthenticated rate of rate.Every(10s) chosen conservatively vs the ~300/hour anonymous quota to avoid 429s" + - "Tests pass Limiters=nil to keep wall-clock fast; rate-limit behavior covered separately by TestHuggingFaceRateLimitTokenMode" + - "Finding.Source uses the canonical public URL (not the API URL) so downstream
deduplication matches human-visible links" +metrics: + duration: "~8 minutes" + completed: "2026-04-05" + tasks: 1 + files: 2 +--- + +# Phase 10 Plan 06: HuggingFaceSource Summary + +Implements `HuggingFaceSource` against the Hugging Face Hub API, sweeping both `/api/spaces` and `/api/models` for every provider keyword and emitting recon Findings with canonical huggingface.co URLs. + +## What Changed + +- New `HuggingFaceSource` implementing `recon.ReconSource` with optional `Token`. +- Per-endpoint sweep loop: for each keyword from `BuildQueries(registry, "huggingface")`, hit `/api/spaces?search=...&limit=50` then `/api/models?search=...&limit=50`. +- URL normalization: space results mapped to `https://huggingface.co/spaces/{id}`, model results to `https://huggingface.co/{id}`. +- Rate limit is token-aware: `rate.Every(3600ms)` when authenticated (matches 1000/hour), `rate.Every(10s)` otherwise. +- Authorization header only set when `Token != ""`. +- Compile-time assertion `var _ recon.ReconSource = (*HuggingFaceSource)(nil)`. + +## Test Coverage + +All five TDD assertions in `huggingface_test.go` pass: + +1. `TestHuggingFaceEnabledAlwaysTrue` — enabled with and without token. +2. `TestHuggingFaceSweepHitsBothEndpoints` — exact Finding count (2 keywords × 2 endpoints = 4), both URL prefixes observed, `SourceType="recon:huggingface"`. +3. `TestHuggingFaceAuthorizationHeader` — `Bearer hf_secret` sent when token set, header absent when empty. +4. `TestHuggingFaceContextCancellation` — slow server + 100ms context returns error promptly. +5. `TestHuggingFaceRateLimitTokenMode` — authenticated rate is strictly faster than unauthenticated rate. + +Plus httptest server shared by auth + endpoint tests (`hfTestServer`). + +## Deviations from Plan + +None — plan executed exactly as written. 
One minor test refinement: tests pass `Limiters: nil` instead of constructing a real `LimiterRegistry`, because the production RateLimit of `rate.Every(3600ms)` with burst 1 would make four serialized waits exceed a reasonable test budget. The limiter code path is still exercised in production and the rate-mode contract is covered by `TestHuggingFaceRateLimitTokenMode`. + +## Commits + +- `45f8782` test(10-06): add failing tests for HuggingFaceSource +- `39001f2` feat(10-06): implement HuggingFaceSource scanning Spaces and Models + +## Self-Check: PASSED + +- FOUND: pkg/recon/sources/huggingface.go +- FOUND: pkg/recon/sources/huggingface_test.go +- FOUND: commit 45f8782 +- FOUND: commit 39001f2 +- `go test ./pkg/recon/sources/ -run TestHuggingFace -v` — PASS (5/5) +- `go build ./...` — PASS +- `go test ./pkg/recon/...` — PASS