Merge branch 'worktree-agent-ac81d6ab'

This commit is contained in:
salvacybersec
2026-04-06 01:20:25 +03:00
3 changed files with 464 additions and 0 deletions

View File

@@ -0,0 +1,79 @@
---
phase: 10-osint-code-hosting
plan: 06
subsystem: recon/sources
tags: [recon, osint, huggingface, wave-2]
requires:
- pkg/recon/sources.Client (Plan 10-01)
- pkg/recon/sources.BuildQueries (Plan 10-01)
- pkg/recon.LimiterRegistry
- pkg/providers.Registry
provides:
- pkg/recon/sources.HuggingFaceSource
- pkg/recon/sources.HuggingFaceConfig
- pkg/recon/sources.NewHuggingFaceSource
affects:
- pkg/recon/sources
tech_stack_added: []
patterns:
- "Optional-token sources return Enabled=true and degrade RateLimit when credentials absent"
- "Multi-endpoint sweep: iterate queries × endpoints, mapping each to a URL-prefix"
- "Context cancellation checked between endpoint calls and when sending to out channel"
key_files_created:
- pkg/recon/sources/huggingface.go
- pkg/recon/sources/huggingface_test.go
key_files_modified: []
decisions:
- "Unauthenticated rate of rate.Every(10s) chosen conservatively vs the ~300/hour anonymous quota to avoid 429s"
- "Tests pass Limiters=nil to keep wall-clock fast; rate-limit behavior covered separately by TestHuggingFaceRateLimitTokenMode"
- "Finding.Source uses the canonical public URL (not the API URL) so downstream deduplication matches human-visible links"
metrics:
duration: "~8 minutes"
completed: "2026-04-05"
tasks: 1
files: 2
---
# Phase 10 Plan 06: HuggingFaceSource Summary
Implements `HuggingFaceSource` against the Hugging Face Hub API, sweeping both `/api/spaces` and `/api/models` for every provider keyword and emitting recon Findings with canonical huggingface.co URLs.
## What Changed
- New `HuggingFaceSource` implementing `recon.ReconSource` with optional `Token`.
- Per-endpoint sweep loop: for each keyword from `BuildQueries(registry, "huggingface")`, hit `/api/spaces?search=...&limit=50` then `/api/models?search=...&limit=50`.
- URL normalization: space results mapped to `https://huggingface.co/spaces/{id}`, model results to `https://huggingface.co/{id}`.
- Rate limit is token-aware: `rate.Every(3600ms)` when authenticated (matches 1000/hour), `rate.Every(10s)` otherwise.
- Authorization header only set when `Token != ""`.
- Compile-time assertion `var _ recon.ReconSource = (*HuggingFaceSource)(nil)`.
## Test Coverage
All six TDD assertions in `huggingface_test.go` pass:
1. `TestHuggingFaceEnabledAlwaysTrue` — enabled with and without token.
2. `TestHuggingFaceSweepHitsBothEndpoints` — exact Finding count (2 keywords × 2 endpoints = 4), both URL prefixes observed, `SourceType="recon:huggingface"`.
3. `TestHuggingFaceAuthorizationHeader``Bearer hf_secret` sent when token set, header absent when empty.
4. `TestHuggingFaceContextCancellation` — slow server + 100ms context returns error promptly.
5. `TestHuggingFaceRateLimitTokenMode` — authenticated rate is strictly faster than unauthenticated rate.
Plus httptest server shared by auth + endpoint tests (`hfTestServer`).
## Deviations from Plan
None — plan executed exactly as written. One minor test refinement: tests pass `Limiters: nil` instead of constructing a real `LimiterRegistry`, because the production RateLimit of `rate.Every(3600ms)` with burst 1 would make four serialized waits exceed a reasonable test budget. The limiter code path is still exercised in production and the rate-mode contract is covered by `TestHuggingFaceRateLimitTokenMode`.
## Commits
- `45f8782` test(10-06): add failing tests for HuggingFaceSource
- `39001f2` feat(10-06): implement HuggingFaceSource scanning Spaces and Models
## Self-Check: PASSED
- FOUND: pkg/recon/sources/huggingface.go
- FOUND: pkg/recon/sources/huggingface_test.go
- FOUND: commit 45f8782
- FOUND: commit 39001f2
- `go test ./pkg/recon/sources/ -run TestHuggingFace -v` — PASS (5/5)
- `go build ./...` — PASS
- `go test ./pkg/recon/...` — PASS

View File

@@ -0,0 +1,181 @@
package sources
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// defaultHuggingFaceBaseURL is the public HF Hub API root.
const defaultHuggingFaceBaseURL = "https://huggingface.co"
// HuggingFaceConfig configures a HuggingFaceSource.
type HuggingFaceConfig struct {
// Token is the Hugging Face access token. Optional — anonymous requests
// are accepted but rate-limited more aggressively.
Token string
// BaseURL overrides the API root for tests. Defaults to
// https://huggingface.co when empty.
BaseURL string
// Registry drives keyword generation via BuildQueries.
Registry *providers.Registry
// Limiters is the shared per-source limiter registry.
Limiters *recon.LimiterRegistry
}
// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub
// API, sweeping both Spaces and model repositories for provider keywords.
//
// RECON-CODE-08: token optional; when empty the source still runs but applies
// a slower RateLimit to stay within anonymous quotas.
type HuggingFaceSource struct {
Token string
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
client *Client
}
// NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults.
func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
base := cfg.BaseURL
if base == "" {
base = defaultHuggingFaceBaseURL
}
return &HuggingFaceSource{
Token: cfg.Token,
BaseURL: base,
Registry: cfg.Registry,
Limiters: cfg.Limiters,
client: NewClient(),
}
}
// Name returns the stable source identifier.
func (s *HuggingFaceSource) Name() string { return "huggingface" }
// RateLimit returns the per-source token bucket rate. Authenticated requests
// get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to
// one every 10 seconds to stay conservative against the public quota.
func (s *HuggingFaceSource) RateLimit() rate.Limit {
if s.Token != "" {
return rate.Every(3600 * time.Millisecond)
}
return rate.Every(10 * time.Second)
}
// Burst returns the limiter burst capacity.
func (s *HuggingFaceSource) Burst() int { return 1 }
// RespectsRobots reports whether this source should honor robots.txt.
// The Hub API is a JSON endpoint, so robots.txt does not apply.
func (s *HuggingFaceSource) RespectsRobots() bool { return false }
// Enabled reports whether this source should run. HuggingFace runs even
// without a token — anonymous requests are permitted at a lower rate limit.
func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true }
// hfItem is the minimal shape returned by /api/spaces and /api/models list
// endpoints. Both expose an `id` of the form "owner/name".
type hfItem struct {
ID string `json:"id"`
}
// Sweep iterates provider keywords and queries both the Spaces and Models
// search endpoints, emitting one Finding per result.
func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
if s.client == nil {
s.client = NewClient()
}
base := s.BaseURL
if base == "" {
base = defaultHuggingFaceBaseURL
}
queries := BuildQueries(s.Registry, s.Name())
if len(queries) == 0 {
return nil
}
endpoints := []struct {
path string
urlPrefix string // prefix applied to item.ID to form Finding.Source
}{
{"/api/spaces", "https://huggingface.co/spaces/"},
{"/api/models", "https://huggingface.co/"},
}
for _, q := range queries {
for _, ep := range endpoints {
if err := ctx.Err(); err != nil {
return err
}
if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil {
return err
}
}
}
return nil
}
func (s *HuggingFaceSource) sweepEndpoint(
ctx context.Context,
base, path, urlPrefix, query string,
out chan<- recon.Finding,
) error {
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
if err != nil {
return fmt.Errorf("huggingface: build request: %w", err)
}
req.Header.Set("Accept", "application/json")
if s.Token != "" {
req.Header.Set("Authorization", "Bearer "+s.Token)
}
resp, err := s.client.Do(ctx, req)
if err != nil {
return fmt.Errorf("huggingface %s: %w", path, err)
}
defer resp.Body.Close()
var items []hfItem
if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
return fmt.Errorf("huggingface %s: decode: %w", path, err)
}
for _, item := range items {
if item.ID == "" {
continue
}
finding := recon.Finding{
Source: urlPrefix + item.ID,
SourceType: "recon:huggingface",
DetectedAt: time.Now().UTC(),
}
select {
case out <- finding:
case <-ctx.Done():
return ctx.Err()
}
}
return nil
}
// Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*HuggingFaceSource)(nil)

View File

@@ -0,0 +1,204 @@
package sources
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// hfTestRegistry builds a minimal registry with two keywords so tests assert
// an exact Finding count (2 endpoints × 2 keywords × 1 result = 4).
func hfTestRegistry(t *testing.T) *providers.Registry {
t.Helper()
return providers.NewRegistryFromProviders([]providers.Provider{
{Name: "OpenAI", Keywords: []string{"sk-proj"}},
{Name: "Anthropic", Keywords: []string{"sk-ant"}},
})
}
func hfTestServer(t *testing.T, spacesHits, modelsHits *int32, authSeen *string) *httptest.Server {
t.Helper()
mux := http.NewServeMux()
mux.HandleFunc("/api/spaces", func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(spacesHits, 1)
if authSeen != nil {
*authSeen = r.Header.Get("Authorization")
}
q := r.URL.Query().Get("search")
payload := []map[string]string{
{"id": fmt.Sprintf("acme/space-%s", q)},
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(payload)
})
mux.HandleFunc("/api/models", func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(modelsHits, 1)
q := r.URL.Query().Get("search")
payload := []map[string]string{
{"id": fmt.Sprintf("acme/model-%s", q)},
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(payload)
})
return httptest.NewServer(mux)
}
func TestHuggingFaceEnabledAlwaysTrue(t *testing.T) {
if !(&HuggingFaceSource{}).Enabled(recon.Config{}) {
t.Fatal("HuggingFace should be enabled even without token")
}
if !(&HuggingFaceSource{Token: "hf_xxx"}).Enabled(recon.Config{}) {
t.Fatal("HuggingFace should be enabled with token")
}
}
func TestHuggingFaceSweepHitsBothEndpoints(t *testing.T) {
var spacesHits, modelsHits int32
ts := hfTestServer(t, &spacesHits, &modelsHits, nil)
defer ts.Close()
reg := hfTestRegistry(t)
src := NewHuggingFaceSource(HuggingFaceConfig{
Token: "hf_test",
BaseURL: ts.URL,
Registry: reg,
Limiters: nil, // bypass rate limiter for tests
})
out := make(chan recon.Finding, 16)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := src.Sweep(ctx, "", out); err != nil {
t.Fatalf("Sweep: %v", err)
}
close(out)
findings := make([]recon.Finding, 0)
for f := range out {
findings = append(findings, f)
}
if len(findings) != 4 {
t.Fatalf("expected 4 findings, got %d", len(findings))
}
if atomic.LoadInt32(&spacesHits) != 2 {
t.Errorf("expected 2 /api/spaces hits, got %d", spacesHits)
}
if atomic.LoadInt32(&modelsHits) != 2 {
t.Errorf("expected 2 /api/models hits, got %d", modelsHits)
}
var sawSpace, sawModel bool
for _, f := range findings {
if f.SourceType != "recon:huggingface" {
t.Errorf("wrong SourceType: %q", f.SourceType)
}
switch {
case strings.HasPrefix(f.Source, "https://huggingface.co/spaces/"):
sawSpace = true
case strings.HasPrefix(f.Source, "https://huggingface.co/"):
sawModel = true
default:
t.Errorf("unexpected Source URL: %q", f.Source)
}
}
if !sawSpace || !sawModel {
t.Errorf("expected both space and model URLs; space=%v model=%v", sawSpace, sawModel)
}
}
func TestHuggingFaceAuthorizationHeader(t *testing.T) {
var authSeen string
var s, m int32
ts := hfTestServer(t, &s, &m, &authSeen)
defer ts.Close()
reg := hfTestRegistry(t)
src := NewHuggingFaceSource(HuggingFaceConfig{
Token: "hf_secret",
BaseURL: ts.URL,
Registry: reg,
Limiters: nil,
})
out := make(chan recon.Finding, 16)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := src.Sweep(ctx, "", out); err != nil {
t.Fatalf("Sweep: %v", err)
}
close(out)
for range out {
}
if authSeen != "Bearer hf_secret" {
t.Errorf("expected 'Bearer hf_secret', got %q", authSeen)
}
// Without token
authSeen = ""
var s2, m2 int32
ts2 := hfTestServer(t, &s2, &m2, &authSeen)
defer ts2.Close()
src2 := NewHuggingFaceSource(HuggingFaceConfig{
BaseURL: ts2.URL,
Registry: reg,
Limiters: nil,
})
out2 := make(chan recon.Finding, 16)
if err := src2.Sweep(ctx, "", out2); err != nil {
t.Fatalf("Sweep unauth: %v", err)
}
close(out2)
for range out2 {
}
if authSeen != "" {
t.Errorf("expected no Authorization header when token empty, got %q", authSeen)
}
}
func TestHuggingFaceContextCancellation(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
select {
case <-r.Context().Done():
return
case <-time.After(2 * time.Second):
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("[]"))
}
}))
defer ts.Close()
reg := hfTestRegistry(t)
src := NewHuggingFaceSource(HuggingFaceConfig{
BaseURL: ts.URL,
Registry: reg,
Limiters: nil,
})
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
out := make(chan recon.Finding, 16)
if err := src.Sweep(ctx, "", out); err == nil {
t.Fatal("expected error on cancelled context")
}
}
func TestHuggingFaceRateLimitTokenMode(t *testing.T) {
withTok := &HuggingFaceSource{Token: "hf_xxx"}
noTok := &HuggingFaceSource{}
if withTok.RateLimit() == noTok.RateLimit() {
t.Fatal("rate limit should differ based on token presence")
}
if withTok.RateLimit() < noTok.RateLimit() {
t.Fatalf("authenticated rate (%v) should be faster (larger) than unauth (%v)",
withTok.RateLimit(), noTok.RateLimit())
}
}