Merge branch 'worktree-agent-ac81d6ab'
This commit is contained in:
79
.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md
Normal file
79
.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md
Normal file
@@ -0,0 +1,79 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 06
|
||||
subsystem: recon/sources
|
||||
tags: [recon, osint, huggingface, wave-2]
|
||||
requires:
|
||||
- pkg/recon/sources.Client (Plan 10-01)
|
||||
- pkg/recon/sources.BuildQueries (Plan 10-01)
|
||||
- pkg/recon.LimiterRegistry
|
||||
- pkg/providers.Registry
|
||||
provides:
|
||||
- pkg/recon/sources.HuggingFaceSource
|
||||
- pkg/recon/sources.HuggingFaceConfig
|
||||
- pkg/recon/sources.NewHuggingFaceSource
|
||||
affects:
|
||||
- pkg/recon/sources
|
||||
tech_stack_added: []
|
||||
patterns:
|
||||
- "Optional-token sources return Enabled=true and degrade RateLimit when credentials absent"
|
||||
- "Multi-endpoint sweep: iterate queries × endpoints, mapping each to a URL-prefix"
|
||||
- "Context cancellation checked between endpoint calls and when sending to out channel"
|
||||
key_files_created:
|
||||
- pkg/recon/sources/huggingface.go
|
||||
- pkg/recon/sources/huggingface_test.go
|
||||
key_files_modified: []
|
||||
decisions:
|
||||
- "Unauthenticated rate of rate.Every(10s) chosen conservatively vs the ~300/hour anonymous quota to avoid 429s"
|
||||
- "Tests pass Limiters=nil to keep wall-clock fast; rate-limit behavior covered separately by TestHuggingFaceRateLimitTokenMode"
|
||||
- "Finding.Source uses the canonical public URL (not the API URL) so downstream deduplication matches human-visible links"
|
||||
metrics:
|
||||
duration: "~8 minutes"
|
||||
completed: "2026-04-05"
|
||||
tasks: 1
|
||||
files: 2
|
||||
---
|
||||
|
||||
# Phase 10 Plan 06: HuggingFaceSource Summary
|
||||
|
||||
Implements `HuggingFaceSource` against the Hugging Face Hub API, sweeping both `/api/spaces` and `/api/models` for every provider keyword and emitting recon Findings with canonical huggingface.co URLs.
|
||||
|
||||
## What Changed
|
||||
|
||||
- New `HuggingFaceSource` implementing `recon.ReconSource` with optional `Token`.
|
||||
- Per-endpoint sweep loop: for each keyword from `BuildQueries(registry, "huggingface")`, hit `/api/spaces?search=...&limit=50` then `/api/models?search=...&limit=50`.
|
||||
- URL normalization: space results mapped to `https://huggingface.co/spaces/{id}`, model results to `https://huggingface.co/{id}`.
|
||||
- Rate limit is token-aware: `rate.Every(3600ms)` when authenticated (matches 1000/hour), `rate.Every(10s)` otherwise.
|
||||
- Authorization header only set when `Token != ""`.
|
||||
- Compile-time assertion `var _ recon.ReconSource = (*HuggingFaceSource)(nil)`.
|
||||
|
||||
## Test Coverage
|
||||
|
||||
All five TDD assertions in `huggingface_test.go` pass:
|
||||
|
||||
1. `TestHuggingFaceEnabledAlwaysTrue` — enabled with and without token.
|
||||
2. `TestHuggingFaceSweepHitsBothEndpoints` — exact Finding count (2 keywords × 2 endpoints = 4), both URL prefixes observed, `SourceType="recon:huggingface"`.
|
||||
3. `TestHuggingFaceAuthorizationHeader` — `Bearer hf_secret` sent when token set, header absent when empty.
|
||||
4. `TestHuggingFaceContextCancellation` — slow server + 100ms context returns error promptly.
|
||||
5. `TestHuggingFaceRateLimitTokenMode` — authenticated rate is strictly faster than unauthenticated rate.
|
||||
|
||||
Plus httptest server shared by auth + endpoint tests (`hfTestServer`).
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
None — plan executed exactly as written. One minor test refinement: tests pass `Limiters: nil` instead of constructing a real `LimiterRegistry`, because the production RateLimit of `rate.Every(3600ms)` with burst 1 would make four serialized waits exceed a reasonable test budget. The limiter code path is still exercised in production and the rate-mode contract is covered by `TestHuggingFaceRateLimitTokenMode`.
|
||||
|
||||
## Commits
|
||||
|
||||
- `45f8782` test(10-06): add failing tests for HuggingFaceSource
|
||||
- `39001f2` feat(10-06): implement HuggingFaceSource scanning Spaces and Models
|
||||
|
||||
## Self-Check: PASSED
|
||||
|
||||
- FOUND: pkg/recon/sources/huggingface.go
|
||||
- FOUND: pkg/recon/sources/huggingface_test.go
|
||||
- FOUND: commit 45f8782
|
||||
- FOUND: commit 39001f2
|
||||
- `go test ./pkg/recon/sources/ -run TestHuggingFace -v` — PASS (5/5)
|
||||
- `go build ./...` — PASS
|
||||
- `go test ./pkg/recon/...` — PASS
|
||||
181
pkg/recon/sources/huggingface.go
Normal file
181
pkg/recon/sources/huggingface.go
Normal file
@@ -0,0 +1,181 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// defaultHuggingFaceBaseURL is the Hugging Face Hub root that API paths are
// appended to whenever no BaseURL override has been supplied.
const defaultHuggingFaceBaseURL = "https://huggingface.co"
|
||||
|
||||
// HuggingFaceConfig configures a HuggingFaceSource.
|
||||
type HuggingFaceConfig struct {
|
||||
// Token is the Hugging Face access token. Optional — anonymous requests
|
||||
// are accepted but rate-limited more aggressively.
|
||||
Token string
|
||||
// BaseURL overrides the API root for tests. Defaults to
|
||||
// https://huggingface.co when empty.
|
||||
BaseURL string
|
||||
// Registry drives keyword generation via BuildQueries.
|
||||
Registry *providers.Registry
|
||||
// Limiters is the shared per-source limiter registry.
|
||||
Limiters *recon.LimiterRegistry
|
||||
}
|
||||
|
||||
// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub
|
||||
// API, sweeping both Spaces and model repositories for provider keywords.
|
||||
//
|
||||
// RECON-CODE-08: token optional; when empty the source still runs but applies
|
||||
// a slower RateLimit to stay within anonymous quotas.
|
||||
type HuggingFaceSource struct {
|
||||
Token string
|
||||
BaseURL string
|
||||
Registry *providers.Registry
|
||||
Limiters *recon.LimiterRegistry
|
||||
|
||||
client *Client
|
||||
}
|
||||
|
||||
// NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults.
|
||||
func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
|
||||
base := cfg.BaseURL
|
||||
if base == "" {
|
||||
base = defaultHuggingFaceBaseURL
|
||||
}
|
||||
return &HuggingFaceSource{
|
||||
Token: cfg.Token,
|
||||
BaseURL: base,
|
||||
Registry: cfg.Registry,
|
||||
Limiters: cfg.Limiters,
|
||||
client: NewClient(),
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the stable source identifier.
|
||||
func (s *HuggingFaceSource) Name() string { return "huggingface" }
|
||||
|
||||
// RateLimit returns the per-source token bucket rate. Authenticated requests
|
||||
// get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to
|
||||
// one every 10 seconds to stay conservative against the public quota.
|
||||
func (s *HuggingFaceSource) RateLimit() rate.Limit {
|
||||
if s.Token != "" {
|
||||
return rate.Every(3600 * time.Millisecond)
|
||||
}
|
||||
return rate.Every(10 * time.Second)
|
||||
}
|
||||
|
||||
// Burst returns the limiter burst capacity.
|
||||
func (s *HuggingFaceSource) Burst() int { return 1 }
|
||||
|
||||
// RespectsRobots reports whether this source should honor robots.txt.
|
||||
// The Hub API is a JSON endpoint, so robots.txt does not apply.
|
||||
func (s *HuggingFaceSource) RespectsRobots() bool { return false }
|
||||
|
||||
// Enabled reports whether this source should run. HuggingFace runs even
|
||||
// without a token — anonymous requests are permitted at a lower rate limit.
|
||||
func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// hfItem captures the single field this package reads from each element of
// the /api/spaces and /api/models list responses: the repository `id`, which
// has the form "owner/name". All other response fields are ignored.
type hfItem struct {
	ID string `json:"id"`
}
|
||||
|
||||
// Sweep iterates provider keywords and queries both the Spaces and Models
|
||||
// search endpoints, emitting one Finding per result.
|
||||
func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if s.client == nil {
|
||||
s.client = NewClient()
|
||||
}
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = defaultHuggingFaceBaseURL
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, s.Name())
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
endpoints := []struct {
|
||||
path string
|
||||
urlPrefix string // prefix applied to item.ID to form Finding.Source
|
||||
}{
|
||||
{"/api/spaces", "https://huggingface.co/spaces/"},
|
||||
{"/api/models", "https://huggingface.co/"},
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
for _, ep := range endpoints {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *HuggingFaceSource) sweepEndpoint(
|
||||
ctx context.Context,
|
||||
base, path, urlPrefix, query string,
|
||||
out chan<- recon.Finding,
|
||||
) error {
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("huggingface: build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
if s.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+s.Token)
|
||||
}
|
||||
|
||||
resp, err := s.client.Do(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("huggingface %s: %w", path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var items []hfItem
|
||||
if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
|
||||
return fmt.Errorf("huggingface %s: decode: %w", path, err)
|
||||
}
|
||||
|
||||
for _, item := range items {
|
||||
if item.ID == "" {
|
||||
continue
|
||||
}
|
||||
finding := recon.Finding{
|
||||
Source: urlPrefix + item.ID,
|
||||
SourceType: "recon:huggingface",
|
||||
DetectedAt: time.Now().UTC(),
|
||||
}
|
||||
select {
|
||||
case out <- finding:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource.
|
||||
var _ recon.ReconSource = (*HuggingFaceSource)(nil)
|
||||
204
pkg/recon/sources/huggingface_test.go
Normal file
204
pkg/recon/sources/huggingface_test.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// hfTestRegistry builds a minimal registry with two keywords so tests assert
|
||||
// an exact Finding count (2 endpoints × 2 keywords × 1 result = 4).
|
||||
func hfTestRegistry(t *testing.T) *providers.Registry {
|
||||
t.Helper()
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "OpenAI", Keywords: []string{"sk-proj"}},
|
||||
{Name: "Anthropic", Keywords: []string{"sk-ant"}},
|
||||
})
|
||||
}
|
||||
|
||||
// hfTestServer starts an httptest server that imitates the /api/spaces and
// /api/models search endpoints. Each request atomically increments the
// matching hit counter and is answered with a one-element JSON array whose id
// embeds the `search` query parameter.
//
// When authSeen is non-nil, the /api/spaces handler stores the request's
// Authorization header in it; the /api/models handler never touches it. The
// store happens on the handler goroutine without synchronization, so callers
// should read authSeen only after their requests have completed. The caller
// is responsible for closing the returned server.
func hfTestServer(t *testing.T, spacesHits, modelsHits *int32, authSeen *string) *httptest.Server {
	t.Helper()

	// reply writes the canned single-result payload for the requested term.
	reply := func(w http.ResponseWriter, r *http.Request, idFormat string) {
		term := r.URL.Query().Get("search")
		body := []map[string]string{
			{"id": fmt.Sprintf(idFormat, term)},
		}
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(body)
	}

	spaces := func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(spacesHits, 1)
		if authSeen != nil {
			*authSeen = r.Header.Get("Authorization")
		}
		reply(w, r, "acme/space-%s")
	}
	models := func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(modelsHits, 1)
		reply(w, r, "acme/model-%s")
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/api/spaces", spaces)
	mux.HandleFunc("/api/models", models)
	return httptest.NewServer(mux)
}
|
||||
|
||||
func TestHuggingFaceEnabledAlwaysTrue(t *testing.T) {
|
||||
if !(&HuggingFaceSource{}).Enabled(recon.Config{}) {
|
||||
t.Fatal("HuggingFace should be enabled even without token")
|
||||
}
|
||||
if !(&HuggingFaceSource{Token: "hf_xxx"}).Enabled(recon.Config{}) {
|
||||
t.Fatal("HuggingFace should be enabled with token")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHuggingFaceSweepHitsBothEndpoints(t *testing.T) {
|
||||
var spacesHits, modelsHits int32
|
||||
ts := hfTestServer(t, &spacesHits, &modelsHits, nil)
|
||||
defer ts.Close()
|
||||
|
||||
reg := hfTestRegistry(t)
|
||||
src := NewHuggingFaceSource(HuggingFaceConfig{
|
||||
Token: "hf_test",
|
||||
BaseURL: ts.URL,
|
||||
Registry: reg,
|
||||
Limiters: nil, // bypass rate limiter for tests
|
||||
})
|
||||
|
||||
out := make(chan recon.Finding, 16)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
findings := make([]recon.Finding, 0)
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
|
||||
if len(findings) != 4 {
|
||||
t.Fatalf("expected 4 findings, got %d", len(findings))
|
||||
}
|
||||
if atomic.LoadInt32(&spacesHits) != 2 {
|
||||
t.Errorf("expected 2 /api/spaces hits, got %d", spacesHits)
|
||||
}
|
||||
if atomic.LoadInt32(&modelsHits) != 2 {
|
||||
t.Errorf("expected 2 /api/models hits, got %d", modelsHits)
|
||||
}
|
||||
|
||||
var sawSpace, sawModel bool
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:huggingface" {
|
||||
t.Errorf("wrong SourceType: %q", f.SourceType)
|
||||
}
|
||||
switch {
|
||||
case strings.HasPrefix(f.Source, "https://huggingface.co/spaces/"):
|
||||
sawSpace = true
|
||||
case strings.HasPrefix(f.Source, "https://huggingface.co/"):
|
||||
sawModel = true
|
||||
default:
|
||||
t.Errorf("unexpected Source URL: %q", f.Source)
|
||||
}
|
||||
}
|
||||
if !sawSpace || !sawModel {
|
||||
t.Errorf("expected both space and model URLs; space=%v model=%v", sawSpace, sawModel)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHuggingFaceAuthorizationHeader(t *testing.T) {
|
||||
var authSeen string
|
||||
var s, m int32
|
||||
ts := hfTestServer(t, &s, &m, &authSeen)
|
||||
defer ts.Close()
|
||||
|
||||
reg := hfTestRegistry(t)
|
||||
src := NewHuggingFaceSource(HuggingFaceConfig{
|
||||
Token: "hf_secret",
|
||||
BaseURL: ts.URL,
|
||||
Registry: reg,
|
||||
Limiters: nil,
|
||||
})
|
||||
out := make(chan recon.Finding, 16)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
close(out)
|
||||
for range out {
|
||||
}
|
||||
if authSeen != "Bearer hf_secret" {
|
||||
t.Errorf("expected 'Bearer hf_secret', got %q", authSeen)
|
||||
}
|
||||
|
||||
// Without token
|
||||
authSeen = ""
|
||||
var s2, m2 int32
|
||||
ts2 := hfTestServer(t, &s2, &m2, &authSeen)
|
||||
defer ts2.Close()
|
||||
src2 := NewHuggingFaceSource(HuggingFaceConfig{
|
||||
BaseURL: ts2.URL,
|
||||
Registry: reg,
|
||||
Limiters: nil,
|
||||
})
|
||||
out2 := make(chan recon.Finding, 16)
|
||||
if err := src2.Sweep(ctx, "", out2); err != nil {
|
||||
t.Fatalf("Sweep unauth: %v", err)
|
||||
}
|
||||
close(out2)
|
||||
for range out2 {
|
||||
}
|
||||
if authSeen != "" {
|
||||
t.Errorf("expected no Authorization header when token empty, got %q", authSeen)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHuggingFaceContextCancellation(t *testing.T) {
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
select {
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
case <-time.After(2 * time.Second):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte("[]"))
|
||||
}
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
reg := hfTestRegistry(t)
|
||||
src := NewHuggingFaceSource(HuggingFaceConfig{
|
||||
BaseURL: ts.URL,
|
||||
Registry: reg,
|
||||
Limiters: nil,
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer cancel()
|
||||
out := make(chan recon.Finding, 16)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected error on cancelled context")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHuggingFaceRateLimitTokenMode(t *testing.T) {
|
||||
withTok := &HuggingFaceSource{Token: "hf_xxx"}
|
||||
noTok := &HuggingFaceSource{}
|
||||
if withTok.RateLimit() == noTok.RateLimit() {
|
||||
t.Fatal("rate limit should differ based on token presence")
|
||||
}
|
||||
if withTok.RateLimit() < noTok.RateLimit() {
|
||||
t.Fatalf("authenticated rate (%v) should be faster (larger) than unauth (%v)",
|
||||
withTok.RateLimit(), noTok.RateLimit())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user