diff --git a/pkg/recon/sources/gistpaste.go b/pkg/recon/sources/gistpaste.go
new file mode 100644
index 0000000..1d6e130
--- /dev/null
+++ b/pkg/recon/sources/gistpaste.go
@@ -0,0 +1,152 @@
+package sources
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// GistPasteSource scrapes gist.github.com's public search (no auth required)
+// for API key leaks. This is distinct from Phase 10's GistSource which uses
+// the authenticated GitHub API.
+//
+// Auth: none. Rate: Every(3s), Burst 1.
+type GistPasteSource struct {
+	BaseURL  string                 // override for tests; defaults to https://gist.github.com
+	Registry *providers.Registry    // provider keyword registry driving queries and matching
+	Limiters *recon.LimiterRegistry // shared per-source rate limiters (nil = unlimited)
+	Client   *Client                // HTTP client wrapper (nil = NewClient())
+}
+
+// gistPasteLinkRE matches gist paths of the form /{user}/{hex-id}
+// (exactly two segments, second segment lowercase hex).
+var gistPasteLinkRE = regexp.MustCompile(`^/[^/]+/[a-f0-9]+$`)
+
+// Compile-time assertion.
+var _ recon.ReconSource = (*GistPasteSource)(nil)
+
+func (s *GistPasteSource) Name() string          { return "gistpaste" }
+func (s *GistPasteSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
+func (s *GistPasteSource) Burst() int            { return 1 }
+func (s *GistPasteSource) RespectsRobots() bool  { return true }
+
+// Enabled always returns true: gist search scraping requires no credentials.
+func (s *GistPasteSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep searches gist.github.com for each provider keyword, fetches raw gist
+// content, and emits Findings for keyword matches.
+func (s *GistPasteSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://gist.github.com"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "gistpaste")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	keywords := gistPasteKeywordSet(s.Registry)
+	if len(keywords) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			return fmt.Errorf("gistpaste: build search req: %w", err)
+		}
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			return fmt.Errorf("gistpaste: search fetch: %w", err)
+		}
+		// Reject non-200 search pages: parsing an error/captcha page would
+		// silently yield zero links and hide the failure.
+		if resp.StatusCode != http.StatusOK {
+			_ = resp.Body.Close()
+			return fmt.Errorf("gistpaste: search status %d", resp.StatusCode)
+		}
+		links, err := extractAnchorHrefs(resp.Body, gistPasteLinkRE)
+		_ = resp.Body.Close()
+		if err != nil {
+			return fmt.Errorf("gistpaste: parse search html: %w", err)
+		}
+
+		for _, gistPath := range links {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			if s.Limiters != nil {
+				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+					return err
+				}
+			}
+
+			rawURL := fmt.Sprintf("%s%s/raw", base, gistPath)
+			rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
+			if err != nil {
+				return fmt.Errorf("gistpaste: build raw req: %w", err)
+			}
+			rawResp, err := client.Do(ctx, rawReq)
+			if err != nil {
+				continue // skip this gist on error
+			}
+			// Skip gists whose raw view is unavailable (deleted, rate-limited).
+			if rawResp.StatusCode != http.StatusOK {
+				_ = rawResp.Body.Close()
+				continue
+			}
+			// Cap raw bodies at 256 KiB: enough for leaked keys, bounded memory.
+			body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
+			_ = rawResp.Body.Close()
+			if readErr != nil {
+				continue
+			}
+
+			content := string(body)
+			for kw, provName := range keywords {
+				if strings.Contains(content, kw) {
+					// ctx-aware send: don't block forever if the consumer stops.
+					select {
+					case out <- recon.Finding{
+						ProviderName: provName,
+						Source:       fmt.Sprintf("%s%s", base, gistPath),
+						SourceType:   "recon:gistpaste",
+						Confidence:   "low",
+						DetectedAt:   time.Now(),
+					}:
+					case <-ctx.Done():
+						return ctx.Err()
+					}
+					break // one finding per gist
+				}
+			}
+		}
+	}
+	return nil
+}
+
+// gistPasteKeywordSet builds keyword->providerName map from registry.
+// First provider to claim a keyword wins; empty keywords are skipped.
+func gistPasteKeywordSet(reg *providers.Registry) map[string]string {
+	out := make(map[string]string)
+	if reg == nil {
+		return out
+	}
+	for _, p := range reg.List() {
+		for _, k := range p.Keywords {
+			if k == "" {
+				continue
+			}
+			if _, ok := out[k]; !ok {
+				out[k] = p.Name
+			}
+		}
+	}
+	return out
+}
diff --git a/pkg/recon/sources/gistpaste_test.go b/pkg/recon/sources/gistpaste_test.go
new file mode 100644
index 0000000..491281b
--- /dev/null
+++ b/pkg/recon/sources/gistpaste_test.go
@@ -0,0 +1,119 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func gistPasteTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "anthropic", Keywords: []string{"sk-ant-"}},
+	})
+}
+
+// NOTE(review): the fixture anchors below were reconstructed — the original
+// tags were lost in transit. Hrefs are inferred from the handler paths and
+// gistPasteLinkRE; confirm against the original fixture if available.
+const gistPasteSearchHTML = `<html>
+<body>
+<a href="/alice/abc123def456">gist one</a>
+<a href="/bob/789aaa000bbb">gist two</a>
+<a href="/about">nope</a>
+<a href="/alice/NOTHEX">nope</a>
+</body>
+</html>`
+
+const gistPasteRaw1 = `config with sk-ant-XYZKEY123 inside`
+const gistPasteRaw2 = `nothing here`
+
+func TestGistPaste_Sweep_ExtractsFindings(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/search":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(gistPasteSearchHTML))
+		case r.URL.Path == "/alice/abc123def456/raw":
+			_, _ = w.Write([]byte(gistPasteRaw1))
+		case r.URL.Path == "/bob/789aaa000bbb/raw":
+			_, _ = w.Write([]byte(gistPasteRaw2))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+
+	src := &GistPasteSource{
+		BaseURL:  srv.URL,
+		Registry: gistPasteTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) != 1 {
+		t.Fatalf("expected 1 finding, got %d", len(findings))
+	}
+	f := findings[0]
+	if f.SourceType != "recon:gistpaste" {
+		t.Errorf("SourceType=%s, want recon:gistpaste", f.SourceType)
+	}
+	if f.ProviderName != "anthropic" {
+		t.Errorf("ProviderName=%s, want anthropic", f.ProviderName)
+	}
+	wantSource := srv.URL + "/alice/abc123def456"
+	if f.Source != wantSource {
+		t.Errorf("Source=%s, want %s", f.Source, wantSource)
+	}
+}
+
+func TestGistPaste_NameAndRate(t *testing.T) {
+	s := &GistPasteSource{}
+	if s.Name() != "gistpaste" {
+		t.Errorf("Name=%s", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("Burst=%d", s.Burst())
+	}
+	if !s.RespectsRobots() {
+		t.Error("expected RespectsRobots=true")
+	}
+	if !s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=true")
+	}
+}
+
+func TestGistPaste_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond)
+		_, _ = w.Write([]byte(gistPasteSearchHTML))
+	}))
+	defer srv.Close()
+
+	src := &GistPasteSource{
+		BaseURL:  srv.URL,
+		Registry: gistPasteTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}
diff --git a/pkg/recon/sources/pastebin.go b/pkg/recon/sources/pastebin.go
new file mode 100644
index 0000000..057e4d0
--- /dev/null
+++ b/pkg/recon/sources/pastebin.go
@@ -0,0 +1,156 @@
+package sources
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// PastebinSource scrapes pastebin.com search results for API key leaks.
+//
+// Two-phase approach per keyword:
+//   - Phase A: search pastebin for keyword, extract paste IDs from result links
+//   - Phase B: fetch raw paste content, keyword-match against provider registry
+//
+// Auth: none (credential-free). Rate: Every(3s), Burst 1 (conservative scraping).
+type PastebinSource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+// pastebinIDRE matches Pastebin paste links: /XXXXXXXX (8 alphanumeric chars).
+var pastebinIDRE = regexp.MustCompile(`^/[A-Za-z0-9]{8}$`)
+
+// Compile-time assertion.
+var _ recon.ReconSource = (*PastebinSource)(nil)
+
+func (s *PastebinSource) Name() string          { return "pastebin" }
+func (s *PastebinSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
+func (s *PastebinSource) Burst() int            { return 1 }
+func (s *PastebinSource) RespectsRobots() bool  { return true }
+
+// Enabled always returns true: Pastebin scraping requires no credentials.
+func (s *PastebinSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep searches Pastebin for each provider keyword and scans raw paste content.
+func (s *PastebinSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://pastebin.com"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "pastebin")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	keywords := pastebinKeywordSet(s.Registry)
+	if len(keywords) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		// Phase A: search for paste links.
+		searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			return fmt.Errorf("pastebin: build search req: %w", err)
+		}
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			return fmt.Errorf("pastebin: search fetch: %w", err)
+		}
+		// Reject non-200 search pages: parsing an error/captcha page would
+		// silently yield zero links and hide the failure.
+		if resp.StatusCode != http.StatusOK {
+			_ = resp.Body.Close()
+			return fmt.Errorf("pastebin: search status %d", resp.StatusCode)
+		}
+		ids, err := extractAnchorHrefs(resp.Body, pastebinIDRE)
+		_ = resp.Body.Close()
+		if err != nil {
+			return fmt.Errorf("pastebin: parse search html: %w", err)
+		}
+
+		// Phase B: fetch raw content and keyword-match.
+		for _, idPath := range ids {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			if s.Limiters != nil {
+				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+					return err
+				}
+			}
+
+			rawURL := fmt.Sprintf("%s/raw%s", base, idPath)
+			rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
+			if err != nil {
+				return fmt.Errorf("pastebin: build raw req: %w", err)
+			}
+			rawResp, err := client.Do(ctx, rawReq)
+			if err != nil {
+				// Skip this paste on fetch error, continue to next.
+				continue
+			}
+			// Skip pastes whose raw view is unavailable (removed, private).
+			if rawResp.StatusCode != http.StatusOK {
+				_ = rawResp.Body.Close()
+				continue
+			}
+			// Cap raw bodies at 256 KiB: enough for leaked keys, bounded memory.
+			body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
+			_ = rawResp.Body.Close()
+			if readErr != nil {
+				continue
+			}
+
+			content := string(body)
+			for kw, provName := range keywords {
+				if strings.Contains(content, kw) {
+					// ctx-aware send: don't block forever if the consumer stops.
+					select {
+					case out <- recon.Finding{
+						ProviderName: provName,
+						Source:       fmt.Sprintf("%s%s", base, idPath),
+						SourceType:   "recon:pastebin",
+						Confidence:   "low",
+						DetectedAt:   time.Now(),
+					}:
+					case <-ctx.Done():
+						return ctx.Err()
+					}
+					break // one finding per paste
+				}
+			}
+		}
+	}
+	return nil
+}
+
+// pastebinKeywordSet builds keyword->providerName map from registry.
+// First provider to claim a keyword wins; empty keywords are skipped.
+func pastebinKeywordSet(reg *providers.Registry) map[string]string {
+	out := make(map[string]string)
+	if reg == nil {
+		return out
+	}
+	for _, p := range reg.List() {
+		for _, k := range p.Keywords {
+			if k == "" {
+				continue
+			}
+			if _, ok := out[k]; !ok {
+				out[k] = p.Name
+			}
+		}
+	}
+	return out
+}
diff --git a/pkg/recon/sources/pastebin_test.go b/pkg/recon/sources/pastebin_test.go
new file mode 100644
index 0000000..2192569
--- /dev/null
+++ b/pkg/recon/sources/pastebin_test.go
@@ -0,0 +1,120 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func pastebinTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+// NOTE(review): the fixture anchors below were reconstructed — the original
+// tags were lost in transit. Hrefs are inferred from the handler paths and
+// pastebinIDRE; confirm against the original fixture if available.
+const pastebinSearchHTML = `<html>
+<body>
+<a href="/Ab12Cd34">paste one</a>
+<a href="/Ef56Gh78">paste two</a>
+<a href="/archive">nope</a>
+<a href="/Abc123456">nine chars nope</a>
+</body>
+</html>`
+
+const pastebinRawContent1 = `some text with sk-proj-AAAA1234 leaked here`
+const pastebinRawContent2 = `nothing interesting in this paste`
+
+func TestPastebin_Sweep_ExtractsFindings(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/search":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(pastebinSearchHTML))
+		case r.URL.Path == "/raw/Ab12Cd34":
+			_, _ = w.Write([]byte(pastebinRawContent1))
+		case r.URL.Path == "/raw/Ef56Gh78":
+			_, _ = w.Write([]byte(pastebinRawContent2))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+
+	src := &PastebinSource{
+		BaseURL:  srv.URL,
+		Registry: pastebinTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	// Only paste one has "sk-proj-", paste two doesn't match.
+	if len(findings) != 1 {
+		t.Fatalf("expected 1 finding, got %d", len(findings))
+	}
+	f := findings[0]
+	if f.SourceType != "recon:pastebin" {
+		t.Errorf("SourceType=%s, want recon:pastebin", f.SourceType)
+	}
+	if f.ProviderName != "openai" {
+		t.Errorf("ProviderName=%s, want openai", f.ProviderName)
+	}
+	wantSource := srv.URL + "/Ab12Cd34"
+	if f.Source != wantSource {
+		t.Errorf("Source=%s, want %s", f.Source, wantSource)
+	}
+}
+
+func TestPastebin_NameAndRate(t *testing.T) {
+	s := &PastebinSource{}
+	if s.Name() != "pastebin" {
+		t.Errorf("Name=%s", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("Burst=%d", s.Burst())
+	}
+	if !s.RespectsRobots() {
+		t.Error("expected RespectsRobots=true")
+	}
+	if !s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=true")
+	}
+}
+
+func TestPastebin_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond)
+		_, _ = w.Write([]byte(pastebinSearchHTML))
+	}))
+	defer srv.Close()
+
+	src := &PastebinSource{
+		BaseURL:  srv.URL,
+		Registry: pastebinTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}