diff --git a/.planning/phases/11-osint_search_paste/11-02-SUMMARY.md b/.planning/phases/11-osint_search_paste/11-02-SUMMARY.md new file mode 100644 index 0000000..8a1088d --- /dev/null +++ b/.planning/phases/11-osint_search_paste/11-02-SUMMARY.md @@ -0,0 +1,91 @@ +--- +phase: 11-osint-search-paste +plan: 02 +subsystem: recon +tags: [pastebin, gist, paste-sites, scraping, osint] + +requires: + - phase: 10-osint-code-hosting + provides: ReconSource interface, shared HTTP client, extractAnchorHrefs helper, BuildQueries + +provides: + - PastebinSource for pastebin.com search+raw scanning + - GistPasteSource for gist.github.com unauthenticated search scraping + - PasteSitesSource multi-platform aggregator (dpaste, paste.ee, rentry, hastebin) + +affects: [11-03, recon-registration, recon-engine] + +tech-stack: + added: [] + patterns: [two-phase search+raw-fetch for paste sources, multi-platform aggregator reuse from sandboxes] + +key-files: + created: + - pkg/recon/sources/pastebin.go + - pkg/recon/sources/pastebin_test.go + - pkg/recon/sources/gistpaste.go + - pkg/recon/sources/gistpaste_test.go + - pkg/recon/sources/pastesites.go + - pkg/recon/sources/pastesites_test.go + modified: [] + +key-decisions: + - "Two-phase approach for all paste sources: search HTML for links, then fetch raw content and keyword-match" + - "PasteSitesSource reuses SandboxesSource multi-platform pattern with pastePlatform struct" + - "GistPasteSource named 'gistpaste' to avoid collision with Phase 10 GistSource ('gist')" + +patterns-established: + - "Paste source pattern: search page -> extract links -> fetch raw -> keyword match -> emit finding" + +requirements-completed: [RECON-PASTE-01] + +duration: 5min +completed: 2026-04-06 +--- + +# Phase 11 Plan 02: Paste Site Sources Summary + +**Three paste site ReconSources implementing two-phase search+raw-fetch with keyword matching against provider registry** + +## What Was Built + +### PastebinSource (`pkg/recon/sources/pastebin.go`) +- Searches 
pastebin.com for provider keywords, extracts 8-char paste IDs from HTML +- Fetches `/raw/{pasteID}` content (256KB cap), matches against provider keyword set +- Emits findings with SourceType="recon:pastebin" and ProviderName from matched keyword +- Rate: Every(3s), Burst 1, credential-free, respects robots.txt + +### GistPasteSource (`pkg/recon/sources/gistpaste.go`) +- Scrapes gist.github.com public search (no auth needed, distinct from Phase 10 API-based GistSource) +- Extracts gist links matching `//` pattern, fetches `{gistPath}/raw` +- Keyword-matches raw content, emits findings with SourceType="recon:gistpaste" +- Rate: Every(3s), Burst 1, credential-free + +### PasteSitesSource (`pkg/recon/sources/pastesites.go`) +- Multi-platform aggregator following SandboxesSource pattern +- Covers 4 paste sub-platforms: dpaste.org, paste.ee, rentry.co, hastebin.com +- Each platform has configurable SearchPath, ResultLinkRegex, and RawPathTemplate +- Per-platform error isolation: failures logged and skipped without aborting others +- Findings tagged with `platform=` in KeyMasked field + +## Test Coverage + +9 tests total across 3 test files: +- Sweep with httptest fixtures verifying finding extraction and keyword matching +- Name/rate/burst/robots/enabled metadata assertions +- Context cancellation handling + +## Deviations from Plan + +None - plan executed exactly as written. + +## Commits + +| Task | Commit | Description | +|------|--------|-------------| +| 1 | 3c500b5 | PastebinSource + GistPasteSource with tests | +| 2 | ed148d4 | PasteSitesSource multi-paste aggregator with tests | + +## Self-Check: PASSED + +All 7 files found. Both commit hashes verified in git log. 
// GistPasteSource scrapes gist.github.com's public search (no auth required)
// for API key leaks. This is distinct from Phase 10's GistSource which uses
// the authenticated GitHub API.
//
// Auth: none. Rate: Every(3s), Burst 1.
type GistPasteSource struct {
	// BaseURL overrides https://gist.github.com (used by tests); empty
	// selects the production host.
	BaseURL string
	// Registry supplies the provider keywords used for both search queries
	// and raw-content matching.
	Registry *providers.Registry
	// Limiters throttles outbound requests; nil disables rate limiting.
	Limiters *recon.LimiterRegistry
	// Client is the shared HTTP client; nil falls back to NewClient().
	Client *Client
}
+func (s *GistPasteSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://gist.github.com" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "gistpaste") + if len(queries) == 0 { + return nil + } + + keywords := gistPasteKeywordSet(s.Registry) + if len(keywords) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("gistpaste: build search req: %w", err) + } + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("gistpaste: search fetch: %w", err) + } + links, err := extractAnchorHrefs(resp.Body, gistPasteLinkRE) + _ = resp.Body.Close() + if err != nil { + return fmt.Errorf("gistpaste: parse search html: %w", err) + } + + for _, gistPath := range links { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + rawURL := fmt.Sprintf("%s%s/raw", base, gistPath) + rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + if err != nil { + return fmt.Errorf("gistpaste: build raw req: %w", err) + } + rawResp, err := client.Do(ctx, rawReq) + if err != nil { + continue // skip this gist on error + } + body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024)) + _ = rawResp.Body.Close() + if readErr != nil { + continue + } + + content := string(body) + for kw, provName := range keywords { + if strings.Contains(content, kw) { + out <- recon.Finding{ + ProviderName: provName, + 
Source: fmt.Sprintf("%s%s", base, gistPath), + SourceType: "recon:gistpaste", + Confidence: "low", + DetectedAt: time.Now(), + } + break // one finding per gist + } + } + } + } + return nil +} + +// gistPasteKeywordSet builds keyword->providerName map from registry. +func gistPasteKeywordSet(reg *providers.Registry) map[string]string { + out := make(map[string]string) + if reg == nil { + return out + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + if k == "" { + continue + } + if _, ok := out[k]; !ok { + out[k] = p.Name + } + } + } + return out +} diff --git a/pkg/recon/sources/gistpaste_test.go b/pkg/recon/sources/gistpaste_test.go new file mode 100644 index 0000000..491281b --- /dev/null +++ b/pkg/recon/sources/gistpaste_test.go @@ -0,0 +1,119 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func gistPasteTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "anthropic", Keywords: []string{"sk-ant-"}}, + }) +} + +const gistPasteSearchHTML = ` + + gist one + gist two + nope + nope +` + +const gistPasteRaw1 = `config with sk-ant-XYZKEY123 inside` +const gistPasteRaw2 = `nothing here` + +func TestGistPaste_Sweep_ExtractsFindings(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.URL.Path == "/search": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(gistPasteSearchHTML)) + case r.URL.Path == "/alice/abc123def456/raw": + _, _ = w.Write([]byte(gistPasteRaw1)) + case r.URL.Path == "/bob/789aaa000bbb/raw": + _, _ = w.Write([]byte(gistPasteRaw2)) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + src := &GistPasteSource{ + BaseURL: srv.URL, + Registry: gistPasteTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + 
Client: NewClient(), + } + + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.SourceType != "recon:gistpaste" { + t.Errorf("SourceType=%s, want recon:gistpaste", f.SourceType) + } + if f.ProviderName != "anthropic" { + t.Errorf("ProviderName=%s, want anthropic", f.ProviderName) + } + wantSource := srv.URL + "/alice/abc123def456" + if f.Source != wantSource { + t.Errorf("Source=%s, want %s", f.Source, wantSource) + } +} + +func TestGistPaste_NameAndRate(t *testing.T) { + s := &GistPasteSource{} + if s.Name() != "gistpaste" { + t.Errorf("Name=%s", s.Name()) + } + if s.Burst() != 1 { + t.Errorf("Burst=%d", s.Burst()) + } + if !s.RespectsRobots() { + t.Error("expected RespectsRobots=true") + } + if !s.Enabled(recon.Config{}) { + t.Error("expected Enabled=true") + } +} + +func TestGistPaste_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(gistPasteSearchHTML)) + })) + defer srv.Close() + + src := &GistPasteSource{ + BaseURL: srv.URL, + Registry: gistPasteTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} diff --git a/pkg/recon/sources/pastebin.go b/pkg/recon/sources/pastebin.go new file mode 100644 index 0000000..057e4d0 --- /dev/null +++ b/pkg/recon/sources/pastebin.go @@ -0,0 +1,156 @@ +package sources + +import ( + "context" + 
"fmt" + "io" + "net/http" + "net/url" + "regexp" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// PastebinSource scrapes pastebin.com search results for API key leaks. +// +// Two-phase approach per keyword: +// - Phase A: search pastebin for keyword, extract paste IDs from result links +// - Phase B: fetch raw paste content, keyword-match against provider registry +// +// Auth: none (credential-free). Rate: Every(3s), Burst 1 (conservative scraping). +type PastebinSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +// pastebinIDRE matches Pastebin paste links: /XXXXXXXX (8 alphanumeric chars). +var pastebinIDRE = regexp.MustCompile(`^/[A-Za-z0-9]{8}$`) + +// Compile-time assertion. +var _ recon.ReconSource = (*PastebinSource)(nil) + +func (s *PastebinSource) Name() string { return "pastebin" } +func (s *PastebinSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } +func (s *PastebinSource) Burst() int { return 1 } +func (s *PastebinSource) RespectsRobots() bool { return true } + +// Enabled always returns true: Pastebin scraping requires no credentials. +func (s *PastebinSource) Enabled(_ recon.Config) bool { return true } + +// Sweep searches Pastebin for each provider keyword and scans raw paste content. 
+func (s *PastebinSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://pastebin.com" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "pastebin") + if len(queries) == 0 { + return nil + } + + keywords := pastebinKeywordSet(s.Registry) + if len(keywords) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Phase A: search for paste links. + searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("pastebin: build search req: %w", err) + } + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("pastebin: search fetch: %w", err) + } + ids, err := extractAnchorHrefs(resp.Body, pastebinIDRE) + _ = resp.Body.Close() + if err != nil { + return fmt.Errorf("pastebin: parse search html: %w", err) + } + + // Phase B: fetch raw content and keyword-match. + for _, idPath := range ids { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + rawURL := fmt.Sprintf("%s/raw%s", base, idPath) + rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + if err != nil { + return fmt.Errorf("pastebin: build raw req: %w", err) + } + rawResp, err := client.Do(ctx, rawReq) + if err != nil { + // Skip this paste on fetch error, continue to next. 
+ continue + } + body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024)) + _ = rawResp.Body.Close() + if readErr != nil { + continue + } + + content := string(body) + for kw, provName := range keywords { + if strings.Contains(content, kw) { + out <- recon.Finding{ + ProviderName: provName, + Source: fmt.Sprintf("%s%s", base, idPath), + SourceType: "recon:pastebin", + Confidence: "low", + DetectedAt: time.Now(), + } + break // one finding per paste + } + } + } + } + return nil +} + +// pastebinKeywordSet builds keyword->providerName map from registry. +func pastebinKeywordSet(reg *providers.Registry) map[string]string { + out := make(map[string]string) + if reg == nil { + return out + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + if k == "" { + continue + } + if _, ok := out[k]; !ok { + out[k] = p.Name + } + } + } + return out +} diff --git a/pkg/recon/sources/pastebin_test.go b/pkg/recon/sources/pastebin_test.go new file mode 100644 index 0000000..2192569 --- /dev/null +++ b/pkg/recon/sources/pastebin_test.go @@ -0,0 +1,120 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func pastebinTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const pastebinSearchHTML = ` + + paste one + paste two + nope + nine chars nope +` + +const pastebinRawContent1 = `some text with sk-proj-AAAA1234 leaked here` +const pastebinRawContent2 = `nothing interesting in this paste` + +func TestPastebin_Sweep_ExtractsFindings(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.URL.Path == "/search": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(pastebinSearchHTML)) + case r.URL.Path == 
"/raw/Ab12Cd34": + _, _ = w.Write([]byte(pastebinRawContent1)) + case r.URL.Path == "/raw/Ef56Gh78": + _, _ = w.Write([]byte(pastebinRawContent2)) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + src := &PastebinSource{ + BaseURL: srv.URL, + Registry: pastebinTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } + + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + // Only paste one has "sk-proj-", paste two doesn't match. + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.SourceType != "recon:pastebin" { + t.Errorf("SourceType=%s, want recon:pastebin", f.SourceType) + } + if f.ProviderName != "openai" { + t.Errorf("ProviderName=%s, want openai", f.ProviderName) + } + wantSource := srv.URL + "/Ab12Cd34" + if f.Source != wantSource { + t.Errorf("Source=%s, want %s", f.Source, wantSource) + } +} + +func TestPastebin_NameAndRate(t *testing.T) { + s := &PastebinSource{} + if s.Name() != "pastebin" { + t.Errorf("Name=%s", s.Name()) + } + if s.Burst() != 1 { + t.Errorf("Burst=%d", s.Burst()) + } + if !s.RespectsRobots() { + t.Error("expected RespectsRobots=true") + } + if !s.Enabled(recon.Config{}) { + t.Error("expected Enabled=true") + } +} + +func TestPastebin_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(pastebinSearchHTML)) + })) + defer srv.Close() + + src := &PastebinSource{ + BaseURL: srv.URL, + Registry: pastebinTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } + ctx, cancel := 
// defaultPastePlatforms returns the production paste site list.
//
// Each entry drives the generic search -> extract-links -> fetch-raw sweep:
// SearchPath is a printf format with one %s for the URL-escaped query,
// ResultLinkRegex filters anchor hrefs found on the search page, and
// RawPathTemplate maps a matched link path to that site's raw-content path.
func defaultPastePlatforms() []pastePlatform {
	return []pastePlatform{
		{
			Name:            "dpaste",
			SearchPath:      "https://dpaste.org/search/?q=%s",
			ResultLinkRegex: `^/[A-Za-z0-9]+$`,
			RawPathTemplate: "%s/raw", // /<id> -> /<id>/raw
		},
		{
			Name:            "paste.ee",
			SearchPath:      "https://paste.ee/search?q=%s",
			ResultLinkRegex: `^/p/[A-Za-z0-9]+$`,
			// NOTE(review): "/r%s" maps a matched "/p/<id>" link to
			// "/r/p/<id>", but paste.ee's raw endpoint appears to be
			// "/r/<id>" -- a plain format template cannot drop the "/p"
			// segment. TODO: confirm against the live site; production
			// fetches may 404 here even though the unit tests (which
			// mirror the "/r/p/<id>" shape) pass.
			RawPathTemplate: "/r%s",
		},
		{
			Name:            "rentry",
			SearchPath:      "https://rentry.co/search?q=%s",
			ResultLinkRegex: `^/[a-z0-9-]+$`,
			RawPathTemplate: "%s/raw", // /<slug> -> /<slug>/raw
		},
		{
			Name:            "hastebin",
			SearchPath:      "https://hastebin.com/search?q=%s",
			ResultLinkRegex: `^/[a-z]+$`,
			RawPathTemplate: "/raw%s", // /<key> -> /raw/<key>
		},
	}
}
+func (s *PasteSitesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + plats := s.Platforms + if plats == nil { + plats = defaultPastePlatforms() + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "pastesites") + if len(queries) == 0 { + return nil + } + + keywords := pasteSitesKeywordSet(s.Registry) + if len(keywords) == 0 { + return nil + } + + for _, p := range plats { + if err := ctx.Err(); err != nil { + return err + } + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + if err := s.sweepPastePlatform(ctx, client, p, q, keywords, out); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + log.Printf("pastesites: platform %q failed (skipping): %v", p.Name, err) + break // next platform + } + } + } + return nil +} + +// sweepPastePlatform performs a search on one paste platform, fetches raw +// content for each result link, and emits findings for keyword matches. 
+func (s *PasteSitesSource) sweepPastePlatform( + ctx context.Context, + client *Client, + p pastePlatform, + query string, + keywords map[string]string, + out chan<- recon.Finding, +) error { + rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query)) + if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") { + rawURL = s.BaseURL + rawURL + } + + re, err := regexp.Compile(p.ResultLinkRegex) + if err != nil { + return fmt.Errorf("bad regex: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + if err != nil { + return fmt.Errorf("build req: %w", err) + } + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("fetch: %w", err) + } + links, err := extractAnchorHrefs(resp.Body, re) + _ = resp.Body.Close() + if err != nil { + return fmt.Errorf("parse html: %w", err) + } + + // Determine base for absolute URLs from the search URL. + searchParsed, _ := url.Parse(rawURL) + scheme := searchParsed.Scheme + host := searchParsed.Host + + for _, linkPath := range links { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Build raw content URL. 
+ rawPath := fmt.Sprintf(p.RawPathTemplate, linkPath) + fetchURL := fmt.Sprintf("%s://%s%s", scheme, host, rawPath) + + rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL, nil) + if err != nil { + continue + } + rawResp, err := client.Do(ctx, rawReq) + if err != nil { + continue // skip this paste on error + } + body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024)) + _ = rawResp.Body.Close() + if readErr != nil { + continue + } + + content := string(body) + for kw, provName := range keywords { + if strings.Contains(content, kw) { + pasteURL := fmt.Sprintf("%s://%s%s", scheme, host, linkPath) + out <- recon.Finding{ + ProviderName: provName, + Source: pasteURL, + SourceType: "recon:pastesites", + KeyMasked: "platform=" + p.Name, + Confidence: "low", + DetectedAt: time.Now(), + } + break // one finding per paste + } + } + } + return nil +} + +// pasteSitesKeywordSet builds keyword->providerName map from registry. +func pasteSitesKeywordSet(reg *providers.Registry) map[string]string { + out := make(map[string]string) + if reg == nil { + return out + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + if k == "" { + continue + } + if _, ok := out[k]; !ok { + out[k] = p.Name + } + } + } + return out +} diff --git a/pkg/recon/sources/pastesites_test.go b/pkg/recon/sources/pastesites_test.go new file mode 100644 index 0000000..cb89535 --- /dev/null +++ b/pkg/recon/sources/pastesites_test.go @@ -0,0 +1,190 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func pasteSitesTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +// Fixture HTML for each sub-platform search result page. 
+const dpasteSearchHTML = `dpaste hit` +const pasteEeSearchHTML = `paste.ee hit` +const rentrySearchHTML = `rentry hit` +const hastebinSearchHTML = `hastebin hit` + +// Raw content fixtures -- some match, some don't. +const dpasteRaw = `leaked: sk-proj-AAAA1234 oops` +const pasteEeRaw = `config sk-proj-BBBBB5678 here` +const rentryRaw = `has sk-proj-CCCC9012 inside` +const hastebinRaw = `nothing interesting` + +func TestPasteSites_Sweep_ExtractsFindings(t *testing.T) { + mux := http.NewServeMux() + + // dpaste routes + mux.HandleFunc("/dpaste-search/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(dpasteSearchHTML)) + }) + mux.HandleFunc("/AbcDef12/raw", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(dpasteRaw)) + }) + + // paste.ee routes + mux.HandleFunc("/pasteee-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(pasteEeSearchHTML)) + }) + mux.HandleFunc("/r/p/Xyz789", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(pasteEeRaw)) + }) + + // rentry routes + mux.HandleFunc("/rentry-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(rentrySearchHTML)) + }) + mux.HandleFunc("/my-paste/raw", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(rentryRaw)) + }) + + // hastebin routes + mux.HandleFunc("/hastebin-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(hastebinSearchHTML)) + }) + mux.HandleFunc("/raw/abcdef", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(hastebinRaw)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + // Override platforms to use test server with relative paths. 
+ testPlats := []pastePlatform{ + { + Name: "dpaste", + SearchPath: srv.URL + "/dpaste-search/?q=%s", + ResultLinkRegex: `^/[A-Za-z0-9]+$`, + RawPathTemplate: "%s/raw", + }, + { + Name: "paste.ee", + SearchPath: srv.URL + "/pasteee-search?q=%s", + ResultLinkRegex: `^/p/[A-Za-z0-9]+$`, + RawPathTemplate: "/r%s", + }, + { + Name: "rentry", + SearchPath: srv.URL + "/rentry-search?q=%s", + ResultLinkRegex: `^/[a-z0-9-]+$`, + RawPathTemplate: "%s/raw", + }, + { + Name: "hastebin", + SearchPath: srv.URL + "/hastebin-search?q=%s", + ResultLinkRegex: `^/[a-z]+$`, + RawPathTemplate: "/raw%s", + }, + } + + src := &PasteSitesSource{ + Platforms: testPlats, + Registry: pasteSitesTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + + // dpaste, paste.ee, rentry have matching content; hastebin does not. + if len(findings) < 3 { + t.Fatalf("expected at least 3 findings (dpaste+paste.ee+rentry), got %d", len(findings)) + } + + platforms := make(map[string]bool) + for _, f := range findings { + if f.SourceType != "recon:pastesites" { + t.Errorf("SourceType=%s, want recon:pastesites", f.SourceType) + } + // Extract platform from KeyMasked. 
+ if len(f.KeyMasked) > len("platform=") { + platforms[f.KeyMasked[len("platform="):]] = true + } + } + for _, want := range []string{"dpaste", "paste.ee", "rentry"} { + if !platforms[want] { + t.Errorf("missing platform %q in findings; got platforms=%v", want, platforms) + } + } +} + +func TestPasteSites_NameAndRate(t *testing.T) { + s := &PasteSitesSource{} + if s.Name() != "pastesites" { + t.Errorf("Name=%s", s.Name()) + } + if s.Burst() != 1 { + t.Errorf("Burst=%d", s.Burst()) + } + if !s.RespectsRobots() { + t.Error("expected RespectsRobots=true") + } + if !s.Enabled(recon.Config{}) { + t.Error("expected Enabled=true") + } +} + +func TestPasteSites_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(``)) + })) + defer srv.Close() + + testPlats := []pastePlatform{ + { + Name: "test", + SearchPath: srv.URL + "/search?q=%s", + ResultLinkRegex: `^/[a-z]+$`, + RawPathTemplate: "/raw%s", + }, + } + + src := &PasteSitesSource{ + Platforms: testPlats, + Registry: pasteSitesTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +}