From ed148d47e10fb48dd6b245519b30e0c4748da65f Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Mon, 6 Apr 2026 11:55:44 +0300
Subject: [PATCH] feat(11-02): add PasteSitesSource multi-paste aggregator

- Aggregates dpaste, paste.ee, rentry, hastebin into single source
- Follows SandboxesSource multi-platform pattern with per-platform error isolation
- Two-phase search+raw-fetch with keyword matching against provider registry
---
 pkg/recon/sources/pastesites.go      | 242 +++++++++++++++++++++++++++
 pkg/recon/sources/pastesites_test.go | 190 +++++++++++++++++++++
 2 files changed, 432 insertions(+)
 create mode 100644 pkg/recon/sources/pastesites.go
 create mode 100644 pkg/recon/sources/pastesites_test.go

diff --git a/pkg/recon/sources/pastesites.go b/pkg/recon/sources/pastesites.go
new file mode 100644
index 0000000..8f86069
--- /dev/null
+++ b/pkg/recon/sources/pastesites.go
@@ -0,0 +1,242 @@
+package sources
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// pastePlatform describes one paste site sub-source aggregated under the
+// "pastesites" umbrella. Follows the same multi-platform pattern as
+// SandboxesSource.
+//
+// SearchPath is a printf format string with one %s for the URL-escaped query.
+// RawPathTemplate, if non-empty, converts a matched link path into the raw
+// content endpoint (e.g. "/raw%s" prepends /raw to the paste path).
+type pastePlatform struct {
+	Name            string // sub-source label, surfaced in findings as "platform=<Name>"
+	SearchPath      string // search URL containing exactly one %s for the escaped query
+	ResultLinkRegex string // anchored pattern a result anchor's href path must fully match
+	RawPathTemplate string // fmt with %s for matched path or extracted ID
+}
+
+// defaultPastePlatforms returns the production paste site list.
+func defaultPastePlatforms() []pastePlatform {
+	return []pastePlatform{
+		{
+			Name:            "dpaste",
+			SearchPath:      "https://dpaste.org/search/?q=%s",
+			ResultLinkRegex: `^/[A-Za-z0-9]+$`,
+			RawPathTemplate: "%s/raw", // /ID -> /ID/raw
+		},
+		{
+			Name:            "paste.ee",
+			SearchPath:      "https://paste.ee/search?q=%s",
+			ResultLinkRegex: `^/p/[A-Za-z0-9]+$`,
+			RawPathTemplate: "/r%s", // NOTE(review): /p/ID -> /r/p/ID, but paste.ee's raw endpoint is /r/ID — template cannot strip the /p prefix; confirm against the live site
+		},
+		{
+			Name:            "rentry",
+			SearchPath:      "https://rentry.co/search?q=%s",
+			ResultLinkRegex: `^/[a-z0-9-]+$`,
+			RawPathTemplate: "%s/raw", // /slug -> /slug/raw
+		},
+		{
+			Name:            "hastebin",
+			SearchPath:      "https://hastebin.com/search?q=%s",
+			ResultLinkRegex: `^/[a-z]+$`, // NOTE(review): also matches nav paths like /about or /login — consider tightening
+			RawPathTemplate: "/raw%s", // /key -> /raw/key
+		},
+	}
+}
+
+// PasteSitesSource aggregates several paste sites into a single ReconSource.
+// Each sub-platform is scraped independently; failures in one are logged and
+// skipped without aborting the others.
+//
+// Every emitted Finding carries SourceType="recon:pastesites" and encodes the
+// originating sub-platform in KeyMasked as "platform=<name>".
+type PasteSitesSource struct {
+	Platforms []pastePlatform        // nil means defaultPastePlatforms()
+	Registry  *providers.Registry    // provider keyword source; nil yields no findings
+	Limiters  *recon.LimiterRegistry // optional shared rate limiting; nil disables pacing
+	Client    *Client                // HTTP client; nil means NewClient()
+	// BaseURL, when non-empty, is prefixed to any relative SearchPath (tests).
+	BaseURL string
+}
+
+// Compile-time assertion that PasteSitesSource implements recon.ReconSource.
+var _ recon.ReconSource = (*PasteSitesSource)(nil)
+
+func (s *PasteSitesSource) Name() string          { return "pastesites" }
+func (s *PasteSitesSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
+func (s *PasteSitesSource) Burst() int            { return 1 }
+func (s *PasteSitesSource) RespectsRobots() bool  { return true }
+
+// Enabled always returns true: all paste site scraping is credential-free.
+func (s *PasteSitesSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep iterates each paste platform across each provider keyword. Per-platform
+// errors are logged and skipped so one broken sub-source does not fail the
+// overall sweep.
+func (s *PasteSitesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	plats := s.Platforms
+	if plats == nil { // nil (not empty) triggers the production default list
+		plats = defaultPastePlatforms()
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "pastesites")
+	if len(queries) == 0 { // nothing to search for; not an error
+		return nil
+	}
+
+	keywords := pasteSitesKeywordSet(s.Registry)
+	if len(keywords) == 0 { // no keywords means no match could ever be emitted
+		return nil
+	}
+
+	for _, p := range plats {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		for _, q := range queries {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			if s.Limiters != nil {
+				// Shared key s.Name() paces all sub-platforms together.
+				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+					return err
+				}
+			}
+			if err := s.sweepPastePlatform(ctx, client, p, q, keywords, out); err != nil {
+				// Context errors abort the whole sweep; anything else only
+				// abandons the remaining queries for this one platform.
+				if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+					return err
+				}
+				log.Printf("pastesites: platform %q failed (skipping): %v", p.Name, err)
+				break // next platform
+			}
+		}
+	}
+	return nil
+}
+
+// sweepPastePlatform performs a search on one paste platform, fetches raw
+// content for each result link, and emits findings for keyword matches.
+func (s *PasteSitesSource) sweepPastePlatform(
+	ctx context.Context,
+	client *Client,
+	p pastePlatform,
+	query string,
+	keywords map[string]string,
+	out chan<- recon.Finding,
+) error {
+	rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query))
+	if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") { // test hook: absolutize relative search paths
+		rawURL = s.BaseURL + rawURL
+	}
+
+	re, err := regexp.Compile(p.ResultLinkRegex) // NOTE(review): recompiled on every query; could be hoisted to once per platform
+	if err != nil {
+		return fmt.Errorf("bad regex: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
+	if err != nil {
+		return fmt.Errorf("build req: %w", err)
+	}
+	resp, err := client.Do(ctx, req)
+	if err != nil {
+		return fmt.Errorf("fetch: %w", err)
+	}
+	links, err := extractAnchorHrefs(resp.Body, re)
+	_ = resp.Body.Close() // close before error check so the body is never leaked
+	if err != nil {
+		return fmt.Errorf("parse html: %w", err)
+	}
+
+	// Determine base for absolute URLs from the search URL.
+	searchParsed, _ := url.Parse(rawURL) // NOTE(review): parse error ignored; scheme/host would be empty strings — confirm acceptable
+	scheme := searchParsed.Scheme
+	host := searchParsed.Host
+
+	for _, linkPath := range links {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil { // pace each raw-content fetch like a search request
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		// Build raw content URL.
+		rawPath := fmt.Sprintf(p.RawPathTemplate, linkPath)
+		fetchURL := fmt.Sprintf("%s://%s%s", scheme, host, rawPath)
+
+		rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL, nil)
+		if err != nil {
+			continue
+		}
+		rawResp, err := client.Do(ctx, rawReq)
+		if err != nil {
+			continue // skip this paste on error
+		}
+		body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024)) // cap raw paste read at 256 KiB
+		_ = rawResp.Body.Close()
+		if readErr != nil {
+			continue
+		}
+
+		content := string(body)
+		for kw, provName := range keywords {
+			if strings.Contains(content, kw) {
+				pasteURL := fmt.Sprintf("%s://%s%s", scheme, host, linkPath)
+				out <- recon.Finding{ // NOTE(review): blocking send; a select on ctx.Done() would avoid hanging if the consumer stops
+					ProviderName: provName,
+					Source:       pasteURL,
+					SourceType:   "recon:pastesites",
+					KeyMasked:    "platform=" + p.Name,
+					Confidence:   "low",
+					DetectedAt:   time.Now(),
+				}
+				break // one finding per paste
+			}
+		}
+	}
+	return nil
+}
+
+// pasteSitesKeywordSet builds keyword->providerName map from registry.
+// First provider to claim a keyword wins; nil registry yields an empty map.
+func pasteSitesKeywordSet(reg *providers.Registry) map[string]string {
+	out := make(map[string]string)
+	if reg == nil {
+		return out
+	}
+	for _, p := range reg.List() {
+		for _, k := range p.Keywords {
+			if k == "" {
+				continue
+			}
+			if _, ok := out[k]; !ok { // keep earliest provider for a duplicated keyword
+				out[k] = p.Name
+			}
+		}
+	}
+	return out
+}
diff --git a/pkg/recon/sources/pastesites_test.go b/pkg/recon/sources/pastesites_test.go
new file mode 100644
index 0000000..cb89535
--- /dev/null
+++ b/pkg/recon/sources/pastesites_test.go
@@ -0,0 +1,190 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// pasteSitesTestRegistry returns a one-provider registry whose single
+// keyword ("sk-proj-") drives all fixture matches below.
+func pasteSitesTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+// Fixture HTML for each sub-platform search result page.
+// NOTE(review): the anchor markup appears to have been stripped from these
+// fixtures in transit — each should contain an <a href="..."> matching the
+// platform's ResultLinkRegex (e.g. /AbcDef12, /p/Xyz789, /my-paste, /abcdef);
+// restore from the original patch before applying.
+const dpasteSearchHTML = `dpaste hit`
+const pasteEeSearchHTML = `paste.ee hit`
+const rentrySearchHTML = `rentry hit`
+const hastebinSearchHTML = `hastebin hit`
+
+// Raw content fixtures -- some match, some don't.
+const dpasteRaw = `leaked: sk-proj-AAAA1234 oops`
+const pasteEeRaw = `config sk-proj-BBBBB5678 here`
+const rentryRaw = `has sk-proj-CCCC9012 inside`
+const hastebinRaw = `nothing interesting`
+
+// TestPasteSites_Sweep_ExtractsFindings wires all four sub-platforms to one
+// httptest server and checks that matching pastes become findings while the
+// non-matching (hastebin) one does not.
+func TestPasteSites_Sweep_ExtractsFindings(t *testing.T) {
+	mux := http.NewServeMux()
+
+	// dpaste routes
+	mux.HandleFunc("/dpaste-search/", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(dpasteSearchHTML))
+	})
+	mux.HandleFunc("/AbcDef12/raw", func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(dpasteRaw))
+	})
+
+	// paste.ee routes
+	mux.HandleFunc("/pasteee-search", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(pasteEeSearchHTML))
+	})
+	mux.HandleFunc("/r/p/Xyz789", func(w http.ResponseWriter, r *http.Request) { // NOTE(review): mirrors the /r%s template bug (/r/p/ID); the real site serves /r/ID
+		_, _ = w.Write([]byte(pasteEeRaw))
+	})
+
+	// rentry routes
+	mux.HandleFunc("/rentry-search", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(rentrySearchHTML))
+	})
+	mux.HandleFunc("/my-paste/raw", func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(rentryRaw))
+	})
+
+	// hastebin routes
+	mux.HandleFunc("/hastebin-search", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(hastebinSearchHTML))
+	})
+	mux.HandleFunc("/raw/abcdef", func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(hastebinRaw))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+
+	// Override platforms to point each SearchPath at the test server.
+	testPlats := []pastePlatform{
+		{
+			Name:            "dpaste",
+			SearchPath:      srv.URL + "/dpaste-search/?q=%s",
+			ResultLinkRegex: `^/[A-Za-z0-9]+$`,
+			RawPathTemplate: "%s/raw",
+		},
+		{
+			Name:            "paste.ee",
+			SearchPath:      srv.URL + "/pasteee-search?q=%s",
+			ResultLinkRegex: `^/p/[A-Za-z0-9]+$`,
+			RawPathTemplate: "/r%s",
+		},
+		{
+			Name:            "rentry",
+			SearchPath:      srv.URL + "/rentry-search?q=%s",
+			ResultLinkRegex: `^/[a-z0-9-]+$`,
+			RawPathTemplate: "%s/raw",
+		},
+		{
+			Name:            "hastebin",
+			SearchPath:      srv.URL + "/hastebin-search?q=%s",
+			ResultLinkRegex: `^/[a-z]+$`,
+			RawPathTemplate: "/raw%s",
+		},
+	}
+
+	src := &PasteSitesSource{
+		Platforms: testPlats,
+		Registry:  pasteSitesTestRegistry(),
+		Limiters:  recon.NewLimiterRegistry(),
+		Client:    NewClient(),
+	}
+
+	out := make(chan recon.Finding, 32)
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) // NOTE(review): real limiter at 3s/request makes this test take tens of seconds — consider injecting a nil/fast limiter
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out) // safe: Sweep has returned, so no further sends occur
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+
+	// dpaste, paste.ee, rentry have matching content; hastebin does not.
+	if len(findings) < 3 {
+		t.Fatalf("expected at least 3 findings (dpaste+paste.ee+rentry), got %d", len(findings))
+	}
+
+	platforms := make(map[string]bool)
+	for _, f := range findings {
+		if f.SourceType != "recon:pastesites" {
+			t.Errorf("SourceType=%s, want recon:pastesites", f.SourceType)
+		}
+		// Extract platform from KeyMasked ("platform=<name>").
+		if len(f.KeyMasked) > len("platform=") {
+			platforms[f.KeyMasked[len("platform="):]] = true
+		}
+	}
+	for _, want := range []string{"dpaste", "paste.ee", "rentry"} {
+		if !platforms[want] {
+			t.Errorf("missing platform %q in findings; got platforms=%v", want, platforms)
+		}
+	}
+}
+
+// TestPasteSites_NameAndRate checks the static ReconSource metadata.
+func TestPasteSites_NameAndRate(t *testing.T) {
+	s := &PasteSitesSource{}
+	if s.Name() != "pastesites" {
+		t.Errorf("Name=%s", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("Burst=%d", s.Burst())
+	}
+	if !s.RespectsRobots() {
+		t.Error("expected RespectsRobots=true")
+	}
+	if !s.Enabled(recon.Config{}) {
+		t.Error("expected Enabled=true")
+	}
+}
+
+// TestPasteSites_Sweep_CtxCancelled verifies a pre-cancelled context aborts
+// the sweep with a non-nil error before any platform completes.
+func TestPasteSites_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond) // slow handler so cancellation is what wins
+		_, _ = w.Write([]byte(``))
+	}))
+	defer srv.Close()
+
+	testPlats := []pastePlatform{
+		{
+			Name:            "test",
+			SearchPath:      srv.URL + "/search?q=%s",
+			ResultLinkRegex: `^/[a-z]+$`,
+			RawPathTemplate: "/raw%s",
+		},
+	}
+
+	src := &PasteSitesSource{
+		Platforms: testPlats,
+		Registry:  pasteSitesTestRegistry(),
+		Limiters:  recon.NewLimiterRegistry(),
+		Client:    NewClient(),
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel before Sweep so the very first ctx.Err() check trips
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}