package sources

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// PastebinSource discovers API-key leaks by scraping pastebin.com search
// results. It needs no credentials.
//
// Each keyword is handled in two phases:
//   - Phase A: run a Pastebin search and pull paste IDs out of result links.
//   - Phase B: download each paste's raw body and match it against the
//     keywords from the provider registry.
//
// Scraping is deliberately conservative: one request every 3 seconds,
// burst of 1.
type PastebinSource struct {
	BaseURL  string                 // defaults to https://pastebin.com when empty
	Registry *providers.Registry    // source of provider keywords/queries
	Limiters *recon.LimiterRegistry // optional; rate limiting is skipped when nil
	Client   *Client                // created via NewClient when nil
}

// pastebinIDRE recognizes relative paste links of the form /XXXXXXXX
// (exactly eight alphanumeric characters).
var pastebinIDRE = regexp.MustCompile(`^/[A-Za-z0-9]{8}$`)

// Compile-time check that PastebinSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*PastebinSource)(nil)

// Name identifies this source in limiter keys and findings.
func (s *PastebinSource) Name() string { return "pastebin" }

// RateLimit permits one request every three seconds.
func (s *PastebinSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }

// Burst allows no extra burst capacity beyond the steady rate.
func (s *PastebinSource) Burst() int { return 1 }

// RespectsRobots reports that this source honors robots.txt.
func (s *PastebinSource) RespectsRobots() bool { return true }

// Enabled always reports true: scraping Pastebin requires no credentials.
func (s *PastebinSource) Enabled(_ recon.Config) bool { return true }

// Sweep searches Pastebin once per provider keyword and scans each
// discovered paste's raw content against the provider registry.
func (s *PastebinSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { base := s.BaseURL if base == "" { base = "https://pastebin.com" } client := s.Client if client == nil { client = NewClient() } queries := BuildQueries(s.Registry, "pastebin") if len(queries) == 0 { return nil } keywords := pastebinKeywordSet(s.Registry) if len(keywords) == 0 { return nil } for _, q := range queries { if err := ctx.Err(); err != nil { return err } if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } // Phase A: search for paste links. searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q)) req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) if err != nil { return fmt.Errorf("pastebin: build search req: %w", err) } resp, err := client.Do(ctx, req) if err != nil { return fmt.Errorf("pastebin: search fetch: %w", err) } ids, err := extractAnchorHrefs(resp.Body, pastebinIDRE) _ = resp.Body.Close() if err != nil { return fmt.Errorf("pastebin: parse search html: %w", err) } // Phase B: fetch raw content and keyword-match. for _, idPath := range ids { if err := ctx.Err(); err != nil { return err } if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } rawURL := fmt.Sprintf("%s/raw%s", base, idPath) rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) if err != nil { return fmt.Errorf("pastebin: build raw req: %w", err) } rawResp, err := client.Do(ctx, rawReq) if err != nil { // Skip this paste on fetch error, continue to next. 
continue } body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024)) _ = rawResp.Body.Close() if readErr != nil { continue } content := string(body) for kw, provName := range keywords { if strings.Contains(content, kw) { out <- recon.Finding{ ProviderName: provName, Source: fmt.Sprintf("%s%s", base, idPath), SourceType: "recon:pastebin", Confidence: "low", DetectedAt: time.Now(), } break // one finding per paste } } } } return nil } // pastebinKeywordSet builds keyword->providerName map from registry. func pastebinKeywordSet(reg *providers.Registry) map[string]string { out := make(map[string]string) if reg == nil { return out } for _, p := range reg.List() { for _, k := range p.Keywords { if k == "" { continue } if _, ok := out[k]; !ok { out[k] = p.Name } } } return out }