package sources

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// pastePlatform describes one paste site sub-source aggregated under the
// "pastesites" umbrella. Follows the same multi-platform pattern as
// SandboxesSource.
//
// SearchPath is a printf format string with one %s for the URL-escaped query.
// RawPathTemplate, if non-empty, converts a matched link path into the raw
// content endpoint (e.g. "/raw%s" prepends /raw to the paste path).
type pastePlatform struct {
	Name            string // sub-source name, used in logs and in Finding.KeyMasked
	SearchPath      string // printf format: one %s receives the URL-escaped query
	ResultLinkRegex string // anchor hrefs matching this regex are treated as paste links
	RawPathTemplate string // fmt with %s for matched path or extracted ID
}

// defaultPastePlatforms returns the production paste site list.
func defaultPastePlatforms() []pastePlatform {
	return []pastePlatform{
		{
			Name:            "dpaste",
			SearchPath:      "https://dpaste.org/search/?q=%s",
			ResultLinkRegex: `^/[A-Za-z0-9]+$`,
			RawPathTemplate: "%s/raw", // /ID -> /ID/raw
		},
		{
			Name:            "paste.ee",
			SearchPath:      "https://paste.ee/search?q=%s",
			ResultLinkRegex: `^/p/[A-Za-z0-9]+$`,
			// NOTE(review): "/r%s" turns a matched "/p/ID" into "/r/p/ID",
			// but the original author's note below says the real raw
			// endpoint is "/r/ID". The template mechanism (plain %s of the
			// whole path) cannot express that strip — confirm against the
			// live site and add ID extraction if needed.
			RawPathTemplate: "/r%s", // /p/ID -> /r/p/ID ... actually /r/ID
		},
		{
			Name:            "rentry",
			SearchPath:      "https://rentry.co/search?q=%s",
			ResultLinkRegex: `^/[a-z0-9-]+$`,
			RawPathTemplate: "%s/raw", // /slug -> /slug/raw
		},
		{
			Name:            "hastebin",
			SearchPath:      "https://hastebin.com/search?q=%s",
			ResultLinkRegex: `^/[a-z]+$`,
			RawPathTemplate: "/raw%s", // /key -> /raw/key
		},
	}
}

// PasteSitesSource aggregates several paste sites into a single ReconSource.
// Each sub-platform is scraped independently; failures in one are logged and
// skipped without aborting the others.
//
// Every emitted Finding carries SourceType="recon:pastesites" and encodes the
// originating sub-platform in KeyMasked as "platform=" + the platform name.
type PasteSitesSource struct { Platforms []pastePlatform Registry *providers.Registry Limiters *recon.LimiterRegistry Client *Client // BaseURL, when non-empty, is prefixed to any relative SearchPath (tests). BaseURL string } // Compile-time assertion. var _ recon.ReconSource = (*PasteSitesSource)(nil) func (s *PasteSitesSource) Name() string { return "pastesites" } func (s *PasteSitesSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } func (s *PasteSitesSource) Burst() int { return 1 } func (s *PasteSitesSource) RespectsRobots() bool { return true } // Enabled always returns true: all paste site scraping is credential-free. func (s *PasteSitesSource) Enabled(_ recon.Config) bool { return true } // Sweep iterates each paste platform across each provider keyword. Per-platform // errors are logged and skipped so one broken sub-source does not fail the // overall sweep. func (s *PasteSitesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { plats := s.Platforms if plats == nil { plats = defaultPastePlatforms() } client := s.Client if client == nil { client = NewClient() } queries := BuildQueries(s.Registry, "pastesites") if len(queries) == 0 { return nil } keywords := pasteSitesKeywordSet(s.Registry) if len(keywords) == 0 { return nil } for _, p := range plats { if err := ctx.Err(); err != nil { return err } for _, q := range queries { if err := ctx.Err(); err != nil { return err } if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } if err := s.sweepPastePlatform(ctx, client, p, q, keywords, out); err != nil { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return err } log.Printf("pastesites: platform %q failed (skipping): %v", p.Name, err) break // next platform } } } return nil } // sweepPastePlatform performs a search on one paste platform, fetches raw // content for each result link, and emits findings for keyword 
matches.
//
// Fixes over the original version:
//   - an empty RawPathTemplate now uses the link path verbatim (previously
//     fmt.Sprintf("", path) produced "%!(EXTRA ...)" garbage in the URL,
//     despite the struct doc declaring the template optional);
//   - the search-URL parse error is checked (it was discarded, risking a
//     nil dereference on searchParsed);
//   - non-200 raw responses are skipped so error pages cannot produce
//     false keyword matches.
func (s *PasteSitesSource) sweepPastePlatform(
	ctx context.Context,
	client *Client,
	p pastePlatform,
	query string,
	keywords map[string]string,
	out chan<- recon.Finding,
) error {
	// Build (and, for tests, rebase) the search URL.
	rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query))
	if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") {
		rawURL = s.BaseURL + rawURL
	}

	re, err := regexp.Compile(p.ResultLinkRegex)
	if err != nil {
		return fmt.Errorf("bad regex: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return fmt.Errorf("build req: %w", err)
	}
	resp, err := client.Do(ctx, req)
	if err != nil {
		return fmt.Errorf("fetch: %w", err)
	}
	links, err := extractAnchorHrefs(resp.Body, re)
	_ = resp.Body.Close() // best-effort close; the parse error takes precedence
	if err != nil {
		return fmt.Errorf("parse html: %w", err)
	}

	// Determine base for absolute URLs from the search URL.
	searchParsed, err := url.Parse(rawURL)
	if err != nil {
		return fmt.Errorf("parse search url: %w", err)
	}
	scheme := searchParsed.Scheme
	host := searchParsed.Host

	for _, linkPath := range links {
		if err := ctx.Err(); err != nil {
			return err
		}
		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		// Build raw content URL. An empty template means the link path IS
		// the raw path (pastePlatform doc: "if non-empty, converts ...").
		rawPath := linkPath
		if p.RawPathTemplate != "" {
			rawPath = fmt.Sprintf(p.RawPathTemplate, linkPath)
		}
		fetchURL := fmt.Sprintf("%s://%s%s", scheme, host, rawPath)

		rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL, nil)
		if err != nil {
			continue
		}
		rawResp, err := client.Do(ctx, rawReq)
		if err != nil {
			continue // skip this paste on error
		}
		// Cap reads at 256 KiB to bound per-paste memory.
		body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
		_ = rawResp.Body.Close()
		if readErr != nil || rawResp.StatusCode != http.StatusOK {
			continue // unreadable or non-OK (e.g. 404 page): skip
		}

		content := string(body)
		for kw, provName := range keywords {
			if strings.Contains(content, kw) {
				pasteURL := fmt.Sprintf("%s://%s%s", scheme, host, linkPath)
				out <- recon.Finding{
					ProviderName: provName,
					Source:       pasteURL,
					SourceType:   "recon:pastesites",
					KeyMasked:    "platform=" + p.Name,
					Confidence:   "low",
					DetectedAt:   time.Now(),
				}
				break // one finding per paste
			}
		}
	}
	return nil
}

// pasteSitesKeywordSet builds a keyword -> provider-name map from the
// registry. The first provider to claim a keyword wins; empty keywords are
// ignored. A nil registry yields an empty (non-nil) map.
func pasteSitesKeywordSet(reg *providers.Registry) map[string]string {
	out := make(map[string]string)
	if reg == nil {
		return out
	}
	for _, p := range reg.List() {
		for _, k := range p.Keywords {
			if k == "" {
				continue
			}
			if _, ok := out[k]; !ok {
				out[k] = p.Name
			}
		}
	}
	return out
}