// Package sources implements recon data sources (here: the Common Crawl
// CDX index) that feed findings into the keyhunter recon pipeline.
package sources
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)
// CommonCrawlSource searches the Common Crawl index for web pages that may
|
|
// contain leaked API keys. Common Crawl archives petabytes of web content;
|
|
// its CDX API allows searching by URL pattern to find pages that historically
|
|
// exposed secrets.
|
|
type CommonCrawlSource struct {
|
|
BaseURL string
|
|
Registry *providers.Registry
|
|
Limiters *recon.LimiterRegistry
|
|
Client *Client
|
|
}
|
|
|
|
var _ recon.ReconSource = (*CommonCrawlSource)(nil)
|
|
|
|
func (s *CommonCrawlSource) Name() string { return "commoncrawl" }
|
|
func (s *CommonCrawlSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) }
|
|
func (s *CommonCrawlSource) Burst() int { return 1 }
|
|
func (s *CommonCrawlSource) RespectsRobots() bool { return true }
|
|
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }
|
|
|
|
// ccIndexResult represents a single Common Crawl CDX index record.
|
|
type ccIndexResult struct {
|
|
URL string `json:"url"`
|
|
Timestamp string `json:"timestamp"`
|
|
Status string `json:"status"`
|
|
Filename string `json:"filename"`
|
|
Length string `json:"length"`
|
|
Offset string `json:"offset"`
|
|
}
|
|
|
|
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
|
base := s.BaseURL
|
|
if base == "" {
|
|
base = "https://index.commoncrawl.org/CC-MAIN-2024-10-index"
|
|
}
|
|
client := s.Client
|
|
if client == nil {
|
|
client = NewClient()
|
|
}
|
|
|
|
queries := BuildQueries(s.Registry, "commoncrawl")
|
|
if len(queries) == 0 {
|
|
return nil
|
|
}
|
|
|
|
for _, q := range queries {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if s.Limiters != nil {
|
|
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// CDX API: search for URLs matching the query.
|
|
searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, q)
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
req.Header.Set("Accept", "application/json")
|
|
|
|
resp, err := client.Do(ctx, req)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
|
|
_ = resp.Body.Close()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// Common Crawl returns NDJSON (newline-delimited JSON).
|
|
// Parse each line as a separate JSON object.
|
|
var results []ccIndexResult
|
|
dec := json.NewDecoder(bytes.NewReader(body))
|
|
for dec.More() {
|
|
var r ccIndexResult
|
|
if err := dec.Decode(&r); err != nil {
|
|
break
|
|
}
|
|
results = append(results, r)
|
|
}
|
|
|
|
for _, r := range results {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Each indexed URL is a potential leak location; emit as finding.
|
|
out <- recon.Finding{
|
|
ProviderName: q,
|
|
Source: r.URL,
|
|
SourceType: "recon:commoncrawl",
|
|
Confidence: "low",
|
|
DetectedAt: time.Now(),
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|