feat(14-02): add WaybackMachine + CommonCrawl recon sources
- WaybackMachineSource queries CDX API for historical snapshots
- CommonCrawlSource queries CC Index API for matching pages
- Both credentialless, rate-limited at 1 req/5s, RespectsRobots=true
- RegisterAll extended to 42 sources (40 Phase 10-13 + 2 Phase 14)
- Full httptest-based test coverage for both sources
This commit is contained in:
138
pkg/recon/sources/commoncrawl.go
Normal file
138
pkg/recon/sources/commoncrawl.go
Normal file
@@ -0,0 +1,138 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
// Index Server API. It queries index.commoncrawl.org for pages matching
// provider keywords in the CC index.
//
// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
// original URL discovered in the crawl. The source is credentialless and
// always enabled.
//
// The zero value is usable: BaseURL, IndexName, and Client all fall back to
// defaults inside Sweep, and a nil Limiters simply disables rate limiting.
type CommonCrawlSource struct {
	// BaseURL defaults to https://index.commoncrawl.org. Tests override with httptest URL.
	BaseURL string
	// IndexName defaults to CC-MAIN-2024-10 (recent crawl). Tests may override.
	IndexName string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry; nil means no rate limiting.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, a default is used.
	Client *Client
}
|
||||
|
||||
// Compile-time assertion that *CommonCrawlSource satisfies recon.ReconSource;
// a signature drift in any interface method becomes a build error here.
var _ recon.ReconSource = (*CommonCrawlSource)(nil)
|
||||
|
||||
// Name returns the registry identifier under which this source is keyed.
func (s *CommonCrawlSource) Name() string {
	return "commoncrawl"
}
|
||||
// RateLimit caps this source at one request every five seconds, keeping
// load on the public CommonCrawl index polite.
func (s *CommonCrawlSource) RateLimit() rate.Limit {
	return rate.Every(5 * time.Second)
}
|
||||
// Burst permits no bursting beyond the steady one-request rate.
func (s *CommonCrawlSource) Burst() int {
	return 1
}
|
||||
// RespectsRobots reports true, signalling to the recon engine that
// robots.txt exclusions should be honored for this source.
func (s *CommonCrawlSource) RespectsRobots() bool {
	return true
}
|
||||
|
||||
// Enabled always returns true: CommonCrawl index is unauthenticated.
|
||||
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// Sweep iterates provider keywords, queries the CC index for each, and emits
|
||||
// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
|
||||
// object per line) with fields like url, timestamp, status, mime, etc.
|
||||
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = "https://index.commoncrawl.org"
|
||||
}
|
||||
idx := s.IndexName
|
||||
if idx == "" {
|
||||
idx = "CC-MAIN-2024-10"
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "commoncrawl")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// CC Index API: output=json returns NDJSON, limit=50 bounds the response.
|
||||
endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s",
|
||||
base, idx, url.QueryEscape(q))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("commoncrawl: build req: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
// Non-fatal: skip this keyword on transient errors.
|
||||
continue
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var rec ccIndexRecord
|
||||
if err := json.Unmarshal(line, &rec); err != nil {
|
||||
continue
|
||||
}
|
||||
if rec.URL == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
f := recon.Finding{
|
||||
ProviderName: "",
|
||||
Source: rec.URL,
|
||||
SourceType: "recon:commoncrawl",
|
||||
Confidence: "low",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
select {
|
||||
case out <- f:
|
||||
case <-ctx.Done():
|
||||
_ = resp.Body.Close()
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index
// API that this source consumes. Additional fields (mime, status, digest, etc.)
// are ignored by encoding/json, keeping the decoder tolerant of schema growth.
type ccIndexRecord struct {
	// URL is the original page URL recorded in the crawl index; records with
	// an empty URL are skipped by Sweep.
	URL string `json:"url"`
	// Timestamp is the capture timestamp string (presumably the CDX-style
	// 14-digit YYYYMMDDhhmmss form — confirm against API output).
	Timestamp string `json:"timestamp"`
	// Status is the HTTP status code of the crawled response, as a string.
	Status string `json:"status"`
}
|
||||
Reference in New Issue
Block a user