diff --git a/pkg/recon/sources/commoncrawl.go b/pkg/recon/sources/commoncrawl.go
new file mode 100644
index 0000000..eb084ad
--- /dev/null
+++ b/pkg/recon/sources/commoncrawl.go
@@ -0,0 +1,138 @@
+package sources
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
+// Index Server API. It queries index.commoncrawl.org for pages matching
+// provider keywords in the CC index.
+//
+// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
+// original URL discovered in the crawl. The source is credentialless and
+// always enabled.
+type CommonCrawlSource struct {
+	// BaseURL defaults to https://index.commoncrawl.org. Tests override with httptest URL.
+	BaseURL string
+	// IndexName defaults to CC-MAIN-2024-10 (recent crawl). Tests may override.
+	IndexName string
+	// Registry drives the keyword query list via BuildQueries.
+	Registry *providers.Registry
+	// Limiters is the shared recon.LimiterRegistry.
+	Limiters *recon.LimiterRegistry
+	// Client is the shared retry HTTP wrapper. If nil, a default is used.
+	Client *Client
+}
+
+// Compile-time assertion that CommonCrawlSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*CommonCrawlSource)(nil)
+
+func (s *CommonCrawlSource) Name() string          { return "commoncrawl" }
+func (s *CommonCrawlSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) }
+func (s *CommonCrawlSource) Burst() int            { return 1 }
+func (s *CommonCrawlSource) RespectsRobots() bool  { return true }
+
+// Enabled always returns true: CommonCrawl index is unauthenticated.
+func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep iterates provider keywords, queries the CC index for each, and emits
+// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
+// object per line) with fields like url, timestamp, status, mime, etc.
+func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://index.commoncrawl.org"
+	}
+	idx := s.IndexName
+	if idx == "" {
+		idx = "CC-MAIN-2024-10"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "commoncrawl")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		// CC Index API: output=json returns NDJSON, limit=50 bounds the response.
+		endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s",
+			base, idx, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("commoncrawl: build req: %w", err)
+		}
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			// Non-fatal: skip this keyword on transient errors.
+			continue
+		}
+
+		scanner := bufio.NewScanner(resp.Body)
+		for scanner.Scan() {
+			line := scanner.Bytes()
+			if len(line) == 0 {
+				continue
+			}
+
+			var rec ccIndexRecord
+			if err := json.Unmarshal(line, &rec); err != nil {
+				continue
+			}
+			if rec.URL == "" {
+				continue
+			}
+
+			f := recon.Finding{
+				ProviderName: "",
+				Source:       rec.URL,
+				SourceType:   "recon:commoncrawl",
+				Confidence:   "low",
+				DetectedAt:   time.Now(),
+			}
+			select {
+			case out <- f:
+			case <-ctx.Done():
+				_ = resp.Body.Close()
+				return ctx.Err()
+			}
+		}
+		_ = resp.Body.Close()
+	}
+	return nil
+}
+
+// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index
+// API that this source consumes. Additional fields (mime, status, digest, etc.)
+// are ignored to keep the decoder tolerant.
+type ccIndexRecord struct {
+	URL       string `json:"url"`
+	Timestamp string `json:"timestamp"`
+	Status    string `json:"status"`
+}
diff --git a/pkg/recon/sources/commoncrawl_test.go b/pkg/recon/sources/commoncrawl_test.go
new file mode 100644
index 0000000..6d98966
--- /dev/null
+++ b/pkg/recon/sources/commoncrawl_test.go
@@ -0,0 +1,168 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func commonCrawlStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
+	t.Helper()
+	return func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(calls, 1)
+		if r.URL.Query().Get("query") == "" {
+			t.Errorf("missing query param")
+		}
+		w.Header().Set("Content-Type", "application/json")
+		// NDJSON: one JSON object per line
+		enc := json.NewEncoder(w)
+		_ = enc.Encode(ccIndexRecord{URL: "https://example.com/api/config", Timestamp: "20240301120000", Status: "200"})
+		_ = enc.Encode(ccIndexRecord{URL: "https://example.com/env.js", Timestamp: "20240301130000", Status: "200"})
+	}
+}
+
+func TestCommonCrawl_SweepEmitsFindings(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("commoncrawl", 1000, 100)
+
+	var calls int32
+	srv := httptest.NewServer(commonCrawlStubHandler(t, &calls))
+	defer srv.Close()
+
+	src := &CommonCrawlSource{
+		BaseURL:   srv.URL,
+		IndexName: "CC-MAIN-2024-10",
+		Registry:  reg,
+		Limiters:  lim,
+		Client:    NewClient(),
+	}
+
+	out := make(chan recon.Finding, 32)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	done := make(chan error, 1)
+	go func() { done <- src.Sweep(ctx, "", out); close(out) }()
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if err := <-done; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	// 2 keywords * 2 results = 4 findings
+	if len(findings) != 4 {
+		t.Fatalf("expected 4 findings, got %d", len(findings))
+	}
+	for _, f := range findings {
+		if f.SourceType != "recon:commoncrawl" {
+			t.Errorf("SourceType=%q want recon:commoncrawl", f.SourceType)
+		}
+	}
+	if got := atomic.LoadInt32(&calls); got != 2 {
+		t.Errorf("expected 2 server calls, got %d", got)
+	}
+}
+
+func TestCommonCrawl_FindingURLs(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("commoncrawl", 1000, 100)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		enc := json.NewEncoder(w)
+		_ = enc.Encode(ccIndexRecord{URL: "https://target.com/leak.js", Timestamp: "20240101000000", Status: "200"})
+	}))
+	defer srv.Close()
+
+	src := &CommonCrawlSource{
+		BaseURL:   srv.URL,
+		IndexName: "CC-MAIN-2024-10",
+		Registry:  reg,
+		Limiters:  lim,
+		Client:    NewClient(),
+	}
+
+	out := make(chan recon.Finding, 32)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	done := make(chan error, 1)
+	go func() { done <- src.Sweep(ctx, "", out); close(out) }()
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if err := <-done; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	for _, f := range findings {
+		if f.Source != "https://target.com/leak.js" {
+			t.Errorf("Source=%q want https://target.com/leak.js", f.Source)
+		}
+	}
+}
+
+func TestCommonCrawl_EnabledAlwaysTrue(t *testing.T) {
+	s := &CommonCrawlSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("expected Enabled=true")
+	}
+}
+
+func TestCommonCrawl_NameAndRate(t *testing.T) {
+	s := &CommonCrawlSource{}
+	if s.Name() != "commoncrawl" {
+		t.Errorf("unexpected name: %s", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("burst: %d", s.Burst())
+	}
+	if !s.RespectsRobots() {
+		t.Error("expected RespectsRobots=true")
+	}
+}
+
+func TestCommonCrawl_CtxCancelled(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("commoncrawl", 1000, 100)
+
+	src := &CommonCrawlSource{
+		BaseURL:   "http://127.0.0.1:1",
+		IndexName: "CC-MAIN-2024-10",
+		Registry:  reg,
+		Limiters:  lim,
+		Client:    NewClient(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 1)
+	err := src.Sweep(ctx, "", out)
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got %v", err)
+	}
+}
+
+func TestCommonCrawl_NilRegistryNoError(t *testing.T) {
+	src := &CommonCrawlSource{Client: NewClient()}
+	out := make(chan recon.Finding, 1)
+	if err := src.Sweep(context.Background(), "", out); err != nil {
+		t.Fatalf("expected nil, got %v", err)
+	}
+}
diff --git a/pkg/recon/sources/integration_test.go b/pkg/recon/sources/integration_test.go
index 5f07a16..cdde951 100644
--- a/pkg/recon/sources/integration_test.go
+++ b/pkg/recon/sources/integration_test.go
@@ -550,9 +550,16 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
 	// helm
 	eng.Register(&HelmSource{BaseURL: srv.URL + "/helm", Registry: reg, Limiters: lim, Client: NewClient()})
 
-	// Sanity: all 40 sources registered.
-	if n := len(eng.List()); n != 40 {
-		t.Fatalf("expected 40 sources on engine, got %d: %v", n, eng.List())
+	// --- Phase 14: Web archive sources ---
+
+	// wayback
+	eng.Register(&WaybackMachineSource{BaseURL: srv.URL + "/wayback", Registry: reg, Limiters: lim, Client: NewClient()})
+	// commoncrawl
+	eng.Register(&CommonCrawlSource{BaseURL: srv.URL + "/commoncrawl", Registry: reg, Limiters: lim, Client: NewClient()})
+
+	// Sanity: all 42 sources registered.
+	if n := len(eng.List()); n != 42 {
+		t.Fatalf("expected 42 sources on engine, got %d: %v", n, eng.List())
 	}
 
 	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -641,8 +648,8 @@ func TestRegisterAll_Phase12(t *testing.T) {
 	})
 
 	names := eng.List()
-	if n := len(names); n != 40 {
-		t.Fatalf("expected 40 sources from RegisterAll, got %d: %v", n, names)
+	if n := len(names); n != 42 {
+		t.Fatalf("expected 42 sources from RegisterAll, got %d: %v", n, names)
 	}
 
 	// Build lookup for source access.
diff --git a/pkg/recon/sources/register.go b/pkg/recon/sources/register.go
index 3d56340..7d9a5c6 100644
--- a/pkg/recon/sources/register.go
+++ b/pkg/recon/sources/register.go
@@ -56,8 +56,9 @@ type SourcesConfig struct {
 }
 
 // RegisterAll registers every Phase 10 code-hosting, Phase 11 search engine /
-// paste site, Phase 12 IoT scanner / cloud storage, and Phase 13 package
-// registry / container / IaC source on engine (40 sources total).
+// paste site, Phase 12 IoT scanner / cloud storage, Phase 13 package
+// registry / container / IaC, and Phase 14 web archive source on engine
+// (42 sources total).
 //
 // All sources are registered unconditionally so that cmd/recon.go can surface
 // the full catalog via `keyhunter recon list` regardless of which credentials
@@ -228,4 +229,8 @@ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
 	engine.Register(&KubernetesSource{Registry: reg, Limiters: lim})
 	engine.Register(&TerraformSource{Registry: reg, Limiters: lim})
 	engine.Register(&HelmSource{Registry: reg, Limiters: lim})
+
+	// Phase 14: Web archive sources (credentialless).
+	engine.Register(&WaybackMachineSource{Registry: reg, Limiters: lim})
+	engine.Register(&CommonCrawlSource{Registry: reg, Limiters: lim})
 }
diff --git a/pkg/recon/sources/register_test.go b/pkg/recon/sources/register_test.go
index 6d6d97c..44d07a1 100644
--- a/pkg/recon/sources/register_test.go
+++ b/pkg/recon/sources/register_test.go
@@ -16,9 +16,9 @@ func registerTestRegistry() *providers.Registry {
 	})
 }
 
-// TestRegisterAll_WiresAllFortySources asserts that RegisterAll registers
-// every Phase 10 + Phase 11 + Phase 12 + Phase 13 source by its stable name on a fresh engine.
-func TestRegisterAll_WiresAllFortySources(t *testing.T) {
+// TestRegisterAll_WiresAllFortyTwoSources asserts that RegisterAll registers
+// every Phase 10 + Phase 11 + Phase 12 + Phase 13 + Phase 14 source by its stable name on a fresh engine.
+func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
 	eng := recon.NewEngine()
 	cfg := SourcesConfig{
 		Registry: registerTestRegistry(),
@@ -36,6 +36,7 @@ func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
 		"censys",
 		"codeberg",
 		"codesandbox",
+		"commoncrawl",
 		"crates",
 		"dockerhub",
 		"duckduckgo",
@@ -66,6 +67,7 @@ func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
 		"shodan",
 		"spaces",
 		"terraform",
+		"wayback",
 		"yandex",
 		"zoomeye",
 	}
@@ -85,8 +87,8 @@ func TestRegisterAll_MissingCredsStillRegistered(t *testing.T) {
 		Limiters: recon.NewLimiterRegistry(),
 	})
 
-	if n := len(eng.List()); n != 40 {
-		t.Fatalf("expected 40 sources registered, got %d: %v", n, eng.List())
+	if n := len(eng.List()); n != 42 {
+		t.Fatalf("expected 42 sources registered, got %d: %v", n, eng.List())
 	}
 
 	// SweepAll with an empty config should filter out cred-gated sources
diff --git a/pkg/recon/sources/wayback.go b/pkg/recon/sources/wayback.go
new file mode 100644
index 0000000..82a5f74
--- /dev/null
+++ b/pkg/recon/sources/wayback.go
@@ -0,0 +1,126 @@
+package sources
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// WaybackMachineSource implements recon.ReconSource against the Wayback Machine
+// CDX Server API. It queries web.archive.org/cdx/search/cdx for historical
+// snapshots of pages matching provider keywords (e.g. domains known to host
+// API key documentation or configuration files).
+//
+// RECON-ARCH-01: Each matching CDX record yields a Finding pointing at the
+// archived snapshot URL. The source is credentialless and always enabled.
+type WaybackMachineSource struct {
+	// BaseURL defaults to https://web.archive.org. Tests override with httptest URL.
+	BaseURL string
+	// Registry drives the keyword query list via BuildQueries.
+	Registry *providers.Registry
+	// Limiters is the shared recon.LimiterRegistry.
+	Limiters *recon.LimiterRegistry
+	// Client is the shared retry HTTP wrapper. If nil, a default is used.
+	Client *Client
+}
+
+// Compile-time assertion that WaybackMachineSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*WaybackMachineSource)(nil)
+
+func (s *WaybackMachineSource) Name() string          { return "wayback" }
+func (s *WaybackMachineSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) }
+func (s *WaybackMachineSource) Burst() int            { return 1 }
+func (s *WaybackMachineSource) RespectsRobots() bool  { return true }
+
+// Enabled always returns true: CDX API is unauthenticated.
+func (s *WaybackMachineSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep iterates provider keywords, queries the CDX API for each, and emits
+// a Finding for every archived snapshot URL returned. The CDX API returns
+// plain-text lines with space-separated fields; we extract the original URL
+// and timestamp to construct the full Wayback snapshot link.
+func (s *WaybackMachineSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://web.archive.org"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "wayback")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		// CDX API: output=text, fl=timestamp,original limits response to two fields per line.
+		// limit=50 keeps the response bounded per keyword.
+		endpoint := fmt.Sprintf("%s/cdx/search/cdx?url=*&output=text&fl=timestamp,original&limit=50&matchType=prefix&filter=statuscode:200&query=%s",
+			base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("wayback: build req: %w", err)
+		}
+		req.Header.Set("Accept", "text/plain")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			// Non-fatal: skip this keyword on transient errors.
+			continue
+		}
+
+		scanner := bufio.NewScanner(resp.Body)
+		for scanner.Scan() {
+			line := strings.TrimSpace(scanner.Text())
+			if line == "" {
+				continue
+			}
+			// CDX text output: "timestamp original-url"
+			parts := strings.SplitN(line, " ", 2)
+			if len(parts) < 2 {
+				continue
+			}
+			ts := parts[0]
+			origURL := parts[1]
+
+			snapshotURL := fmt.Sprintf("%s/web/%s/%s", base, ts, origURL)
+
+			f := recon.Finding{
+				ProviderName: "",
+				Source:       snapshotURL,
+				SourceType:   "recon:wayback",
+				Confidence:   "low",
+				DetectedAt:   time.Now(),
+			}
+			select {
+			case out <- f:
+			case <-ctx.Done():
+				_ = resp.Body.Close()
+				return ctx.Err()
+			}
+		}
+		_ = resp.Body.Close()
+	}
+	return nil
+}
diff --git a/pkg/recon/sources/wayback_test.go b/pkg/recon/sources/wayback_test.go
new file mode 100644
index 0000000..e2ccd6a
--- /dev/null
+++ b/pkg/recon/sources/wayback_test.go
@@ -0,0 +1,168 @@
+package sources
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func waybackStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
+	t.Helper()
+	return func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(calls, 1)
+		if r.URL.Path != "/cdx/search/cdx" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		if r.URL.Query().Get("query") == "" {
+			t.Errorf("missing query param")
+		}
+		w.Header().Set("Content-Type", "text/plain")
+		// Two CDX records per query: "timestamp original-url"
+		fmt.Fprintln(w, "20230101120000 https://example.com/config.js")
+		fmt.Fprintln(w, "20230615080000 https://example.com/env.json")
+	}
+}
+
+func TestWayback_SweepEmitsFindings(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("wayback", 1000, 100)
+
+	var calls int32
+	srv := httptest.NewServer(waybackStubHandler(t, &calls))
+	defer srv.Close()
+
+	src := &WaybackMachineSource{
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Limiters: lim,
+		Client:   NewClient(),
+	}
+
+	out := make(chan recon.Finding, 32)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	done := make(chan error, 1)
+	go func() { done <- src.Sweep(ctx, "", out); close(out) }()
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if err := <-done; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	// 2 keywords * 2 results = 4 findings
+	if len(findings) != 4 {
+		t.Fatalf("expected 4 findings, got %d", len(findings))
+	}
+	for _, f := range findings {
+		if f.SourceType != "recon:wayback" {
+			t.Errorf("SourceType=%q want recon:wayback", f.SourceType)
+		}
+	}
+	if got := atomic.LoadInt32(&calls); got != 2 {
+		t.Errorf("expected 2 server calls, got %d", got)
+	}
+}
+
+func TestWayback_SnapshotURL(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("wayback", 1000, 100)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		fmt.Fprintln(w, "20240101000000 https://target.com/page")
+	}))
+	defer srv.Close()
+
+	src := &WaybackMachineSource{
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Limiters: lim,
+		Client:   NewClient(),
+	}
+
+	out := make(chan recon.Finding, 32)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	done := make(chan error, 1)
+	go func() { done <- src.Sweep(ctx, "", out); close(out) }()
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if err := <-done; err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	// Each finding should have a proper Wayback snapshot URL
+	for _, f := range findings {
+		want := srv.URL + "/web/20240101000000/https://target.com/page"
+		if f.Source != want {
+			t.Errorf("Source=%q want %q", f.Source, want)
+		}
+	}
+}
+
+func TestWayback_EnabledAlwaysTrue(t *testing.T) {
+	s := &WaybackMachineSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("expected Enabled=true")
+	}
+}
+
+func TestWayback_NameAndRate(t *testing.T) {
+	s := &WaybackMachineSource{}
+	if s.Name() != "wayback" {
+		t.Errorf("unexpected name: %s", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("burst: %d", s.Burst())
+	}
+	if !s.RespectsRobots() {
+		t.Error("expected RespectsRobots=true")
+	}
+}
+
+func TestWayback_CtxCancelled(t *testing.T) {
+	reg := syntheticRegistry()
+	lim := recon.NewLimiterRegistry()
+	_ = lim.For("wayback", 1000, 100)
+
+	src := &WaybackMachineSource{
+		BaseURL:  "http://127.0.0.1:1",
+		Registry: reg,
+		Limiters: lim,
+		Client:   NewClient(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 1)
+	err := src.Sweep(ctx, "", out)
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got %v", err)
+	}
+}
+
+func TestWayback_NilRegistryNoError(t *testing.T) {
+	src := &WaybackMachineSource{Client: NewClient()}
+	out := make(chan recon.Finding, 1)
+	if err := src.Sweep(context.Background(), "", out); err != nil {
+		t.Fatalf("expected nil, got %v", err)
+	}
+}