feat(14-02): add WaybackMachine + CommonCrawl recon sources

- WaybackMachineSource queries CDX API for historical snapshots
- CommonCrawlSource queries CC Index API for matching pages
- Both credentialless, rate-limited at 1 req/5s, RespectsRobots=true
- RegisterAll extended to 42 sources (40 Phase 10-13 + 2 Phase 14)
- Full httptest-based test coverage for both sources
This commit is contained in:
salvacybersec
2026-04-06 13:16:13 +03:00
parent dc90785ab0
commit c5332454b0
7 changed files with 626 additions and 12 deletions

View File

@@ -56,8 +56,9 @@ type SourcesConfig struct {
}
// RegisterAll registers every Phase 10 code-hosting, Phase 11 search engine /
-// paste site, Phase 12 IoT scanner / cloud storage, and Phase 13 package
-// registry / container / IaC source on engine (40 sources total).
+// paste site, Phase 12 IoT scanner / cloud storage, Phase 13 package
+// registry / container / IaC, and Phase 14 web archive source on engine
+// (42 sources total).
//
// All sources are registered unconditionally so that cmd/recon.go can surface
// the full catalog via `keyhunter recon list` regardless of which credentials
@@ -228,4 +229,8 @@ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
engine.Register(&KubernetesSource{Registry: reg, Limiters: lim})
engine.Register(&TerraformSource{Registry: reg, Limiters: lim})
engine.Register(&HelmSource{Registry: reg, Limiters: lim})
+// Phase 14: Web archive sources (credentialless).
+engine.Register(&WaybackMachineSource{Registry: reg, Limiters: lim})
+engine.Register(&CommonCrawlSource{Registry: reg, Limiters: lim})
}