feat(14-02): add WaybackMachine + CommonCrawl recon sources

- WaybackMachineSource queries CDX API for historical snapshots
- CommonCrawlSource queries CC Index API for matching pages
- Both are credentialless, rate-limited to 1 req/5s, and set RespectsRobots=true
- RegisterAll extended to 42 sources (40 Phase 10-13 + 2 Phase 14)
- Full httptest-based test coverage for both sources
This commit is contained in:
salvacybersec
2026-04-06 13:16:13 +03:00
parent dc90785ab0
commit c5332454b0
7 changed files with 626 additions and 12 deletions

View File

@@ -16,9 +16,9 @@ func registerTestRegistry() *providers.Registry {
})
}
// TestRegisterAll_WiresAllFortySources asserts that RegisterAll registers
// every Phase 10 + Phase 11 + Phase 12 + Phase 13 source by its stable name on a fresh engine.
func TestRegisterAll_WiresAllFortySources(t *testing.T) {
// TestRegisterAll_WiresAllFortyTwoSources asserts that RegisterAll registers
// every Phase 10 + Phase 11 + Phase 12 + Phase 13 + Phase 14 source by its stable name on a fresh engine.
func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
eng := recon.NewEngine()
cfg := SourcesConfig{
Registry: registerTestRegistry(),
@@ -36,6 +36,7 @@ func TestRegisterAll_WiresAllFortySources(t *testing.T) {
"censys",
"codeberg",
"codesandbox",
"commoncrawl",
"crates",
"dockerhub",
"duckduckgo",
@@ -66,6 +67,7 @@ func TestRegisterAll_WiresAllFortySources(t *testing.T) {
"shodan",
"spaces",
"terraform",
"wayback",
"yandex",
"zoomeye",
}
@@ -85,8 +87,8 @@ func TestRegisterAll_MissingCredsStillRegistered(t *testing.T) {
Limiters: recon.NewLimiterRegistry(),
})
if n := len(eng.List()); n != 40 {
t.Fatalf("expected 40 sources registered, got %d: %v", n, eng.List())
if n := len(eng.List()); n != 42 {
t.Fatalf("expected 42 sources registered, got %d: %v", n, eng.List())
}
// SweepAll with an empty config should filter out cred-gated sources