feat(14-02): add WaybackMachine + CommonCrawl recon sources

- WaybackMachineSource queries CDX API for historical snapshots
- CommonCrawlSource queries CC Index API for matching pages
- Both are credentialless, rate-limited to 1 req/5s, and set RespectsRobots=true
- RegisterAll extended to 42 sources (40 Phase 10-13 + 2 Phase 14)
- Full httptest-based test coverage for both sources
This commit is contained in:
salvacybersec
2026-04-06 13:16:13 +03:00
parent dc90785ab0
commit c5332454b0
7 changed files with 626 additions and 12 deletions

View File

@@ -550,9 +550,16 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
// helm
eng.Register(&HelmSource{BaseURL: srv.URL + "/helm", Registry: reg, Limiters: lim, Client: NewClient()})
// Sanity: all 40 sources registered.
if n := len(eng.List()); n != 40 {
t.Fatalf("expected 40 sources on engine, got %d: %v", n, eng.List())
// --- Phase 14: Web archive sources ---
// wayback
eng.Register(&WaybackMachineSource{BaseURL: srv.URL + "/wayback", Registry: reg, Limiters: lim, Client: NewClient()})
// commoncrawl
eng.Register(&CommonCrawlSource{BaseURL: srv.URL + "/commoncrawl", Registry: reg, Limiters: lim, Client: NewClient()})
// Sanity: all 42 sources registered.
if n := len(eng.List()); n != 42 {
t.Fatalf("expected 42 sources on engine, got %d: %v", n, eng.List())
}
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -641,8 +648,8 @@ func TestRegisterAll_Phase12(t *testing.T) {
})
names := eng.List()
if n := len(names); n != 40 {
t.Fatalf("expected 40 sources from RegisterAll, got %d: %v", n, names)
if n := len(names); n != 42 {
t.Fatalf("expected 42 sources from RegisterAll, got %d: %v", n, names)
}
// Build lookup for source access.