feat(14-02): add WaybackMachine + CommonCrawl recon sources
- WaybackMachineSource queries the CDX API for historical snapshots
- CommonCrawlSource queries the CC Index API for matching pages
- Both are credentialless, rate-limited at 1 req/5s, with RespectsRobots=true
- RegisterAll extended to 42 sources (40 from Phases 10-13 + 2 from Phase 14)
- Full httptest-based test coverage for both sources
This commit is contained in:
@@ -550,9 +550,16 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
|
||||
// helm
|
||||
eng.Register(&HelmSource{BaseURL: srv.URL + "/helm", Registry: reg, Limiters: lim, Client: NewClient()})
|
||||
|
||||
// Sanity: all 40 sources registered.
|
||||
if n := len(eng.List()); n != 40 {
|
||||
t.Fatalf("expected 40 sources on engine, got %d: %v", n, eng.List())
|
||||
// --- Phase 14: Web archive sources ---
|
||||
|
||||
// wayback
|
||||
eng.Register(&WaybackMachineSource{BaseURL: srv.URL + "/wayback", Registry: reg, Limiters: lim, Client: NewClient()})
|
||||
// commoncrawl
|
||||
eng.Register(&CommonCrawlSource{BaseURL: srv.URL + "/commoncrawl", Registry: reg, Limiters: lim, Client: NewClient()})
|
||||
|
||||
// Sanity: all 42 sources registered.
|
||||
if n := len(eng.List()); n != 42 {
|
||||
t.Fatalf("expected 42 sources on engine, got %d: %v", n, eng.List())
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
@@ -641,8 +648,8 @@ func TestRegisterAll_Phase12(t *testing.T) {
|
||||
})
|
||||
|
||||
names := eng.List()
|
||||
if n := len(names); n != 40 {
|
||||
t.Fatalf("expected 40 sources from RegisterAll, got %d: %v", n, names)
|
||||
if n := len(names); n != 42 {
|
||||
t.Fatalf("expected 42 sources from RegisterAll, got %d: %v", n, names)
|
||||
}
|
||||
|
||||
// Build lookup for source access.
|
||||
|
||||
Reference in New Issue
Block a user