fix: resolve Phase 14 merge conflicts across CI/CD, archive, and frontend sources

This commit is contained in:
salvacybersec
2026-04-06 13:42:54 +03:00
parent 27624e0ec7
commit 4246db8294
14 changed files with 0 additions and 1311 deletions

View File

@@ -1,21 +1,12 @@
package sources
import (
<<<<<<< HEAD
"bufio"
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
=======
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
>>>>>>> worktree-agent-adad8c10
"time"
"golang.org/x/time/rate"
@@ -24,50 +15,6 @@ import (
"github.com/salvacybersec/keyhunter/pkg/recon"
)
<<<<<<< HEAD
// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
// Index Server API. It queries index.commoncrawl.org for pages matching
// provider keywords in the CC index.
//
// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
// original URL discovered in the crawl. The source is credentialless and
// always enabled.
//
// The zero value is usable: empty BaseURL/IndexName fall back to production
// defaults and a nil Client is replaced with a default inside Sweep.
type CommonCrawlSource struct {
	// BaseURL defaults to https://index.commoncrawl.org. Tests override with httptest URL.
	BaseURL string
	// IndexName defaults to CC-MAIN-2024-10 (recent crawl). Tests may override.
	IndexName string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, a default is used.
	Client *Client
}

// Compile-time assertion that CommonCrawlSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*CommonCrawlSource)(nil)
// Name reports the identifier under which this source is registered.
func (s *CommonCrawlSource) Name() string {
	return "commoncrawl"
}

// RateLimit permits one index request every five seconds.
func (s *CommonCrawlSource) RateLimit() rate.Limit {
	return rate.Every(5 * time.Second)
}

// Burst caps the limiter at a single in-flight request.
func (s *CommonCrawlSource) Burst() int {
	return 1
}

// RespectsRobots reports that this source honors robots.txt.
func (s *CommonCrawlSource) RespectsRobots() bool {
	return true
}

// Enabled is unconditionally true: the CommonCrawl index is unauthenticated,
// so no credentials from the config are required.
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool {
	return true
}
// Sweep iterates provider keywords, queries the CC index for each, and emits
// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
// object per line) with fields like url, timestamp, status, mime, etc.
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
base := s.BaseURL
if base == "" {
base = "https://index.commoncrawl.org"
}
idx := s.IndexName
if idx == "" {
idx = "CC-MAIN-2024-10"
=======
// CommonCrawlSource searches the Common Crawl index for web pages that may
// contain leaked API keys. Common Crawl archives petabytes of web content;
// its CDX API allows searching by URL pattern to find pages that historically
@@ -101,7 +48,6 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco
base := s.BaseURL
if base == "" {
base = "https://index.commoncrawl.org/CC-MAIN-2024-10-index"
>>>>>>> worktree-agent-adad8c10
}
client := s.Client
if client == nil {
@@ -124,49 +70,16 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco
}
}
<<<<<<< HEAD
// CC Index API: output=json returns NDJSON, limit=50 bounds the response.
endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s",
base, idx, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("commoncrawl: build req: %w", err)
=======
// CDX API: search for URLs matching the query.
searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, q)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
if err != nil {
continue
>>>>>>> worktree-agent-adad8c10
}
req.Header.Set("Accept", "application/json")
resp, err := client.Do(ctx, req)
if err != nil {
<<<<<<< HEAD
// Non-fatal: skip this keyword on transient errors.
continue
}
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
line := scanner.Bytes()
if len(line) == 0 {
continue
}
var rec ccIndexRecord
if err := json.Unmarshal(line, &rec); err != nil {
continue
}
if rec.URL == "" {
continue
}
f := recon.Finding{
ProviderName: "",
Source: rec.URL,
=======
continue
}
@@ -197,35 +110,11 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco
out <- recon.Finding{
ProviderName: q,
Source: r.URL,
>>>>>>> worktree-agent-adad8c10
SourceType: "recon:commoncrawl",
Confidence: "low",
DetectedAt: time.Now(),
}
<<<<<<< HEAD
select {
case out <- f:
case <-ctx.Done():
_ = resp.Body.Close()
return ctx.Err()
}
}
_ = resp.Body.Close()
}
return nil
}
// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index
// API that this source consumes. Additional fields (mime, status, digest, etc.)
// are ignored to keep the decoder tolerant.
type ccIndexRecord struct {
	// URL is the original crawled page URL; the only field Sweep requires
	// to emit a Finding.
	URL string `json:"url"`
	// Timestamp is the capture timestamp from the index record
	// (presumably the CDX 14-digit form — TODO confirm against the API).
	Timestamp string `json:"timestamp"`
	// Status is the HTTP status recorded for the capture, returned by the
	// API as a string rather than an integer.
	Status string `json:"status"`
}
=======
}
}
return nil
}
>>>>>>> worktree-agent-adad8c10