fix: resolve Phase 14 merge conflicts across CI/CD, archive, and frontend sources

This commit is contained in:
salvacybersec
2026-04-06 13:42:54 +03:00
parent 27624e0ec7
commit 4246db8294
14 changed files with 0 additions and 1311 deletions

View File

@@ -1,21 +1,12 @@
package sources
import (
<<<<<<< HEAD
"bufio"
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
=======
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
>>>>>>> worktree-agent-adad8c10
"time"
"golang.org/x/time/rate"
@@ -24,50 +15,6 @@ import (
"github.com/salvacybersec/keyhunter/pkg/recon"
)
<<<<<<< HEAD
// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
// Index Server API. It queries index.commoncrawl.org for pages matching
// provider keywords in the CC index.
//
// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
// original URL discovered in the crawl. The source is credentialless and
// always enabled.
//
// The zero value is usable: empty BaseURL/IndexName fall back to production
// defaults and a nil Client is replaced with a default inside Sweep.
type CommonCrawlSource struct {
	// BaseURL defaults to https://index.commoncrawl.org. Tests override with httptest URL.
	BaseURL string
	// IndexName defaults to CC-MAIN-2024-10 (recent crawl). Tests may override.
	IndexName string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, a default is used.
	Client *Client
}

// Compile-time assertion that CommonCrawlSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*CommonCrawlSource)(nil)
// Name reports the identifier under which this source is registered.
func (s *CommonCrawlSource) Name() string {
	return "commoncrawl"
}

// RateLimit permits one index request every five seconds.
func (s *CommonCrawlSource) RateLimit() rate.Limit {
	return rate.Every(5 * time.Second)
}

// Burst caps the limiter at a single in-flight request.
func (s *CommonCrawlSource) Burst() int {
	return 1
}

// RespectsRobots reports that this source honors robots.txt.
func (s *CommonCrawlSource) RespectsRobots() bool {
	return true
}

// Enabled is unconditionally true: the CommonCrawl index is unauthenticated,
// so no credentials from the config are required.
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool {
	return true
}
// Sweep iterates provider keywords, queries the CC index for each, and emits
// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
// object per line) with fields like url, timestamp, status, mime, etc.
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
base := s.BaseURL
if base == "" {
base = "https://index.commoncrawl.org"
}
idx := s.IndexName
if idx == "" {
idx = "CC-MAIN-2024-10"
=======
// CommonCrawlSource searches the Common Crawl index for web pages that may
// contain leaked API keys. Common Crawl archives petabytes of web content;
// its CDX API allows searching by URL pattern to find pages that historically
@@ -101,7 +48,6 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco
base := s.BaseURL
if base == "" {
base = "https://index.commoncrawl.org/CC-MAIN-2024-10-index"
>>>>>>> worktree-agent-adad8c10
}
client := s.Client
if client == nil {
@@ -124,49 +70,16 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco
}
}
<<<<<<< HEAD
// CC Index API: output=json returns NDJSON, limit=50 bounds the response.
endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s",
base, idx, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("commoncrawl: build req: %w", err)
=======
// CDX API: search for URLs matching the query.
searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, q)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
if err != nil {
continue
>>>>>>> worktree-agent-adad8c10
}
req.Header.Set("Accept", "application/json")
resp, err := client.Do(ctx, req)
if err != nil {
<<<<<<< HEAD
// Non-fatal: skip this keyword on transient errors.
continue
}
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
line := scanner.Bytes()
if len(line) == 0 {
continue
}
var rec ccIndexRecord
if err := json.Unmarshal(line, &rec); err != nil {
continue
}
if rec.URL == "" {
continue
}
f := recon.Finding{
ProviderName: "",
Source: rec.URL,
=======
continue
}
@@ -197,35 +110,11 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco
out <- recon.Finding{
ProviderName: q,
Source: r.URL,
>>>>>>> worktree-agent-adad8c10
SourceType: "recon:commoncrawl",
Confidence: "low",
DetectedAt: time.Now(),
}
<<<<<<< HEAD
select {
case out <- f:
case <-ctx.Done():
_ = resp.Body.Close()
return ctx.Err()
}
}
_ = resp.Body.Close()
}
return nil
}
// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index
// API that this source consumes. Additional fields (mime, status, digest, etc.)
// are ignored to keep the decoder tolerant.
type ccIndexRecord struct {
	// URL is the original crawled page URL; the only field Sweep requires
	// to emit a Finding.
	URL string `json:"url"`
	// Timestamp is the capture timestamp from the index record
	// (presumably the CDX 14-digit form — TODO confirm against the API).
	Timestamp string `json:"timestamp"`
	// Status is the HTTP status recorded for the capture, returned by the
	// API as a string rather than an integer.
	Status string `json:"status"`
}
=======
}
}
return nil
}
>>>>>>> worktree-agent-adad8c10