diff --git a/cmd/recon.go b/cmd/recon.go index efc5aa5..49a285b 100644 --- a/cmd/recon.go +++ b/cmd/recon.go @@ -167,11 +167,7 @@ func buildReconEngine() *recon.Engine { FOFAAPIKey: firstNonEmpty(os.Getenv("FOFA_API_KEY"), viper.GetString("recon.fofa.api_key")), NetlasAPIKey: firstNonEmpty(os.Getenv("NETLAS_API_KEY"), viper.GetString("recon.netlas.api_key")), BinaryEdgeAPIKey: firstNonEmpty(os.Getenv("BINARYEDGE_API_KEY"), viper.GetString("recon.binaryedge.api_key")), -<<<<<<< HEAD - CircleCIToken: firstNonEmpty(os.Getenv("CIRCLECI_TOKEN"), viper.GetString("recon.circleci.token")), -======= CircleCIToken: firstNonEmpty(os.Getenv("CIRCLECI_TOKEN"), viper.GetString("recon.circleci.token")), ->>>>>>> worktree-agent-adad8c10 } sources.RegisterAll(e, cfg) return e diff --git a/pkg/recon/sources/circleci.go b/pkg/recon/sources/circleci.go index 57befec..e4a44cb 100644 --- a/pkg/recon/sources/circleci.go +++ b/pkg/recon/sources/circleci.go @@ -4,13 +4,8 @@ import ( "context" "encoding/json" "fmt" -<<<<<<< HEAD - "net/http" - "strings" -======= "io" "net/http" ->>>>>>> worktree-agent-adad8c10 "time" "golang.org/x/time/rate" @@ -19,16 +14,10 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -// CircleCISource searches public CircleCI build logs for leaked API keys. -// It queries the CircleCI v2 API for recent pipeline workflows. Requires a -// CircleCI API token for authenticated access. -======= // CircleCISource scrapes CircleCI build logs for leaked API keys. // CircleCI exposes build logs via its API; a personal API token is required // to access build artifacts and logs. Misconfigured pipelines often leak // secrets in build output. ->>>>>>> worktree-agent-adad8c10 type CircleCISource struct { Token string BaseURL string @@ -39,32 +28,6 @@ type CircleCISource struct { var _ recon.ReconSource = (*CircleCISource)(nil) -<<<<<<< HEAD -func (s *CircleCISource) Name() string { return "circleci" } -func (s *CircleCISource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } -func (s *CircleCISource) Burst() int { return 2 } -func (s *CircleCISource) RespectsRobots() bool { return false } -func (s *CircleCISource) Enabled(_ recon.Config) bool { return s.Token != "" } - -type circlePipelineResponse struct { - Items []circlePipeline `json:"items"` -} - -type circlePipeline struct { - ID string `json:"id"` - VCS circleVCSInfo `json:"vcs"` -} - -type circleVCSInfo struct { - ProviderName string `json:"provider_name"` - RepoName string `json:"target_repository_url"` -} - -func (s *CircleCISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { - if s.Token == "" { - return nil - } -======= func (s *CircleCISource) Name() string { return "circleci" } func (s *CircleCISource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } func (s *CircleCISource) Burst() int { return 2 } @@ -84,7 +47,6 @@ type circleciPipeline struct { } func (s *CircleCISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { ->>>>>>> worktree-agent-adad8c10 base := s.BaseURL if base == "" { base = "https://circleci.com/api/v2" @@ -95,78 +57,32 @@ func (s *CircleCISource) Sweep(ctx context.Context, _ string, out chan<- recon.F } queries := BuildQueries(s.Registry, "circleci") -<<<<<<< HEAD - kwIndex := circleKeywordIndex(s.Registry) -======= if len(queries) == 0 { return nil } ->>>>>>> worktree-agent-adad8c10 for _, q := range queries { if err := ctx.Err(); err != nil { return err } -<<<<<<< HEAD -======= ->>>>>>> worktree-agent-adad8c10 if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } -<<<<<<< HEAD - endpoint := fmt.Sprintf("%s/pipeline?mine=false", base) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) - if err != nil { - return fmt.Errorf("circleci: build request: %w", err) -======= // Search for pipelines by project slug (query is used as slug hint). searchURL := fmt.Sprintf("%s/project/gh/%s/pipeline?limit=5", base, q) req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) if err != nil { continue ->>>>>>> worktree-agent-adad8c10 } req.Header.Set("Circle-Token", s.Token) req.Header.Set("Accept", "application/json") resp, err := client.Do(ctx, req) if err != nil { -<<<<<<< HEAD - if strings.Contains(err.Error(), "unauthorized") { - return err - } - continue - } - - var result circlePipelineResponse - decErr := json.NewDecoder(resp.Body).Decode(&result) - _ = resp.Body.Close() - if decErr != nil { - continue - } - - provName := kwIndex[strings.ToLower(q)] - for _, p := range result.Items { - source := fmt.Sprintf("https://app.circleci.com/pipelines/%s", p.ID) - if p.VCS.RepoName != "" { - source = p.VCS.RepoName - } - f := recon.Finding{ - ProviderName: provName, - Confidence: "low", - Source: source, - SourceType: "recon:circleci", - DetectedAt: time.Now(), - } - select { - case out <- f: - case <-ctx.Done(): - return ctx.Err() -======= continue } @@ -216,30 +132,8 @@ func (s *CircleCISource) Sweep(ctx context.Context, _ string, out chan<- recon.F Confidence: "medium", DetectedAt: time.Now(), } ->>>>>>> worktree-agent-adad8c10 } } } return nil } -<<<<<<< HEAD - -func circleKeywordIndex(reg *providers.Registry) map[string]string { - m := make(map[string]string) - if reg == nil { - return m - } - for _, p := range reg.List() { - for _, k := range p.Keywords { - kl := strings.ToLower(strings.TrimSpace(k)) - if kl != "" { - if _, exists := m[kl]; !exists { - m[kl] = p.Name - } - } - } - } - return m -} -======= ->>>>>>> worktree-agent-adad8c10 diff --git a/pkg/recon/sources/circleci_test.go b/pkg/recon/sources/circleci_test.go index 4bc27c0..357d055 100644 --- a/pkg/recon/sources/circleci_test.go +++ b/pkg/recon/sources/circleci_test.go @@ -11,52 +11,6 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -const circleFixtureJSON = `{ - "items": [ - { - "id": "pipeline-uuid-1", - "vcs": {"provider_name": "github", "target_repository_url": "https://github.com/alice/repo"} - }, - { - "id": "pipeline-uuid-2", - "vcs": {"provider_name": "github", "target_repository_url": ""} - } - ] -}` - -func TestCircleCI_Sweep_ExtractsFindings(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/pipeline" { - t.Errorf("unexpected path: %s", r.URL.Path) - } - if r.Header.Get("Circle-Token") == "" { - t.Error("missing Circle-Token header") - } - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write([]byte(circleFixtureJSON)) - })) - defer srv.Close() - - src := &CircleCISource{ - Token: "test-token", - BaseURL: srv.URL, - Registry: providers.NewRegistryFromProviders([]providers.Provider{ - {Name: "openai", Keywords: []string{"sk-proj-"}}, - }), - Limiters: recon.NewLimiterRegistry(), - Client: NewClient(), - } - - out := make(chan recon.Finding, 16) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if err := src.Sweep(ctx, "", out); err != nil { - t.Fatalf("Sweep err: %v", err) - } - close(out) -======= func TestCircleCI_Name(t *testing.T) { s := &CircleCISource{} if s.Name() != "circleci" { @@ -110,57 +64,15 @@ Tests completed successfully`)) if err != nil { t.Fatalf("Sweep error: %v", err) } ->>>>>>> worktree-agent-adad8c10 var findings []recon.Finding for f := range out { findings = append(findings, f) } -<<<<<<< HEAD - if len(findings) != 2 { - t.Fatalf("expected 2 findings, got %d", len(findings)) - } - // First pipeline has VCS URL, second falls back to app URL. - if findings[0].Source != "https://github.com/alice/repo" { - t.Errorf("unexpected source[0]: %s", findings[0].Source) - } - if findings[1].Source != "https://app.circleci.com/pipelines/pipeline-uuid-2" { - t.Errorf("unexpected source[1]: %s", findings[1].Source) - } - for _, f := range findings { - if f.SourceType != "recon:circleci" { - t.Errorf("unexpected SourceType: %s", f.SourceType) - } - } -} - -func TestCircleCI_EnabledOnlyWithToken(t *testing.T) { - s := &CircleCISource{} - if s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=false without token") - } - s.Token = "test" - if !s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=true with token") - } -} - -func TestCircleCI_NameAndRate(t *testing.T) { - s := &CircleCISource{} - if s.Name() != "circleci" { - t.Errorf("unexpected name: %s", s.Name()) - } - if s.Burst() != 2 { - t.Errorf("burst: %d", s.Burst()) - } - if s.RespectsRobots() { - t.Error("expected RespectsRobots=false") -======= if len(findings) == 0 { t.Fatal("expected at least one finding from CircleCI pipeline log") } if findings[0].SourceType != "recon:circleci" { t.Fatalf("expected recon:circleci, got %s", findings[0].SourceType) ->>>>>>> worktree-agent-adad8c10 } } diff --git a/pkg/recon/sources/commoncrawl.go b/pkg/recon/sources/commoncrawl.go index c40324b..ec88de0 100644 --- a/pkg/recon/sources/commoncrawl.go +++ b/pkg/recon/sources/commoncrawl.go @@ -1,21 +1,12 @@ package sources import ( -<<<<<<< HEAD - "bufio" - "context" - "encoding/json" - "fmt" - "net/http" - "net/url" -======= "bytes" "context" "encoding/json" "fmt" "io" "net/http" ->>>>>>> worktree-agent-adad8c10 "time" "golang.org/x/time/rate" @@ -24,50 +15,6 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -// CommonCrawlSource implements recon.ReconSource against the CommonCrawl -// Index Server API. It queries index.commoncrawl.org for pages matching -// provider keywords in the CC index. -// -// RECON-ARCH-02: Each matching index record yields a Finding pointing at the -// original URL discovered in the crawl. The source is credentialless and -// always enabled. -type CommonCrawlSource struct { - // BaseURL defaults to https://index.commoncrawl.org. Tests override with httptest URL. - BaseURL string - // IndexName defaults to CC-MAIN-2024-10 (recent crawl). Tests may override. - IndexName string - // Registry drives the keyword query list via BuildQueries. - Registry *providers.Registry - // Limiters is the shared recon.LimiterRegistry. - Limiters *recon.LimiterRegistry - // Client is the shared retry HTTP wrapper. If nil, a default is used. - Client *Client -} - -// Compile-time assertion that CommonCrawlSource satisfies recon.ReconSource. -var _ recon.ReconSource = (*CommonCrawlSource)(nil) - -func (s *CommonCrawlSource) Name() string { return "commoncrawl" } -func (s *CommonCrawlSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) } -func (s *CommonCrawlSource) Burst() int { return 1 } -func (s *CommonCrawlSource) RespectsRobots() bool { return true } - -// Enabled always returns true: CommonCrawl index is unauthenticated. -func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true } - -// Sweep iterates provider keywords, queries the CC index for each, and emits -// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON -// object per line) with fields like url, timestamp, status, mime, etc. -func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { - base := s.BaseURL - if base == "" { - base = "https://index.commoncrawl.org" - } - idx := s.IndexName - if idx == "" { - idx = "CC-MAIN-2024-10" -======= // CommonCrawlSource searches the Common Crawl index for web pages that may // contain leaked API keys. Common Crawl archives petabytes of web content; // its CDX API allows searching by URL pattern to find pages that historically @@ -101,7 +48,6 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco base := s.BaseURL if base == "" { base = "https://index.commoncrawl.org/CC-MAIN-2024-10-index" ->>>>>>> worktree-agent-adad8c10 } client := s.Client if client == nil { @@ -124,49 +70,16 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco } } -<<<<<<< HEAD - // CC Index API: output=json returns NDJSON, limit=50 bounds the response. - endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s", - base, idx, url.QueryEscape(q)) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) - if err != nil { - return fmt.Errorf("commoncrawl: build req: %w", err) -======= // CDX API: search for URLs matching the query. searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, q) req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) if err != nil { continue ->>>>>>> worktree-agent-adad8c10 } req.Header.Set("Accept", "application/json") resp, err := client.Do(ctx, req) if err != nil { -<<<<<<< HEAD - // Non-fatal: skip this keyword on transient errors. - continue - } - - scanner := bufio.NewScanner(resp.Body) - for scanner.Scan() { - line := scanner.Bytes() - if len(line) == 0 { - continue - } - - var rec ccIndexRecord - if err := json.Unmarshal(line, &rec); err != nil { - continue - } - if rec.URL == "" { - continue - } - - f := recon.Finding{ - ProviderName: "", - Source: rec.URL, -======= continue } @@ -197,35 +110,11 @@ func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- reco out <- recon.Finding{ ProviderName: q, Source: r.URL, ->>>>>>> worktree-agent-adad8c10 SourceType: "recon:commoncrawl", Confidence: "low", DetectedAt: time.Now(), } -<<<<<<< HEAD - select { - case out <- f: - case <-ctx.Done(): - _ = resp.Body.Close() - return ctx.Err() - } - } - _ = resp.Body.Close() - } - return nil -} - -// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index -// API that this source consumes. Additional fields (mime, status, digest, etc.) -// are ignored to keep the decoder tolerant. -type ccIndexRecord struct { - URL string `json:"url"` - Timestamp string `json:"timestamp"` - Status string `json:"status"` -} -======= } } return nil } ->>>>>>> worktree-agent-adad8c10 diff --git a/pkg/recon/sources/commoncrawl_test.go b/pkg/recon/sources/commoncrawl_test.go index bf18610..a29fcb6 100644 --- a/pkg/recon/sources/commoncrawl_test.go +++ b/pkg/recon/sources/commoncrawl_test.go @@ -2,170 +2,6 @@ package sources import ( "context" -<<<<<<< HEAD - "encoding/json" - "errors" - "net/http" - "net/http/httptest" - "sync/atomic" - "testing" - "time" - - "github.com/salvacybersec/keyhunter/pkg/recon" -) - -func commonCrawlStubHandler(t *testing.T, calls *int32) http.HandlerFunc { - t.Helper() - return func(w http.ResponseWriter, r *http.Request) { - atomic.AddInt32(calls, 1) - if r.URL.Query().Get("query") == "" { - t.Errorf("missing query param") - } - w.Header().Set("Content-Type", "application/json") - // NDJSON: one JSON object per line - enc := json.NewEncoder(w) - _ = enc.Encode(ccIndexRecord{URL: "https://example.com/api/config", Timestamp: "20240301120000", Status: "200"}) - _ = enc.Encode(ccIndexRecord{URL: "https://example.com/env.js", Timestamp: "20240301130000", Status: "200"}) - } -} - -func TestCommonCrawl_SweepEmitsFindings(t *testing.T) { - reg := syntheticRegistry() - lim := recon.NewLimiterRegistry() - _ = lim.For("commoncrawl", 1000, 100) - - var calls int32 - srv := httptest.NewServer(commonCrawlStubHandler(t, &calls)) - defer srv.Close() - - src := &CommonCrawlSource{ - BaseURL: srv.URL, - IndexName: "CC-MAIN-2024-10", - Registry: reg, - Limiters: lim, - Client: NewClient(), - } - - out := make(chan recon.Finding, 32) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - done := make(chan error, 1) - go func() { done <- src.Sweep(ctx, "", out); close(out) }() - - var findings []recon.Finding - for f := range out { - findings = append(findings, f) - } - if err := <-done; err != nil { - t.Fatalf("Sweep error: %v", err) - } - - // 2 keywords * 2 results = 4 findings - if len(findings) != 4 { - t.Fatalf("expected 4 findings, got %d", len(findings)) - } - for _, f := range findings { - if f.SourceType != "recon:commoncrawl" { - t.Errorf("SourceType=%q want recon:commoncrawl", f.SourceType) - } - } - if got := atomic.LoadInt32(&calls); got != 2 { - t.Errorf("expected 2 server calls, got %d", got) - } -} - -func TestCommonCrawl_FindingURLs(t *testing.T) { - reg := syntheticRegistry() - lim := recon.NewLimiterRegistry() - _ = lim.For("commoncrawl", 1000, 100) - - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - enc := json.NewEncoder(w) - _ = enc.Encode(ccIndexRecord{URL: "https://target.com/leak.js", Timestamp: "20240101000000", Status: "200"}) - })) - defer srv.Close() - - src := &CommonCrawlSource{ - BaseURL: srv.URL, - IndexName: "CC-MAIN-2024-10", - Registry: reg, - Limiters: lim, - Client: NewClient(), - } - - out := make(chan recon.Finding, 32) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - done := make(chan error, 1) - go func() { done <- src.Sweep(ctx, "", out); close(out) }() - - var findings []recon.Finding - for f := range out { - findings = append(findings, f) - } - if err := <-done; err != nil { - t.Fatalf("Sweep error: %v", err) - } - - for _, f := range findings { - if f.Source != "https://target.com/leak.js" { - t.Errorf("Source=%q want https://target.com/leak.js", f.Source) - } - } -} - -func TestCommonCrawl_EnabledAlwaysTrue(t *testing.T) { - s := &CommonCrawlSource{} - if !s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=true") - } -} - -func TestCommonCrawl_NameAndRate(t *testing.T) { - s := &CommonCrawlSource{} - if s.Name() != "commoncrawl" { - t.Errorf("unexpected name: %s", s.Name()) - } - if s.Burst() != 1 { - t.Errorf("burst: %d", s.Burst()) - } - if !s.RespectsRobots() { - t.Error("expected RespectsRobots=true") - } -} - -func TestCommonCrawl_CtxCancelled(t *testing.T) { - reg := syntheticRegistry() - lim := recon.NewLimiterRegistry() - _ = lim.For("commoncrawl", 1000, 100) - - src := &CommonCrawlSource{ - BaseURL: "http://127.0.0.1:1", - IndexName: "CC-MAIN-2024-10", - Registry: reg, - Limiters: lim, - Client: NewClient(), - } - - ctx, cancel := context.WithCancel(context.Background()) - cancel() - - out := make(chan recon.Finding, 1) - err := src.Sweep(ctx, "", out) - if !errors.Is(err, context.Canceled) { - t.Fatalf("expected context.Canceled, got %v", err) - } -} - -func TestCommonCrawl_NilRegistryNoError(t *testing.T) { - src := &CommonCrawlSource{Client: NewClient()} - out := make(chan recon.Finding, 1) - if err := src.Sweep(context.Background(), "", out); err != nil { - t.Fatalf("expected nil, got %v", err) -======= "net/http" "net/http/httptest" "testing" @@ -230,6 +66,5 @@ func TestCommonCrawl_Sweep(t *testing.T) { } if findings[0].SourceType != "recon:commoncrawl" { t.Fatalf("expected recon:commoncrawl, got %s", findings[0].SourceType) ->>>>>>> worktree-agent-adad8c10 } } diff --git a/pkg/recon/sources/githubactions.go b/pkg/recon/sources/githubactions.go index a7bf2e1..4fedab9 100644 --- a/pkg/recon/sources/githubactions.go +++ b/pkg/recon/sources/githubactions.go @@ -4,14 +4,8 @@ import ( "context" "encoding/json" "fmt" -<<<<<<< HEAD - "net/http" - "net/url" - "strings" -======= "io" "net/http" ->>>>>>> worktree-agent-adad8c10 "time" "golang.org/x/time/rate" @@ -21,56 +15,19 @@ import ( ) // GitHubActionsSource searches GitHub Actions workflow run logs for leaked API -<<<<<<< HEAD -// keys. It queries the GitHub REST API for workflow runs matching provider -// keywords (via the repository search endpoint) and emits findings for each -// matching run. Requires a GitHub token (same as GitHubSource). -======= // keys. Workflow logs are public for public repositories and frequently contain // accidentally printed secrets, debug output with credentials, or insecure // echo statements that expose environment variables. ->>>>>>> worktree-agent-adad8c10 type GitHubActionsSource struct { Token string BaseURL string Registry *providers.Registry Limiters *recon.LimiterRegistry -<<<<<<< HEAD - client *Client -======= Client *Client ->>>>>>> worktree-agent-adad8c10 } var _ recon.ReconSource = (*GitHubActionsSource)(nil) -<<<<<<< HEAD -func (s *GitHubActionsSource) Name() string { return "github_actions" } -func (s *GitHubActionsSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } -func (s *GitHubActionsSource) Burst() int { return 2 } -func (s *GitHubActionsSource) RespectsRobots() bool { return false } -func (s *GitHubActionsSource) Enabled(_ recon.Config) bool { return s.Token != "" } - -// ghActionsSearchResponse models the GitHub code search response when looking -// for workflow files containing provider keywords. -type ghActionsSearchResponse struct { - Items []ghActionsItem `json:"items"` -} - -type ghActionsItem struct { - HTMLURL string `json:"html_url"` - Repository ghActionsRepository `json:"repository"` -} - -type ghActionsRepository struct { - FullName string `json:"full_name"` -} - -func (s *GitHubActionsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { - if s.Token == "" { - return nil - } -======= func (s *GitHubActionsSource) Name() string { return "ghactions" } func (s *GitHubActionsSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } func (s *GitHubActionsSource) Burst() int { return 3 } @@ -93,19 +50,10 @@ type ghActionsRun struct { } func (s *GitHubActionsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { ->>>>>>> worktree-agent-adad8c10 base := s.BaseURL if base == "" { base = "https://api.github.com" } -<<<<<<< HEAD - if s.client == nil { - s.client = NewClient() - } - - queries := BuildQueries(s.Registry, "github_actions") - kwIndex := ghActionsKeywordIndex(s.Registry) -======= client := s.Client if client == nil { client = NewClient() @@ -115,62 +63,18 @@ func (s *GitHubActionsSource) Sweep(ctx context.Context, _ string, out chan<- re if len(queries) == 0 { return nil } ->>>>>>> worktree-agent-adad8c10 for _, q := range queries { if err := ctx.Err(); err != nil { return err } -<<<<<<< HEAD -======= ->>>>>>> worktree-agent-adad8c10 if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } -<<<<<<< HEAD - // Search for workflow YAML files referencing the keyword. - endpoint := fmt.Sprintf("%s/search/code?q=%s+path:.github/workflows+extension:yml&per_page=20", - base, url.QueryEscape(q)) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) - if err != nil { - return fmt.Errorf("github_actions: build request: %w", err) - } - req.Header.Set("Authorization", "Bearer "+s.Token) - req.Header.Set("Accept", "application/vnd.github+json") - - resp, err := s.client.Do(ctx, req) - if err != nil { - if strings.Contains(err.Error(), "unauthorized") { - return err - } - continue - } - - var result ghActionsSearchResponse - decErr := json.NewDecoder(resp.Body).Decode(&result) - _ = resp.Body.Close() - if decErr != nil { - continue - } - - provName := kwIndex[strings.ToLower(q)] - for _, item := range result.Items { - f := recon.Finding{ - ProviderName: provName, - Confidence: "low", - Source: item.HTMLURL, - SourceType: "recon:github_actions", - DetectedAt: time.Now(), - } - select { - case out <- f: - case <-ctx.Done(): - return ctx.Err() -======= // Search for workflow runs via the Actions API. searchURL := fmt.Sprintf("%s/search/code?q=%s+path:.github/workflows", base, q) req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) @@ -231,30 +135,8 @@ func (s *GitHubActionsSource) Sweep(ctx context.Context, _ string, out chan<- re Confidence: "medium", DetectedAt: time.Now(), } ->>>>>>> worktree-agent-adad8c10 } } } return nil } -<<<<<<< HEAD - -func ghActionsKeywordIndex(reg *providers.Registry) map[string]string { - m := make(map[string]string) - if reg == nil { - return m - } - for _, p := range reg.List() { - for _, k := range p.Keywords { - kl := strings.ToLower(strings.TrimSpace(k)) - if kl != "" { - if _, exists := m[kl]; !exists { - m[kl] = p.Name - } - } - } - } - return m -} -======= ->>>>>>> worktree-agent-adad8c10 diff --git a/pkg/recon/sources/githubactions_test.go b/pkg/recon/sources/githubactions_test.go index 3395b0a..e3bb53e 100644 --- a/pkg/recon/sources/githubactions_test.go +++ b/pkg/recon/sources/githubactions_test.go @@ -2,11 +2,8 @@ package sources import ( "context" -<<<<<<< HEAD -======= "encoding/json" "fmt" ->>>>>>> worktree-agent-adad8c10 "net/http" "net/http/httptest" "testing" @@ -16,52 +13,6 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -const ghActionsFixtureJSON = `{ - "items": [ - { - "html_url": "https://github.com/alice/repo/blob/main/.github/workflows/ci.yml", - "repository": {"full_name": "alice/repo"} - }, - { - "html_url": "https://github.com/bob/app/blob/main/.github/workflows/deploy.yml", - "repository": {"full_name": "bob/app"} - } - ] -}` - -func TestGitHubActions_Sweep_ExtractsFindings(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/search/code" { - t.Errorf("unexpected path: %s", r.URL.Path) - } - if r.Header.Get("Authorization") == "" { - t.Error("missing Authorization header") - } - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write([]byte(ghActionsFixtureJSON)) - })) - defer srv.Close() - - src := &GitHubActionsSource{ - Token: "ghp-test", - BaseURL: srv.URL, - Registry: providers.NewRegistryFromProviders([]providers.Provider{ - {Name: "openai", Keywords: []string{"sk-proj-"}}, - }), - Limiters: recon.NewLimiterRegistry(), - client: NewClient(), - } - - out := make(chan recon.Finding, 16) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if err := src.Sweep(ctx, "", out); err != nil { - t.Fatalf("Sweep err: %v", err) - } - close(out) -======= func TestGitHubActions_Name(t *testing.T) { s := &GitHubActionsSource{} if s.Name() != "ghactions" { @@ -119,65 +70,15 @@ Tests passed.`) if err != nil { t.Fatalf("Sweep error: %v", err) } ->>>>>>> worktree-agent-adad8c10 var findings []recon.Finding for f := range out { findings = append(findings, f) } -<<<<<<< HEAD - if len(findings) != 2 { - t.Fatalf("expected 2 findings, got %d", len(findings)) - } - for _, f := range findings { - if f.SourceType != "recon:github_actions" { - t.Errorf("unexpected SourceType: %s", f.SourceType) - } - if f.Confidence != "low" { - t.Errorf("unexpected Confidence: %s", f.Confidence) - } - } -} - -func TestGitHubActions_EnabledOnlyWithToken(t *testing.T) { - s := &GitHubActionsSource{} - if s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=false without token") - } - s.Token = "test" - if !s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=true with token") - } -} - -func TestGitHubActions_Sweep_SkipsWhenNoToken(t *testing.T) { - src := &GitHubActionsSource{} - out := make(chan recon.Finding, 4) - if err := src.Sweep(context.Background(), "", out); err != nil { - t.Fatalf("expected nil, got: %v", err) - } - close(out) - if len(out) != 0 { - t.Fatal("expected no findings without token") - } -} - -func TestGitHubActions_NameAndRate(t *testing.T) { - s := &GitHubActionsSource{} - if s.Name() != "github_actions" { - t.Errorf("unexpected name: %s", s.Name()) - } - if s.Burst() != 2 { - t.Errorf("burst: %d", s.Burst()) - } - if s.RespectsRobots() { - t.Error("expected RespectsRobots=false") -======= if len(findings) == 0 { t.Fatal("expected at least one finding from GitHub Actions logs") } if findings[0].SourceType != "recon:ghactions" { t.Fatalf("expected recon:ghactions, got %s", findings[0].SourceType) ->>>>>>> worktree-agent-adad8c10 } } diff --git a/pkg/recon/sources/jenkins.go b/pkg/recon/sources/jenkins.go index 8a3d2c9..d3313b6 100644 --- a/pkg/recon/sources/jenkins.go +++ b/pkg/recon/sources/jenkins.go @@ -4,13 +4,8 @@ import ( "context" "encoding/json" "fmt" -<<<<<<< HEAD - "net/http" - "strings" -======= "io" "net/http" ->>>>>>> worktree-agent-adad8c10 "time" "golang.org/x/time/rate" @@ -19,17 +14,10 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -// JenkinsSource searches publicly accessible Jenkins instances for build -// console output containing leaked API keys. It queries the Jenkins JSON API -// at /api/json to enumerate jobs and their latest builds. Credentialless -- -// targets open Jenkins instances discovered via dorking or IoT scanners. -======= // JenkinsSource scrapes publicly accessible Jenkins build consoles for leaked // API keys. Many Jenkins instances are exposed to the internet without // authentication, and build console output frequently contains printed // environment variables or secrets passed via command-line arguments. ->>>>>>> worktree-agent-adad8c10 type JenkinsSource struct { BaseURL string Registry *providers.Registry @@ -39,14 +27,6 @@ type JenkinsSource struct { var _ recon.ReconSource = (*JenkinsSource)(nil) -<<<<<<< HEAD -func (s *JenkinsSource) Name() string { return "jenkins" } -func (s *JenkinsSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } -func (s *JenkinsSource) Burst() int { return 1 } -func (s *JenkinsSource) RespectsRobots() bool { return true } -func (s *JenkinsSource) Enabled(_ recon.Config) bool { return true } - -======= func (s *JenkinsSource) Name() string { return "jenkins" } func (s *JenkinsSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } func (s *JenkinsSource) Burst() int { return 2 } @@ -54,36 +34,20 @@ func (s *JenkinsSource) RespectsRobots() bool { return true } func (s *JenkinsSource) Enabled(_ recon.Config) bool { return true } // jenkinsJobsResponse represents the Jenkins API jobs listing. ->>>>>>> worktree-agent-adad8c10 type jenkinsJobsResponse struct { Jobs []jenkinsJob `json:"jobs"` } type jenkinsJob struct { -<<<<<<< HEAD - Name string `json:"name"` - URL string `json:"url"` - LastBuild *jenkinsBuild `json:"lastBuild"` -} - -type jenkinsBuild struct { - Number int `json:"number"` - URL string `json:"url"` -======= Name string `json:"name"` URL string `json:"url"` Color string `json:"color"` ->>>>>>> worktree-agent-adad8c10 } func (s *JenkinsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { base := s.BaseURL if base == "" { -<<<<<<< HEAD - base = "https://jenkins.example.com" -======= return nil // No default; Jenkins instances are discovered via dorking ->>>>>>> worktree-agent-adad8c10 } client := s.Client if client == nil { @@ -91,40 +55,26 @@ func (s *JenkinsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Fi } queries := BuildQueries(s.Registry, "jenkins") -<<<<<<< HEAD - kwIndex := jenkinsKeywordIndex(s.Registry) -======= if len(queries) == 0 { return nil } ->>>>>>> worktree-agent-adad8c10 for _, q := range queries { if err := ctx.Err(); err != nil { return err } -<<<<<<< HEAD -======= ->>>>>>> worktree-agent-adad8c10 if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } -<<<<<<< HEAD - endpoint := fmt.Sprintf("%s/api/json?tree=jobs[name,url,lastBuild[number,url]]", base) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) - if err != nil { - return fmt.Errorf("jenkins: build request: %w", err) -======= // List jobs from the Jenkins API. jobsURL := fmt.Sprintf("%s/api/json?tree=jobs[name,url,color]", base) req, err := http.NewRequestWithContext(ctx, http.MethodGet, jobsURL, nil) if err != nil { continue ->>>>>>> worktree-agent-adad8c10 } req.Header.Set("Accept", "application/json") @@ -133,35 +83,6 @@ func (s *JenkinsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Fi continue } -<<<<<<< HEAD - var result jenkinsJobsResponse - decErr := json.NewDecoder(resp.Body).Decode(&result) - _ = resp.Body.Close() - if decErr != nil { - continue - } - - provName := kwIndex[strings.ToLower(q)] - for _, job := range result.Jobs { - if job.LastBuild == nil { - continue - } - source := job.LastBuild.URL - if source == "" { - source = fmt.Sprintf("%s/job/%s/%d/console", base, job.Name, job.LastBuild.Number) - } - f := recon.Finding{ - ProviderName: provName, - Confidence: "low", - Source: source, - SourceType: "recon:jenkins", - DetectedAt: time.Now(), - } - select { - case out <- f: - case <-ctx.Done(): - return ctx.Err() -======= var jobs jenkinsJobsResponse if err := json.NewDecoder(resp.Body).Decode(&jobs); err != nil { _ = resp.Body.Close() @@ -206,30 +127,8 @@ func (s *JenkinsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Fi Confidence: "medium", DetectedAt: time.Now(), } ->>>>>>> worktree-agent-adad8c10 } } } return nil } -<<<<<<< HEAD - -func jenkinsKeywordIndex(reg *providers.Registry) map[string]string { - m := make(map[string]string) - if reg == nil { - return m - } - for _, p := range reg.List() { - for _, k := range p.Keywords { - kl := strings.ToLower(strings.TrimSpace(k)) - if kl != "" { - if _, exists := m[kl]; !exists { - m[kl] = p.Name - } - } - } - } - return m -} -======= ->>>>>>> worktree-agent-adad8c10 diff --git a/pkg/recon/sources/jenkins_test.go b/pkg/recon/sources/jenkins_test.go index 775acc4..8f8ab6e 100644 --- a/pkg/recon/sources/jenkins_test.go +++ b/pkg/recon/sources/jenkins_test.go @@ -11,55 +11,6 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -const jenkinsFixtureJSON = `{ - "jobs": [ - { - "name": "build-api", - "url": "https://jenkins.example.com/job/build-api/", - "lastBuild": {"number": 42, "url": "https://jenkins.example.com/job/build-api/42/"} - }, - { - "name": "deploy-prod", - "url": "https://jenkins.example.com/job/deploy-prod/", - "lastBuild": {"number": 7, "url": ""} - }, - { - "name": "stale-job", - "url": "https://jenkins.example.com/job/stale-job/", - "lastBuild": null - } - ] -}` - -func TestJenkins_Sweep_ExtractsFindings(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/api/json" { - t.Errorf("unexpected path: %s", r.URL.Path) - } - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write([]byte(jenkinsFixtureJSON)) - })) - defer srv.Close() - - src := &JenkinsSource{ - BaseURL: srv.URL, - Registry: providers.NewRegistryFromProviders([]providers.Provider{ - {Name: "openai", Keywords: []string{"sk-proj-"}}, - }), - Limiters: recon.NewLimiterRegistry(), - Client: NewClient(), - } - - out := make(chan recon.Finding, 16) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if err := src.Sweep(ctx, "", out); err != nil { - t.Fatalf("Sweep err: %v", err) - } - close(out) -======= func TestJenkins_Name(t *testing.T) { s := &JenkinsSource{} if s.Name() != "jenkins" { @@ -110,51 +61,15 @@ Build SUCCESS`)) if err != nil { t.Fatalf("Sweep error: %v", err) } ->>>>>>> worktree-agent-adad8c10 var findings []recon.Finding for f := range out { findings = append(findings, f) } -<<<<<<< HEAD - // 3 jobs but 1 has null lastBuild -> 2 findings. - if len(findings) != 2 { - t.Fatalf("expected 2 findings, got %d", len(findings)) - } - // First job has URL from lastBuild. - if findings[0].Source != "https://jenkins.example.com/job/build-api/42/" { - t.Errorf("unexpected source[0]: %s", findings[0].Source) - } - for _, f := range findings { - if f.SourceType != "recon:jenkins" { - t.Errorf("unexpected SourceType: %s", f.SourceType) - } - } -} - -func TestJenkins_EnabledAlwaysTrue(t *testing.T) { - s := &JenkinsSource{} - if !s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=true") - } -} - -func TestJenkins_NameAndRate(t *testing.T) { - s := &JenkinsSource{} - if s.Name() != "jenkins" { - t.Errorf("unexpected name: %s", s.Name()) - } - if s.Burst() != 1 { - t.Errorf("burst: %d", s.Burst()) - } - if !s.RespectsRobots() { - t.Error("expected RespectsRobots=true") -======= if len(findings) == 0 { t.Fatal("expected at least one finding from Jenkins console output") } if findings[0].SourceType != "recon:jenkins" { t.Fatalf("expected recon:jenkins, got %s", findings[0].SourceType) ->>>>>>> worktree-agent-adad8c10 } } diff --git a/pkg/recon/sources/register.go b/pkg/recon/sources/register.go index e388f20..d38d3ad 100644 --- a/pkg/recon/sources/register.go +++ b/pkg/recon/sources/register.go @@ -39,9 +39,6 @@ type SourcesConfig struct { // Brave Search API subscription token. BraveAPIKey string - // Phase 14: CI/CD tokens. - CircleCIToken string - // Phase 12: IoT scanner API keys. ShodanAPIKey string CensysAPIId string diff --git a/pkg/recon/sources/travisci.go b/pkg/recon/sources/travisci.go index 7a581c7..2f703b4 100644 --- a/pkg/recon/sources/travisci.go +++ b/pkg/recon/sources/travisci.go @@ -4,15 +4,9 @@ import ( "context" "encoding/json" "fmt" -<<<<<<< HEAD - "net/http" - "net/url" - "strings" -======= "io" "net/http" "regexp" ->>>>>>> worktree-agent-adad8c10 "time" "golang.org/x/time/rate" @@ -21,16 +15,10 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -// TravisCISource searches public Travis CI build logs for leaked API keys. -// It queries the Travis CI API v3 /builds endpoint for builds matching -// provider keywords. No authentication required for public repositories. -======= // TravisCISource scrapes public Travis CI build logs for leaked API keys. // Travis CI exposes build logs publicly by default for open-source projects. // Developers frequently print environment variables or use secrets insecurely // in CI scripts, causing API keys to appear in build output. ->>>>>>> worktree-agent-adad8c10 type TravisCISource struct { BaseURL string Registry *providers.Registry @@ -40,14 +28,6 @@ type TravisCISource struct { var _ recon.ReconSource = (*TravisCISource)(nil) -<<<<<<< HEAD -func (s *TravisCISource) Name() string { return "travisci" } -func (s *TravisCISource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } -func (s *TravisCISource) Burst() int { return 1 } -func (s *TravisCISource) RespectsRobots() bool { return true } -func (s *TravisCISource) Enabled(_ recon.Config) bool { return true } - -======= func (s *TravisCISource) Name() string { return "travisci" } func (s *TravisCISource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } func (s *TravisCISource) Burst() int { return 2 } @@ -55,29 +35,17 @@ func (s *TravisCISource) RespectsRobots() bool { return false } func (s *TravisCISource) Enabled(_ recon.Config) bool { return true } // travisBuildResponse represents the Travis CI API builds response. ->>>>>>> worktree-agent-adad8c10 type travisBuildResponse struct { Builds []travisBuild `json:"builds"` } type travisBuild struct { -<<<<<<< HEAD - ID int `json:"id"` - State string `json:"state"` - Repository travisRepository `json:"repository"` -} - -type travisRepository struct { - Slug string `json:"slug"` -} -======= ID int `json:"id"` State string `json:"state"` } // ciLogKeyPattern matches API key patterns commonly leaked in CI logs. var ciLogKeyPattern = regexp.MustCompile(`(?i)(api[_-]?key|secret[_-]?key|token|password|credential|auth[_-]?token)['":\s]*[=:]\s*['"]?([a-zA-Z0-9_\-]{16,})['"]?`) ->>>>>>> worktree-agent-adad8c10 func (s *TravisCISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { base := s.BaseURL @@ -90,41 +58,26 @@ func (s *TravisCISource) Sweep(ctx context.Context, _ string, out chan<- recon.F } queries := BuildQueries(s.Registry, "travisci") -<<<<<<< HEAD - kwIndex := travisKeywordIndex(s.Registry) -======= if len(queries) == 0 { return nil } ->>>>>>> worktree-agent-adad8c10 for _, q := range queries { if err := ctx.Err(); err != nil { return err } -<<<<<<< HEAD -======= ->>>>>>> worktree-agent-adad8c10 if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } -<<<<<<< HEAD - endpoint := fmt.Sprintf("%s/builds?limit=20&sort_by=finished_at:desc&state=passed&event_type=push", - base) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) - if err != nil { - return fmt.Errorf("travisci: build request: %w", err) -======= // Search for builds related to the query keyword. searchURL := fmt.Sprintf("%s/builds?search=%s&limit=5", base, q) req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) if err != nil { continue ->>>>>>> worktree-agent-adad8c10 } req.Header.Set("Travis-API-Version", "3") req.Header.Set("Accept", "application/json") @@ -134,30 +87,6 @@ func (s *TravisCISource) Sweep(ctx context.Context, _ string, out chan<- recon.F continue } -<<<<<<< HEAD - var result travisBuildResponse - decErr := json.NewDecoder(resp.Body).Decode(&result) - _ = resp.Body.Close() - if decErr != nil { - continue - } - - provName := kwIndex[strings.ToLower(q)] - for _, build := range result.Builds { - source := fmt.Sprintf("https://app.travis-ci.com/%s/builds/%d", - url.PathEscape(build.Repository.Slug), build.ID) - f := recon.Finding{ - ProviderName: provName, - Confidence: "low", - Source: source, - SourceType: "recon:travisci", - DetectedAt: time.Now(), - } - select { - case out <- f: - case <-ctx.Done(): - return ctx.Err() -======= var builds travisBuildResponse if err := json.NewDecoder(resp.Body).Decode(&builds); err != nil { _ = resp.Body.Close() @@ -204,30 +133,8 @@ func (s *TravisCISource) Sweep(ctx context.Context, _ string, out chan<- recon.F Confidence: "medium", DetectedAt: time.Now(), } ->>>>>>> worktree-agent-adad8c10 } } } return nil } -<<<<<<< HEAD - -func travisKeywordIndex(reg *providers.Registry) map[string]string { - m := make(map[string]string) - if reg == nil { - return m - } - for _, p := range reg.List() { - for _, k := range p.Keywords { - kl := strings.ToLower(strings.TrimSpace(k)) - if kl != "" { - if _, exists := m[kl]; !exists { - m[kl] = p.Name - } - } - } - } - return m -} -======= ->>>>>>> worktree-agent-adad8c10 diff --git a/pkg/recon/sources/travisci_test.go b/pkg/recon/sources/travisci_test.go index aca8e4f..0c61b57 100644 --- a/pkg/recon/sources/travisci_test.go +++ b/pkg/recon/sources/travisci_test.go @@ -11,53 +11,6 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -const travisFixtureJSON = `{ - "builds": [ - { - "id": 12345, - "state": "passed", - "repository": {"slug": "alice/project"} - }, - { - "id": 67890, - "state": "passed", - "repository": {"slug": "bob/app"} - } - ] -}` - -func TestTravisCI_Sweep_ExtractsFindings(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/builds" { - t.Errorf("unexpected path: %s", r.URL.Path) - } - if r.Header.Get("Travis-API-Version") != "3" { - t.Error("missing Travis-API-Version header") - } - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write([]byte(travisFixtureJSON)) - })) - defer srv.Close() - - src := &TravisCISource{ - BaseURL: srv.URL, - Registry: providers.NewRegistryFromProviders([]providers.Provider{ - {Name: "openai", Keywords: []string{"sk-proj-"}}, - }), - Limiters: recon.NewLimiterRegistry(), - Client: NewClient(), - } - - out := make(chan recon.Finding, 16) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if err := src.Sweep(ctx, "", out); err != nil { - t.Fatalf("Sweep err: %v", err) - } - close(out) -======= func TestTravisCI_Name(t *testing.T) { s := &TravisCISource{} if s.Name() != "travisci" { @@ -107,46 +60,15 @@ Running tests...`)) if err != nil { t.Fatalf("Sweep error: %v", err) } ->>>>>>> worktree-agent-adad8c10 var findings []recon.Finding for f := range out { findings = append(findings, f) } -<<<<<<< HEAD - if len(findings) != 2 { - t.Fatalf("expected 2 findings, got %d", len(findings)) - } - for _, f := range findings { - if f.SourceType != "recon:travisci" { - t.Errorf("unexpected SourceType: %s", f.SourceType) - } - } -} - -func TestTravisCI_EnabledAlwaysTrue(t *testing.T) { - s := &TravisCISource{} - if !s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=true") - } -} - -func TestTravisCI_NameAndRate(t *testing.T) { - s := &TravisCISource{} - if s.Name() != "travisci" { - t.Errorf("unexpected name: %s", s.Name()) - } - if s.Burst() != 1 { - t.Errorf("burst: %d", s.Burst()) - } - if !s.RespectsRobots() { - t.Error("expected RespectsRobots=true") -======= if len(findings) == 0 { t.Fatal("expected at least one finding from Travis CI build log") } if findings[0].SourceType != "recon:travisci" { t.Fatalf("expected recon:travisci, got %s", findings[0].SourceType) ->>>>>>> worktree-agent-adad8c10 } } diff --git a/pkg/recon/sources/wayback.go b/pkg/recon/sources/wayback.go index bd43cbc..9851ede 100644 --- a/pkg/recon/sources/wayback.go +++ b/pkg/recon/sources/wayback.go @@ -1,20 +1,11 @@ package sources import ( -<<<<<<< HEAD - "bufio" - "context" - "fmt" - "net/http" - "net/url" - "strings" -======= "context" "encoding/json" "fmt" "io" "net/http" ->>>>>>> worktree-agent-adad8c10 "time" "golang.org/x/time/rate" @@ -23,41 +14,6 @@ import ( "github.com/salvacybersec/keyhunter/pkg/recon" ) -<<<<<<< HEAD -// WaybackMachineSource implements recon.ReconSource against the Wayback Machine -// CDX Server API. It queries web.archive.org/cdx/search/cdx for historical -// snapshots of pages matching provider keywords (e.g. domains known to host -// API key documentation or configuration files). -// -// RECON-ARCH-01: Each matching CDX record yields a Finding pointing at the -// archived snapshot URL. The source is credentialless and always enabled. -type WaybackMachineSource struct { - // BaseURL defaults to https://web.archive.org. Tests override with httptest URL. - BaseURL string - // Registry drives the keyword query list via BuildQueries. - Registry *providers.Registry - // Limiters is the shared recon.LimiterRegistry. - Limiters *recon.LimiterRegistry - // Client is the shared retry HTTP wrapper. If nil, a default is used. - Client *Client -} - -// Compile-time assertion that WaybackMachineSource satisfies recon.ReconSource. -var _ recon.ReconSource = (*WaybackMachineSource)(nil) - -func (s *WaybackMachineSource) Name() string { return "wayback" } -func (s *WaybackMachineSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) } -func (s *WaybackMachineSource) Burst() int { return 1 } -func (s *WaybackMachineSource) RespectsRobots() bool { return true } - -// Enabled always returns true: CDX API is unauthenticated. -func (s *WaybackMachineSource) Enabled(_ recon.Config) bool { return true } - -// Sweep iterates provider keywords, queries the CDX API for each, and emits -// a Finding for every archived snapshot URL returned. The CDX API returns -// plain-text lines with space-separated fields; we extract the original URL -// and timestamp to construct the full Wayback snapshot link. -======= // WaybackMachineSource searches the Internet Archive's Wayback Machine CDX API // for archived pages that may contain leaked API keys. Developers sometimes // remove secrets from live pages but cached versions persist in web archives. @@ -76,7 +32,6 @@ func (s *WaybackMachineSource) Burst() int { return 1 } func (s *WaybackMachineSource) RespectsRobots() bool { return true } func (s *WaybackMachineSource) Enabled(_ recon.Config) bool { return true } ->>>>>>> worktree-agent-adad8c10 func (s *WaybackMachineSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { base := s.BaseURL if base == "" { @@ -103,55 +58,6 @@ func (s *WaybackMachineSource) Sweep(ctx context.Context, _ string, out chan<- r } } -<<<<<<< HEAD - // CDX API: output=text, fl=timestamp,original limits response to two fields per line. - // limit=50 keeps the response bounded per keyword. - endpoint := fmt.Sprintf("%s/cdx/search/cdx?url=*&output=text&fl=timestamp,original&limit=50&matchType=prefix&filter=statuscode:200&query=%s", - base, url.QueryEscape(q)) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) - if err != nil { - return fmt.Errorf("wayback: build req: %w", err) - } - req.Header.Set("Accept", "text/plain") - - resp, err := client.Do(ctx, req) - if err != nil { - // Non-fatal: skip this keyword on transient errors. - continue - } - - scanner := bufio.NewScanner(resp.Body) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if line == "" { - continue - } - // CDX text output: "timestamp original-url" - parts := strings.SplitN(line, " ", 2) - if len(parts) < 2 { - continue - } - ts := parts[0] - origURL := parts[1] - - snapshotURL := fmt.Sprintf("%s/web/%s/%s", base, ts, origURL) - - f := recon.Finding{ - ProviderName: "", - Source: snapshotURL, - SourceType: "recon:wayback", - Confidence: "low", - DetectedAt: time.Now(), - } - select { - case out <- f: - case <-ctx.Done(): - _ = resp.Body.Close() - return ctx.Err() - } - } - _ = resp.Body.Close() -======= // CDX API: search for archived URLs matching the query. // Filter for .env, config, and JS files that commonly contain keys. cdxURL := fmt.Sprintf("%s/cdx/search/cdx?url=*%s*&output=json&limit=10&fl=url,timestamp,statuscode&filter=statuscode:200", base, q) @@ -223,7 +129,6 @@ func (s *WaybackMachineSource) Sweep(ctx context.Context, _ string, out chan<- r } } } ->>>>>>> worktree-agent-adad8c10 } return nil } diff --git a/pkg/recon/sources/wayback_test.go b/pkg/recon/sources/wayback_test.go index 3ba3d18..0d5f99b 100644 --- a/pkg/recon/sources/wayback_test.go +++ b/pkg/recon/sources/wayback_test.go @@ -2,170 +2,6 @@ package sources import ( "context" -<<<<<<< HEAD - "errors" - "fmt" - "net/http" - "net/http/httptest" - "sync/atomic" - "testing" - "time" - - "github.com/salvacybersec/keyhunter/pkg/recon" -) - -func waybackStubHandler(t *testing.T, calls *int32) http.HandlerFunc { - t.Helper() - return func(w http.ResponseWriter, r *http.Request) { - atomic.AddInt32(calls, 1) - if r.URL.Path != "/cdx/search/cdx" { - t.Errorf("unexpected path: %s", r.URL.Path) - } - if r.URL.Query().Get("query") == "" { - t.Errorf("missing query param") - } - w.Header().Set("Content-Type", "text/plain") - // Two CDX records per query: "timestamp original-url" - fmt.Fprintln(w, "20230101120000 https://example.com/config.js") - fmt.Fprintln(w, "20230615080000 https://example.com/env.json") - } -} - -func TestWayback_SweepEmitsFindings(t *testing.T) { - reg := syntheticRegistry() - lim := recon.NewLimiterRegistry() - _ = lim.For("wayback", 1000, 100) - - var calls int32 - srv := httptest.NewServer(waybackStubHandler(t, &calls)) - defer srv.Close() - - src := &WaybackMachineSource{ - BaseURL: srv.URL, - Registry: reg, - Limiters: lim, - Client: NewClient(), - } - - out := make(chan recon.Finding, 32) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - done := make(chan error, 1) - go func() { done <- src.Sweep(ctx, "", out); close(out) }() - - var findings []recon.Finding - for f := range out { - findings = append(findings, f) - } - if err := <-done; err != nil { - t.Fatalf("Sweep error: %v", err) - } - - // 2 keywords * 2 results = 4 findings - if len(findings) != 4 { - t.Fatalf("expected 4 findings, got %d", len(findings)) - } - for _, f := range findings { - if f.SourceType != "recon:wayback" { - t.Errorf("SourceType=%q want recon:wayback", f.SourceType) - } - } - if got := atomic.LoadInt32(&calls); got != 2 { - t.Errorf("expected 2 server calls, got %d", got) - } -} - -func TestWayback_SnapshotURL(t *testing.T) { - reg := syntheticRegistry() - lim := recon.NewLimiterRegistry() - _ = lim.For("wayback", 1000, 100) - - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/plain") - fmt.Fprintln(w, "20240101000000 https://target.com/page") - })) - defer srv.Close() - - src := &WaybackMachineSource{ - BaseURL: srv.URL, - Registry: reg, - Limiters: lim, - Client: NewClient(), - } - - out := make(chan recon.Finding, 32) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - done := make(chan error, 1) - go func() { done <- src.Sweep(ctx, "", out); close(out) }() - - var findings []recon.Finding - for f := range out { - findings = append(findings, f) - } - if err := <-done; err != nil { - t.Fatalf("Sweep error: %v", err) - } - - // Each finding should have a proper Wayback snapshot URL - for _, f := range findings { - want := srv.URL + "/web/20240101000000/https://target.com/page" - if f.Source != want { - t.Errorf("Source=%q want %q", f.Source, want) - } - } -} - -func TestWayback_EnabledAlwaysTrue(t *testing.T) { - s := &WaybackMachineSource{} - if !s.Enabled(recon.Config{}) { - t.Fatal("expected Enabled=true") - } -} - -func TestWayback_NameAndRate(t *testing.T) { - s := &WaybackMachineSource{} - if s.Name() != "wayback" { - t.Errorf("unexpected name: %s", s.Name()) - } - if s.Burst() != 1 { - t.Errorf("burst: %d", s.Burst()) - } - if !s.RespectsRobots() { - t.Error("expected RespectsRobots=true") - } -} - -func TestWayback_CtxCancelled(t *testing.T) { - reg := syntheticRegistry() - lim := recon.NewLimiterRegistry() - _ = lim.For("wayback", 1000, 100) - - src := &WaybackMachineSource{ - BaseURL: "http://127.0.0.1:1", - Registry: reg, - Limiters: lim, - Client: NewClient(), - } - - ctx, cancel := context.WithCancel(context.Background()) - cancel() - - out := make(chan recon.Finding, 1) - err := src.Sweep(ctx, "", out) - if !errors.Is(err, context.Canceled) { - t.Fatalf("expected context.Canceled, got %v", err) - } -} - -func TestWayback_NilRegistryNoError(t *testing.T) { - src := &WaybackMachineSource{Client: NewClient()} - out := make(chan recon.Finding, 1) - if err := src.Sweep(context.Background(), "", out); err != nil { - t.Fatalf("expected nil, got %v", err) -======= "net/http" "net/http/httptest" "testing" @@ -231,6 +67,5 @@ func TestWayback_Sweep(t *testing.T) { } if findings[0].SourceType != "recon:wayback" { t.Fatalf("expected recon:wayback, got %s", findings[0].SourceType) ->>>>>>> worktree-agent-adad8c10 } }