From 169b80b3bc18b34285d00f5a67fc4b5b19d52a25 Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 13:34:09 +0300 Subject: [PATCH] feat(14-04): implement 7 Phase 14 sources (CI/CD, archives, JS bundles) - TravisCISource: scrapes public Travis CI build logs for API key leaks - GitHubActionsSource: searches Actions workflow logs (requires GitHub token) - CircleCISource: scrapes CircleCI pipeline logs (requires CircleCI token) - JenkinsSource: scrapes public Jenkins console output for leaked secrets - WaybackMachineSource: searches Wayback Machine CDX for archived key leaks - CommonCrawlSource: searches Common Crawl index for exposed pages - JSBundleSource: probes JS bundles for embedded API key literals --- pkg/recon/sources/circleci.go | 139 +++++++++++++++++++++++ pkg/recon/sources/circleci_test.go | 78 +++++++++++++ pkg/recon/sources/commoncrawl.go | 120 ++++++++++++++++++++ pkg/recon/sources/commoncrawl_test.go | 70 ++++++++++++ pkg/recon/sources/githubactions.go | 142 ++++++++++++++++++++++++ pkg/recon/sources/githubactions_test.go | 84 ++++++++++++++ pkg/recon/sources/jenkins.go | 134 ++++++++++++++++++++++ pkg/recon/sources/jenkins_test.go | 75 +++++++++++++ pkg/recon/sources/jsbundle.go | 116 +++++++++++++++++++ pkg/recon/sources/jsbundle_test.go | 68 ++++++++++++ pkg/recon/sources/travisci.go | 140 +++++++++++++++++++++++ pkg/recon/sources/travisci_test.go | 74 ++++++++++++ pkg/recon/sources/wayback.go | 134 ++++++++++++++++++++++ pkg/recon/sources/wayback_test.go | 71 ++++++++++++ 14 files changed, 1445 insertions(+) create mode 100644 pkg/recon/sources/circleci.go create mode 100644 pkg/recon/sources/circleci_test.go create mode 100644 pkg/recon/sources/commoncrawl.go create mode 100644 pkg/recon/sources/commoncrawl_test.go create mode 100644 pkg/recon/sources/githubactions.go create mode 100644 pkg/recon/sources/githubactions_test.go create mode 100644 pkg/recon/sources/jenkins.go create mode 100644 pkg/recon/sources/jenkins_test.go create 
mode 100644 pkg/recon/sources/jsbundle.go create mode 100644 pkg/recon/sources/jsbundle_test.go create mode 100644 pkg/recon/sources/travisci.go create mode 100644 pkg/recon/sources/travisci_test.go create mode 100644 pkg/recon/sources/wayback.go create mode 100644 pkg/recon/sources/wayback_test.go diff --git a/pkg/recon/sources/circleci.go b/pkg/recon/sources/circleci.go new file mode 100644 index 0000000..e4a44cb --- /dev/null +++ b/pkg/recon/sources/circleci.go @@ -0,0 +1,139 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// CircleCISource scrapes CircleCI build logs for leaked API keys. +// CircleCI exposes build logs via its API; a personal API token is required +// to access build artifacts and logs. Misconfigured pipelines often leak +// secrets in build output. +type CircleCISource struct { + Token string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*CircleCISource)(nil) + +func (s *CircleCISource) Name() string { return "circleci" } +func (s *CircleCISource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } +func (s *CircleCISource) Burst() int { return 2 } +func (s *CircleCISource) RespectsRobots() bool { return false } + +// Enabled requires a CircleCI API token. +func (s *CircleCISource) Enabled(_ recon.Config) bool { return s.Token != "" } + +// circleciPipelineResponse represents the CircleCI v2 pipeline search result. 
+type circleciPipelineResponse struct { + Items []circleciPipeline `json:"items"` +} + +type circleciPipeline struct { + ID string `json:"id"` + Number int `json:"number"` +} + +func (s *CircleCISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://circleci.com/api/v2" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "circleci") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Search for pipelines by project slug (query is used as slug hint). + searchURL := fmt.Sprintf("%s/project/gh/%s/pipeline?limit=5", base, q) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + continue + } + req.Header.Set("Circle-Token", s.Token) + req.Header.Set("Accept", "application/json") + + resp, err := client.Do(ctx, req) + if err != nil { + continue + } + + var pipelines circleciPipelineResponse + if err := json.NewDecoder(resp.Body).Decode(&pipelines); err != nil { + _ = resp.Body.Close() + continue + } + _ = resp.Body.Close() + + for _, p := range pipelines.Items { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Fetch pipeline workflow logs. 
+ logURL := fmt.Sprintf("%s/pipeline/%s/workflow", base, p.ID) + logReq, err := http.NewRequestWithContext(ctx, http.MethodGet, logURL, nil) + if err != nil { + continue + } + logReq.Header.Set("Circle-Token", s.Token) + logReq.Header.Set("Accept", "text/plain") + + logResp, err := client.Do(ctx, logReq) + if err != nil { + continue + } + + body, err := io.ReadAll(io.LimitReader(logResp.Body, 256*1024)) + _ = logResp.Body.Close() + if err != nil { + continue + } + + if ciLogKeyPattern.Match(body) { + out <- recon.Finding{ + ProviderName: q, + Source: logURL, + SourceType: "recon:circleci", + Confidence: "medium", + DetectedAt: time.Now(), + } + } + } + } + return nil +} diff --git a/pkg/recon/sources/circleci_test.go b/pkg/recon/sources/circleci_test.go new file mode 100644 index 0000000..357d055 --- /dev/null +++ b/pkg/recon/sources/circleci_test.go @@ -0,0 +1,78 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func TestCircleCI_Name(t *testing.T) { + s := &CircleCISource{} + if s.Name() != "circleci" { + t.Fatalf("expected circleci, got %s", s.Name()) + } +} + +func TestCircleCI_Enabled(t *testing.T) { + s := &CircleCISource{} + if s.Enabled(recon.Config{}) { + t.Fatal("should be disabled without token") + } + s.Token = "cci-test" + if !s.Enabled(recon.Config{}) { + t.Fatal("should be enabled with token") + } +} + +func TestCircleCI_Sweep(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/project/gh/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"items":[{"id":"pipe-abc-123","number":42}]}`)) + }) + mux.HandleFunc("/pipeline/pipe-abc-123/workflow", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`Build step: npm test +Setting SECRET_KEY="sk-proj-CIRCLELEAK12345678" +Tests completed 
successfully`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &CircleCISource{ + Token: "cci-test", + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatal("expected at least one finding from CircleCI pipeline log") + } + if findings[0].SourceType != "recon:circleci" { + t.Fatalf("expected recon:circleci, got %s", findings[0].SourceType) + } +} diff --git a/pkg/recon/sources/commoncrawl.go b/pkg/recon/sources/commoncrawl.go new file mode 100644 index 0000000..ec88de0 --- /dev/null +++ b/pkg/recon/sources/commoncrawl.go @@ -0,0 +1,120 @@ +package sources + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// CommonCrawlSource searches the Common Crawl index for web pages that may +// contain leaked API keys. Common Crawl archives petabytes of web content; +// its CDX API allows searching by URL pattern to find pages that historically +// exposed secrets. 
+type CommonCrawlSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*CommonCrawlSource)(nil) + +func (s *CommonCrawlSource) Name() string { return "commoncrawl" } +func (s *CommonCrawlSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) } +func (s *CommonCrawlSource) Burst() int { return 1 } +func (s *CommonCrawlSource) RespectsRobots() bool { return true } +func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true } + +// ccIndexResult represents a single Common Crawl CDX index record. +type ccIndexResult struct { + URL string `json:"url"` + Timestamp string `json:"timestamp"` + Status string `json:"status"` + Filename string `json:"filename"` + Length string `json:"length"` + Offset string `json:"offset"` +} + +func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://index.commoncrawl.org/CC-MAIN-2024-10-index" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "commoncrawl") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // CDX API: search for URLs matching the query. + searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, q) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + continue + } + req.Header.Set("Accept", "application/json") + + resp, err := client.Do(ctx, req) + if err != nil { + continue + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024)) + _ = resp.Body.Close() + if err != nil { + continue + } + + // Common Crawl returns NDJSON (newline-delimited JSON). 
+ // Parse each line as a separate JSON object. + var results []ccIndexResult + dec := json.NewDecoder(bytes.NewReader(body)) + for dec.More() { + var r ccIndexResult + if err := dec.Decode(&r); err != nil { + break + } + results = append(results, r) + } + + for _, r := range results { + if err := ctx.Err(); err != nil { + return err + } + + // Each indexed URL is a potential leak location; emit as finding. + out <- recon.Finding{ + ProviderName: q, + Source: r.URL, + SourceType: "recon:commoncrawl", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} diff --git a/pkg/recon/sources/commoncrawl_test.go b/pkg/recon/sources/commoncrawl_test.go new file mode 100644 index 0000000..a29fcb6 --- /dev/null +++ b/pkg/recon/sources/commoncrawl_test.go @@ -0,0 +1,70 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func TestCommonCrawl_Name(t *testing.T) { + s := &CommonCrawlSource{} + if s.Name() != "commoncrawl" { + t.Fatalf("expected commoncrawl, got %s", s.Name()) + } +} + +func TestCommonCrawl_Enabled(t *testing.T) { + s := &CommonCrawlSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("CommonCrawlSource should always be enabled (credentialless)") + } +} + +func TestCommonCrawl_Sweep(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + // NDJSON format: one JSON object per line. 
+ _, _ = w.Write([]byte(`{"url":"https://example.com/.env","timestamp":"20240101000000","status":"200","filename":"CC-MAIN-2024.warc.gz","length":"1234","offset":"5678"} +`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &CommonCrawlSource{ + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatal("expected at least one finding from Common Crawl index") + } + if findings[0].SourceType != "recon:commoncrawl" { + t.Fatalf("expected recon:commoncrawl, got %s", findings[0].SourceType) + } +} diff --git a/pkg/recon/sources/githubactions.go b/pkg/recon/sources/githubactions.go new file mode 100644 index 0000000..4fedab9 --- /dev/null +++ b/pkg/recon/sources/githubactions.go @@ -0,0 +1,142 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// GitHubActionsSource searches GitHub Actions workflow run logs for leaked API +// keys. Workflow logs are public for public repositories and frequently contain +// accidentally printed secrets, debug output with credentials, or insecure +// echo statements that expose environment variables. 
+type GitHubActionsSource struct { + Token string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*GitHubActionsSource)(nil) + +func (s *GitHubActionsSource) Name() string { return "ghactions" } +func (s *GitHubActionsSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *GitHubActionsSource) Burst() int { return 3 } +func (s *GitHubActionsSource) RespectsRobots() bool { return false } + +// Enabled requires a GitHub token (reuses GitHubToken from SourcesConfig). +func (s *GitHubActionsSource) Enabled(_ recon.Config) bool { return s.Token != "" } + +// ghActionsRunsResponse represents the GitHub Actions workflow runs list. +type ghActionsRunsResponse struct { + WorkflowRuns []ghActionsRun `json:"workflow_runs"` +} + +type ghActionsRun struct { + ID int64 `json:"id"` + LogsURL string `json:"logs_url"` + HTMLURL string `json:"html_url"` + Status string `json:"status"` + Conclusion string `json:"conclusion"` +} + +func (s *GitHubActionsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://api.github.com" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "ghactions") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Query the code search endpoint, scoped to .github/workflows, for the keyword. 
+ searchURL := fmt.Sprintf("%s/search/code?q=%s+path:.github/workflows", base, q) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + continue + } + req.Header.Set("Authorization", "Bearer "+s.Token) + req.Header.Set("Accept", "application/vnd.github.v3+json") + + resp, err := client.Do(ctx, req) + if err != nil { + continue + } + + var runs ghActionsRunsResponse + if err := json.NewDecoder(resp.Body).Decode(&runs); err != nil { + _ = resp.Body.Close() + continue + } + _ = resp.Body.Close() + + for _, run := range runs.WorkflowRuns { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Fetch log content. + logURL := fmt.Sprintf("%s/actions/runs/%d/logs", base, run.ID) + logReq, err := http.NewRequestWithContext(ctx, http.MethodGet, logURL, nil) + if err != nil { + continue + } + logReq.Header.Set("Authorization", "Bearer "+s.Token) + logReq.Header.Set("Accept", "application/vnd.github.v3+json") + + logResp, err := client.Do(ctx, logReq) + if err != nil { + continue + } + + body, err := io.ReadAll(io.LimitReader(logResp.Body, 256*1024)) + _ = logResp.Body.Close() + if err != nil { + continue + } + + if ciLogKeyPattern.Match(body) { + out <- recon.Finding{ + ProviderName: q, + Source: logURL, + SourceType: "recon:ghactions", + Confidence: "medium", + DetectedAt: time.Now(), + } + } + } + } + return nil +} diff --git a/pkg/recon/sources/githubactions_test.go b/pkg/recon/sources/githubactions_test.go new file mode 100644 index 0000000..e3bb53e --- /dev/null +++ b/pkg/recon/sources/githubactions_test.go @@ -0,0 +1,84 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func TestGitHubActions_Name(t 
*testing.T) { + s := &GitHubActionsSource{} + if s.Name() != "ghactions" { + t.Fatalf("expected ghactions, got %s", s.Name()) + } +} + +func TestGitHubActions_Enabled(t *testing.T) { + s := &GitHubActionsSource{} + if s.Enabled(recon.Config{}) { + t.Fatal("should be disabled without token") + } + s.Token = "ghp-test" + if !s.Enabled(recon.Config{}) { + t.Fatal("should be enabled with token") + } +} + +func TestGitHubActions_Sweep(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/search/code", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(ghActionsRunsResponse{ + WorkflowRuns: []ghActionsRun{ + {ID: 42, Status: "completed", Conclusion: "success"}, + }, + }) + }) + mux.HandleFunc("/actions/runs/42/logs", func(w http.ResponseWriter, r *http.Request) { + _, _ = fmt.Fprint(w, `2024-01-01T00:00:00Z Run setup +Setting env: API_KEY="sk-proj-LEAKED1234567890" +Tests passed.`) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &GitHubActionsSource{ + Token: "ghp-test", + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatal("expected at least one finding from GitHub Actions logs") + } + if findings[0].SourceType != "recon:ghactions" { + t.Fatalf("expected recon:ghactions, got %s", findings[0].SourceType) + } +} diff --git a/pkg/recon/sources/jenkins.go b/pkg/recon/sources/jenkins.go new file mode 100644 index 0000000..d3313b6 --- /dev/null +++ b/pkg/recon/sources/jenkins.go 
@@ -0,0 +1,134 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// JenkinsSource scrapes publicly accessible Jenkins build consoles for leaked +// API keys. Many Jenkins instances are exposed to the internet without +// authentication, and build console output frequently contains printed +// environment variables or secrets passed via command-line arguments. +type JenkinsSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*JenkinsSource)(nil) + +func (s *JenkinsSource) Name() string { return "jenkins" } +func (s *JenkinsSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } +func (s *JenkinsSource) Burst() int { return 2 } +func (s *JenkinsSource) RespectsRobots() bool { return true } +func (s *JenkinsSource) Enabled(_ recon.Config) bool { return true } + +// jenkinsJobsResponse represents the Jenkins API jobs listing. +type jenkinsJobsResponse struct { + Jobs []jenkinsJob `json:"jobs"` +} + +type jenkinsJob struct { + Name string `json:"name"` + URL string `json:"url"` + Color string `json:"color"` +} + +func (s *JenkinsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + return nil // No default; Jenkins instances are discovered via dorking + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "jenkins") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // List jobs from the Jenkins API. 
+ jobsURL := fmt.Sprintf("%s/api/json?tree=jobs[name,url,color]", base) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, jobsURL, nil) + if err != nil { + continue + } + req.Header.Set("Accept", "application/json") + + resp, err := client.Do(ctx, req) + if err != nil { + continue + } + + var jobs jenkinsJobsResponse + if err := json.NewDecoder(resp.Body).Decode(&jobs); err != nil { + _ = resp.Body.Close() + continue + } + _ = resp.Body.Close() + + for _, job := range jobs.Jobs { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Fetch the last build console output. + consoleURL := fmt.Sprintf("%s/job/%s/lastBuild/consoleText", base, job.Name) + consoleReq, err := http.NewRequestWithContext(ctx, http.MethodGet, consoleURL, nil) + if err != nil { + continue + } + + consoleResp, err := client.Do(ctx, consoleReq) + if err != nil { + continue + } + + body, err := io.ReadAll(io.LimitReader(consoleResp.Body, 256*1024)) + _ = consoleResp.Body.Close() + if err != nil { + continue + } + + if ciLogKeyPattern.Match(body) { + out <- recon.Finding{ + ProviderName: q, + Source: consoleURL, + SourceType: "recon:jenkins", + Confidence: "medium", + DetectedAt: time.Now(), + } + } + } + } + return nil +} diff --git a/pkg/recon/sources/jenkins_test.go b/pkg/recon/sources/jenkins_test.go new file mode 100644 index 0000000..8f8ab6e --- /dev/null +++ b/pkg/recon/sources/jenkins_test.go @@ -0,0 +1,75 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func TestJenkins_Name(t *testing.T) { + s := &JenkinsSource{} + if s.Name() != "jenkins" { + t.Fatalf("expected jenkins, got %s", s.Name()) + } +} + +func TestJenkins_Enabled(t *testing.T) { + s := &JenkinsSource{} + if 
!s.Enabled(recon.Config{}) { + t.Fatal("JenkinsSource should always be enabled (credentialless)") + } +} + +func TestJenkins_Sweep(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/api/json", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"jobs":[{"name":"deploy-prod","url":"http://jenkins/job/deploy-prod/","color":"blue"}]}`)) + }) + mux.HandleFunc("/job/deploy-prod/lastBuild/consoleText", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`Started by user admin +[Pipeline] echo +Setting AUTH_TOKEN="sk-proj-JENKINSLEAK123456" +[Pipeline] sh +Build SUCCESS`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &JenkinsSource{ + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatal("expected at least one finding from Jenkins console output") + } + if findings[0].SourceType != "recon:jenkins" { + t.Fatalf("expected recon:jenkins, got %s", findings[0].SourceType) + } +} diff --git a/pkg/recon/sources/jsbundle.go b/pkg/recon/sources/jsbundle.go new file mode 100644 index 0000000..36ca0fb --- /dev/null +++ b/pkg/recon/sources/jsbundle.go @@ -0,0 +1,116 @@ +package sources + +import ( + "context" + "fmt" + "io" + "net/http" + "regexp" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// JSBundleSource analyzes public JavaScript bundles for embedded API keys. 
+// Modern build tools (Webpack, Vite, esbuild, Rollup) inline environment +// variables and configuration at build time. This source probes common bundle +// paths and scans the minified JS for API key patterns, complementing +// WebpackSource by targeting raw key literals rather than env var prefixes. +type JSBundleSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*JSBundleSource)(nil) + +func (s *JSBundleSource) Name() string { return "jsbundle" } +func (s *JSBundleSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } +func (s *JSBundleSource) Burst() int { return 2 } +func (s *JSBundleSource) RespectsRobots() bool { return true } +func (s *JSBundleSource) Enabled(_ recon.Config) bool { return true } + +// jsBundleKeyPattern matches literal API key assignments commonly found in +// minified JS bundles (e.g., apiKey:"sk-proj-...", "Authorization":"Bearer sk-..."). +var jsBundleKeyPattern = regexp.MustCompile(`(?i)(?:api[_-]?key|secret|token|authorization|bearer)\s*[=:"']+\s*['"]?([a-zA-Z0-9_\-]{20,})['"]?`) + +// jsBundlePaths are common locations for production JS bundles. 
+var jsBundlePaths = []string{ + "/static/js/main.js", + "/static/js/app.js", + "/static/js/vendor.js", + "/dist/app.js", + "/dist/main.js", + "/assets/app.js", + "/assets/index.js", + "/js/app.js", + "/_next/static/chunks/main.js", + "/_next/static/chunks/pages/_app.js", +} + +func (s *JSBundleSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + return nil + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "jsbundle") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + for _, path := range jsBundlePaths { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + probeURL := fmt.Sprintf("%s%s", base, path) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil) + if err != nil { + continue + } + + resp, err := client.Do(ctx, req) + if err != nil { + continue + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024)) // 1MB max for JS bundles + _ = resp.Body.Close() + if err != nil { + continue + } + + if jsBundleKeyPattern.Match(body) { + out <- recon.Finding{ + ProviderName: q, + Source: probeURL, + SourceType: "recon:jsbundle", + Confidence: "medium", + DetectedAt: time.Now(), + } + break // one finding per query is sufficient + } + } + } + return nil +} diff --git a/pkg/recon/sources/jsbundle_test.go b/pkg/recon/sources/jsbundle_test.go new file mode 100644 index 0000000..d42460c --- /dev/null +++ b/pkg/recon/sources/jsbundle_test.go @@ -0,0 +1,68 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func TestJSBundle_Name(t *testing.T) { + 
s := &JSBundleSource{} + if s.Name() != "jsbundle" { + t.Fatalf("expected jsbundle, got %s", s.Name()) + } +} + +func TestJSBundle_Enabled(t *testing.T) { + s := &JSBundleSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("JSBundleSource should always be enabled (credentialless)") + } +} + +func TestJSBundle_Sweep(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/static/js/main.js", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/javascript") + _, _ = w.Write([]byte(`!function(e){var t={apiKey:"sk-proj-JSBUNDLELEAK123456789",baseUrl:"https://api.example.com"};e.exports=t}(module);`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &JSBundleSource{ + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatal("expected at least one finding from JS bundle") + } + if findings[0].SourceType != "recon:jsbundle" { + t.Fatalf("expected recon:jsbundle, got %s", findings[0].SourceType) + } +} diff --git a/pkg/recon/sources/travisci.go b/pkg/recon/sources/travisci.go new file mode 100644 index 0000000..2f703b4 --- /dev/null +++ b/pkg/recon/sources/travisci.go @@ -0,0 +1,140 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "regexp" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// TravisCISource scrapes public Travis CI build logs for leaked API keys. 
+// Travis CI exposes build logs publicly by default for open-source projects.
+// Developers frequently print environment variables or use secrets insecurely
+// in CI scripts, causing API keys to appear in build output.
+//
+// NOTE(review): Sweep defaults to https://api.travis-ci.org; confirm that this
+// host and the /builds "search" parameter are still served before relying on it.
+type TravisCISource struct {
+	// BaseURL is the API root; Sweep uses https://api.travis-ci.org when empty.
+	BaseURL string
+	// Registry is handed to BuildQueries to derive the search keywords.
+	Registry *providers.Registry
+	// Limiters, when non-nil, is waited on before every HTTP request.
+	Limiters *recon.LimiterRegistry
+	// Client performs the requests; Sweep calls NewClient() when it is nil.
+	Client *Client
+}
+
+var _ recon.ReconSource = (*TravisCISource)(nil)
+
+// Name returns the source identifier "travisci".
+func (s *TravisCISource) Name() string { return "travisci" }
+
+// RateLimit allows one request every three seconds.
+func (s *TravisCISource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
+
+// Burst returns the limiter burst size, 2.
+func (s *TravisCISource) Burst() int { return 2 }
+
+// RespectsRobots always reports false for this source.
+func (s *TravisCISource) RespectsRobots() bool { return false }
+
+// Enabled always reports true; the configuration is not consulted.
+func (s *TravisCISource) Enabled(_ recon.Config) bool { return true }
+
+// travisBuildResponse represents the Travis CI API builds response.
+type travisBuildResponse struct {
+	Builds []travisBuild `json:"builds"`
+}
+
+// travisBuild is one entry of travisBuildResponse; only ID is used by Sweep.
+type travisBuild struct {
+	ID    int    `json:"id"`
+	State string `json:"state"`
+}
+
+// ciLogKeyPattern matches API key patterns commonly leaked in CI logs.
+var ciLogKeyPattern = regexp.MustCompile(`(?i)(api[_-]?key|secret[_-]?key|token|password|credential|auth[_-]?token)['":\s]*[=:]\s*['"]?([a-zA-Z0-9_\-]{16,})['"]?`)
+
+// Sweep asks the API for builds matching each keyword returned by
+// BuildQueries, downloads the log of every listed build and sends one Finding
+// per log that matches ciLogKeyPattern.
+//
+// The sweep is best-effort: a request that cannot be built, fails, or returns
+// an unusable response only skips that keyword or build. A non-nil error is
+// returned only when ctx is done or the rate limiter fails. The second
+// parameter is unused.
+func (s *TravisCISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://api.travis-ci.org"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	for _, q := range BuildQueries(s.Registry, "travisci") {
+		if err := s.throttle(ctx); err != nil {
+			return err
+		}
+
+		for _, b := range s.searchBuilds(ctx, client, base, q) {
+			if err := s.throttle(ctx); err != nil {
+				return err
+			}
+
+			logURL := fmt.Sprintf("%s/builds/%d/log", base, b.ID)
+			if !s.logLeaksKey(ctx, client, logURL) {
+				continue
+			}
+
+			finding := recon.Finding{
+				ProviderName: q,
+				Source:       logURL,
+				SourceType:   "recon:travisci",
+				Confidence:   "medium",
+				DetectedAt:   time.Now(),
+			}
+			// Do not block forever on a consumer that stopped reading.
+			select {
+			case out <- finding:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
+
+// throttle returns ctx.Err() when ctx is already done; otherwise it waits on
+// the shared limiter if one is configured.
+func (s *TravisCISource) throttle(ctx context.Context) error {
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+	if s.Limiters == nil {
+		return nil
+	}
+	return s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false)
+}
+
+// searchBuilds returns the builds listed for keyword q, or nil when the
+// request cannot be built, fails, is answered with a non-2xx status, or does
+// not carry the expected JSON.
+func (s *TravisCISource) searchBuilds(ctx context.Context, client *Client, base, q string) []travisBuild {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+"/builds", nil)
+	if err != nil {
+		return nil
+	}
+	// Encode the parameters rather than formatting q into the URL, so keywords
+	// containing '&', '#', '+' or spaces reach the server intact.
+	params := req.URL.Query()
+	params.Set("search", q)
+	params.Set("limit", "5")
+	req.URL.RawQuery = params.Encode()
+	req.Header.Set("Travis-API-Version", "3")
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := client.Do(ctx, req)
+	if err != nil {
+		return nil
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode < 200 || resp.StatusCode > 299 {
+		return nil
+	}
+
+	var builds travisBuildResponse
+	if err := json.NewDecoder(resp.Body).Decode(&builds); err != nil {
+		return nil
+	}
+	return builds.Builds
+}
+
+// logLeaksKey downloads at most 256 KiB of the log at logURL and reports
+// whether it matches ciLogKeyPattern. Any request, status or read failure is
+// reported as false.
+func (s *TravisCISource) logLeaksKey(ctx context.Context, client *Client, logURL string) bool {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, logURL, nil)
+	if err != nil {
+		return false
+	}
+	req.Header.Set("Travis-API-Version", "3")
+	req.Header.Set("Accept", "text/plain")
+
+	resp, err := client.Do(ctx, req)
+	if err != nil {
+		return false
+	}
+	defer func() { _ = resp.Body.Close() }()
+	// An error page must not be scanned as if it were a build log.
+	if resp.StatusCode < 200 || resp.StatusCode > 299 {
+		return false
+	}
+
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
+	if err != nil {
+		return false
+	}
+	return ciLogKeyPattern.Match(body)
+}
diff --git a/pkg/recon/sources/travisci_test.go b/pkg/recon/sources/travisci_test.go
new file mode 100644
index 0000000..0c61b57
--- /dev/null
+++ b/pkg/recon/sources/travisci_test.go
@@ -0,0 +1,74 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// TestTravisCI_Name checks that the source identifies itself as "travisci".
+func TestTravisCI_Name(t *testing.T) {
+	s := &TravisCISource{}
+	if s.Name() != "travisci" {
+		t.Fatalf("expected travisci, got %s", s.Name())
+	}
+}
+
+// TestTravisCI_Enabled checks that the source reports itself enabled for an
+// empty recon.Config, i.e. without any credentials configured.
+func TestTravisCI_Enabled(t *testing.T) {
+	src := &TravisCISource{}
+	if enabled := src.Enabled(recon.Config{}); !enabled {
+		t.Fatal("TravisCISource should always be enabled (credentialless)")
+	}
+}
+
+// TestTravisCI_Sweep runs Sweep against a local test server that lists one
+// build (id 123) and serves a log for it containing an exported API_KEY, and
+// expects at least one finding whose SourceType is "recon:travisci".
+func TestTravisCI_Sweep(t *testing.T) {
+	const buildList = `{"builds":[{"id":123,"state":"passed"}]}`
+	const buildLog = `Setting environment variables
+export API_KEY="sk-proj-ABCDEF1234567890"
+Running tests...`
+
+	// respond builds a handler that writes a fixed body with a content type.
+	respond := func(contentType, body string) http.HandlerFunc {
+		return func(w http.ResponseWriter, _ *http.Request) {
+			w.Header().Set("Content-Type", contentType)
+			_, _ = w.Write([]byte(body))
+		}
+	}
+
+	routes := http.NewServeMux()
+	routes.HandleFunc("/builds", respond("application/json", buildList))
+	routes.HandleFunc("/builds/123/log", respond("text/plain", buildLog))
+
+	server := httptest.NewServer(routes)
+	defer server.Close()
+
+	registry := providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+
+	src := &TravisCISource{
+		BaseURL:  server.URL,
+		Registry: registry,
+		Client:   NewClient(),
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Sweep is called synchronously, so the buffer has to absorb everything it
+	// emits; after it returns nothing else sends and the channel can be closed.
+	results := make(chan recon.Finding, 10)
+	sweepErr := src.Sweep(ctx, "", results)
+	close(results)
+	if sweepErr != nil {
+		t.Fatalf("Sweep error: %v", sweepErr)
+	}
+
+	var got []recon.Finding
+	for finding := range results {
+		got = append(got, finding)
+	}
+
+	switch {
+	case len(got) == 0:
+		t.Fatal("expected at least one finding from Travis CI build log")
+	case got[0].SourceType != "recon:travisci":
+		t.Fatalf("expected recon:travisci, got %s", got[0].SourceType)
+	}
+}
diff --git a/pkg/recon/sources/wayback.go b/pkg/recon/sources/wayback.go
new file mode 100644
index 0000000..9851ede
--- /dev/null
+++ b/pkg/recon/sources/wayback.go
@@ -0,0 +1,134 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// WaybackMachineSource searches the Internet
Archive's Wayback Machine CDX API
+// for archived pages that may contain leaked API keys. Developers sometimes
+// remove secrets from live pages but cached versions persist in web archives.
+type WaybackMachineSource struct {
+	// BaseURL is the archive root; Sweep uses https://web.archive.org when empty.
+	BaseURL string
+	// Registry is handed to BuildQueries to derive the search keywords.
+	Registry *providers.Registry
+	// Limiters, when non-nil, is waited on before every HTTP request.
+	Limiters *recon.LimiterRegistry
+	// Client performs the requests; Sweep calls NewClient() when it is nil.
+	Client *Client
+}
+
+var _ recon.ReconSource = (*WaybackMachineSource)(nil)
+
+// Name returns the source identifier "wayback".
+func (s *WaybackMachineSource) Name() string { return "wayback" }
+
+// RateLimit allows one request every five seconds.
+func (s *WaybackMachineSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) }
+
+// Burst returns the limiter burst size, 1.
+func (s *WaybackMachineSource) Burst() int { return 1 }
+
+// RespectsRobots always reports true for this source.
+func (s *WaybackMachineSource) RespectsRobots() bool { return true }
+
+// Enabled always reports true; the configuration is not consulted.
+func (s *WaybackMachineSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep queries the CDX index for captures whose URL contains each keyword
+// returned by BuildQueries, downloads every listed capture and sends one
+// Finding per capture whose body matches apiKeyPattern.
+//
+// The sweep is best-effort: a request that cannot be built, fails, or returns
+// an unusable response only skips that keyword or capture. A non-nil error is
+// returned only when ctx is done or the rate limiter fails. The second
+// parameter is unused.
+func (s *WaybackMachineSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://web.archive.org"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	for _, q := range BuildQueries(s.Registry, "wayback") {
+		if err := s.throttle(ctx); err != nil {
+			return err
+		}
+
+		for _, row := range s.searchCaptures(ctx, client, base, q) {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			// Rows are [url, timestamp, statuscode]; skip anything shorter
+			// before spending a rate-limit slot on it.
+			if len(row) < 2 {
+				continue
+			}
+			archivedURL, timestamp := row[0], row[1]
+
+			if err := s.throttle(ctx); err != nil {
+				return err
+			}
+
+			// Fetch the archived page content.
+			snapshotURL := fmt.Sprintf("%s/web/%sid_/%s", base, timestamp, archivedURL)
+			if !s.snapshotLeaksKey(ctx, client, snapshotURL) {
+				continue
+			}
+
+			finding := recon.Finding{
+				ProviderName: q,
+				Source:       snapshotURL,
+				SourceType:   "recon:wayback",
+				Confidence:   "medium",
+				DetectedAt:   time.Now(),
+			}
+			// Do not block forever on a consumer that stopped reading.
+			select {
+			case out <- finding:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
+
+// throttle returns ctx.Err() when ctx is already done; otherwise it waits on
+// the shared limiter if one is configured.
+func (s *WaybackMachineSource) throttle(ctx context.Context) error {
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+	if s.Limiters == nil {
+		return nil
+	}
+	return s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false)
+}
+
+// searchCaptures returns the CDX rows for keyword q with the leading header
+// row removed, or nil when the request cannot be built, fails, is answered
+// with a non-2xx status, or does not carry the expected JSON.
+func (s *WaybackMachineSource) searchCaptures(ctx context.Context, client *Client, base, q string) [][]string {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+"/cdx/search/cdx", nil)
+	if err != nil {
+		return nil
+	}
+	// CDX API: wildcard match on the URL, at most ten rows, captures with HTTP
+	// status 200 only. No file-type filter is applied. The parameters are
+	// encoded so keywords containing '&', '#' or spaces reach the server intact.
+	params := req.URL.Query()
+	params.Set("url", "*"+q+"*")
+	params.Set("output", "json")
+	params.Set("limit", "10")
+	params.Set("fl", "url,timestamp,statuscode")
+	params.Set("filter", "statuscode:200")
+	req.URL.RawQuery = params.Encode()
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := client.Do(ctx, req)
+	if err != nil {
+		return nil
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode < 200 || resp.StatusCode > 299 {
+		return nil
+	}
+
+	var rows [][]string
+	if err := json.NewDecoder(resp.Body).Decode(&rows); err != nil {
+		return nil
+	}
+	// Skip the header row if present.
+	if len(rows) > 0 && len(rows[0]) > 0 && rows[0][0] == "url" {
+		rows = rows[1:]
+	}
+	return rows
+}
+
+// snapshotLeaksKey downloads at most 256 KiB of the capture at snapshotURL and
+// reports whether it matches apiKeyPattern. Any request, status or read
+// failure is reported as false.
+func (s *WaybackMachineSource) snapshotLeaksKey(ctx context.Context, client *Client, snapshotURL string) bool {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, snapshotURL, nil)
+	if err != nil {
+		return false
+	}
+
+	resp, err := client.Do(ctx, req)
+	if err != nil {
+		return false
+	}
+	defer func() { _ = resp.Body.Close() }()
+	// An error page must not be scanned as if it were archived content.
+	if resp.StatusCode < 200 || resp.StatusCode > 299 {
+		return false
+	}
+
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
+	if err != nil {
+		return false
+	}
+	return apiKeyPattern.Match(body)
+}
diff --git a/pkg/recon/sources/wayback_test.go b/pkg/recon/sources/wayback_test.go
new file mode 100644
index 0000000..0d5f99b
--- /dev/null
+++ b/pkg/recon/sources/wayback_test.go
@@ -0,0 +1,71 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+
"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// TestWayback_Name checks that the source identifies itself as "wayback".
+func TestWayback_Name(t *testing.T) {
+	src := &WaybackMachineSource{}
+	if got := src.Name(); got != "wayback" {
+		t.Fatalf("expected wayback, got %s", got)
+	}
+}
+
+// TestWayback_Enabled checks that the source reports itself enabled for an
+// empty recon.Config, i.e. without any credentials configured.
+func TestWayback_Enabled(t *testing.T) {
+	src := &WaybackMachineSource{}
+	if enabled := src.Enabled(recon.Config{}); !enabled {
+		t.Fatal("WaybackMachineSource should always be enabled (credentialless)")
+	}
+}
+
+// TestWayback_Sweep runs Sweep against a local test server whose CDX endpoint
+// lists one capture and whose /web/ endpoint serves a body containing an
+// OPENAI_API_KEY assignment, and expects at least one finding whose SourceType
+// is "recon:wayback".
+func TestWayback_Sweep(t *testing.T) {
+	const cdxRows = `[["url","timestamp","statuscode"],["https://example.com/.env","20240101000000","200"]]`
+	const snapshot = `OPENAI_API_KEY="sk-proj-WAYBACKLEAK12345678"`
+
+	serveIndex := func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(cdxRows))
+	}
+	serveSnapshot := func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(snapshot))
+	}
+
+	routes := http.NewServeMux()
+	routes.HandleFunc("/cdx/search/cdx", serveIndex)
+	routes.HandleFunc("/web/", serveSnapshot)
+
+	server := httptest.NewServer(routes)
+	defer server.Close()
+
+	registry := providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+
+	src := &WaybackMachineSource{
+		BaseURL:  server.URL,
+		Registry: registry,
+		Client:   NewClient(),
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Sweep is called synchronously, so the buffer has to absorb everything it
+	// emits; after it returns nothing else sends and the channel can be closed.
+	results := make(chan recon.Finding, 10)
+	sweepErr := src.Sweep(ctx, "", results)
+	close(results)
+	if sweepErr != nil {
+		t.Fatalf("Sweep error: %v", sweepErr)
+	}
+
+	var got []recon.Finding
+	for finding := range results {
+		got = append(got, finding)
+	}
+
+	switch {
+	case len(got) == 0:
+		t.Fatal("expected at least one finding from Wayback Machine archives")
+	case got[0].SourceType != "recon:wayback":
+		t.Fatalf("expected recon:wayback, got %s", got[0].SourceType)
+	}
+}