From 5d568333c7b7dbd000d5d4f81c1069cc8662a900 Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Mon, 6 Apr 2026 13:50:14 +0300
Subject: [PATCH] feat(15-02): add Confluence and GoogleDocs ReconSource
 implementations

- ConfluenceSource searches exposed instances via /rest/api/content/search CQL
- GoogleDocsSource uses dorking + /export?format=txt for plain-text scanning
- HTML tag stripping for Confluence storage format
- Both credentialless, tests with httptest mocks confirm findings
---
 pkg/recon/sources/confluence.go      | 139 ++++++++++++++++++++++++++
 pkg/recon/sources/confluence_test.go |  77 +++++++++++++++
 pkg/recon/sources/googledocs.go      | 150 +++++++++++++++++++++++++++
 pkg/recon/sources/googledocs_test.go |  79 +++++++++++++++
 4 files changed, 445 insertions(+)
 create mode 100644 pkg/recon/sources/confluence.go
 create mode 100644 pkg/recon/sources/confluence_test.go
 create mode 100644 pkg/recon/sources/googledocs.go
 create mode 100644 pkg/recon/sources/googledocs_test.go

diff --git a/pkg/recon/sources/confluence.go b/pkg/recon/sources/confluence.go
new file mode 100644
index 0000000..6059403
--- /dev/null
+++ b/pkg/recon/sources/confluence.go
@@ -0,0 +1,139 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// ConfluenceSource searches publicly exposed Confluence wikis for leaked API
+// keys. Many Confluence instances are misconfigured to allow anonymous access
+// and their REST API exposes page content including credentials pasted into
+// documentation.
+type ConfluenceSource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+var _ recon.ReconSource = (*ConfluenceSource)(nil)
+
+func (s *ConfluenceSource) Name() string { return "confluence" }
+func (s *ConfluenceSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
+func (s *ConfluenceSource) Burst() int { return 2 }
+func (s *ConfluenceSource) RespectsRobots() bool { return true }
+func (s *ConfluenceSource) Enabled(_ recon.Config) bool { return true }
+
+// confluenceSearchResponse represents the Confluence REST API content search response.
+type confluenceSearchResponse struct {
+	Results []confluenceResult `json:"results"`
+}
+
+type confluenceResult struct {
+	ID    string          `json:"id"`
+	Title string          `json:"title"`
+	Body  confluenceBody  `json:"body"`
+	Links confluenceLinks `json:"_links"`
+}
+
+type confluenceBody struct {
+	Storage confluenceStorage `json:"storage"`
+}
+
+type confluenceStorage struct {
+	Value string `json:"value"`
+}
+
+type confluenceLinks struct {
+	WebUI string `json:"webui"`
+}
+
+// htmlTagPattern strips HTML tags to extract text content from Confluence storage format.
+var htmlTagPattern = regexp.MustCompile(`<[^>]*>`)
+
+func (s *ConfluenceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://confluence.example.com"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "confluence")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		// Search Confluence via CQL (Confluence Query Language).
+		searchURL := fmt.Sprintf("%s/rest/api/content/search?cql=%s&limit=10&expand=body.storage",
+			base, url.QueryEscape(fmt.Sprintf(`text~"%s"`, q)))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			continue
+		}
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			continue
+		}
+		// Login pages / rate-limit errors are not search results; skip non-200.
+		if resp.StatusCode != http.StatusOK {
+			_ = resp.Body.Close()
+			continue
+		}
+
+		body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
+		_ = resp.Body.Close()
+		if err != nil {
+			continue
+		}
+
+		var result confluenceSearchResponse
+		if err := json.Unmarshal(body, &result); err != nil {
+			continue
+		}
+
+		for _, page := range result.Results {
+			// Strip HTML tags to get plain text for key matching.
+			plainText := htmlTagPattern.ReplaceAllString(page.Body.Storage.Value, " ")
+
+			if ciLogKeyPattern.MatchString(plainText) {
+				pageURL := fmt.Sprintf("%s%s", base, page.Links.WebUI)
+				// NOTE(review): q is the search query string, not a provider name — confirm consumers expect this.
+				out <- recon.Finding{
+					ProviderName: q,
+					Source:       pageURL,
+					SourceType:   "recon:confluence",
+					Confidence:   "medium",
+					DetectedAt:   time.Now(),
+				}
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/recon/sources/confluence_test.go b/pkg/recon/sources/confluence_test.go
new file mode 100644
index 0000000..18bac86
--- /dev/null
+++ b/pkg/recon/sources/confluence_test.go
@@ -0,0 +1,77 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func TestConfluence_Name(t *testing.T) {
+	s := &ConfluenceSource{}
+	if s.Name() != "confluence" {
+		t.Fatalf("expected confluence, got %s", s.Name())
+	}
+}
+
+func TestConfluence_Enabled(t *testing.T) {
+	s := &ConfluenceSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("ConfluenceSource should always be enabled (credentialless)")
+	}
+}
+
+func TestConfluence_Sweep(t *testing.T) {
+	mux := http.NewServeMux()
+	mux.HandleFunc("/rest/api/content/search", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"results":[{
+			"id":"12345",
+			"title":"API Configuration",
+			"body":{"storage":{"value":"<p>Production credentials: secret_key = sk-proj-ABCDEF1234567890abcdef</p>"}},
+			"_links":{"webui":"/display/TEAM/API+Configuration"}
+		}]}`))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+
+	reg := providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+
+	s := &ConfluenceSource{
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Client:   NewClient(),
+	}
+
+	out := make(chan recon.Finding, 10)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	err := s.Sweep(ctx, "", out)
+	close(out)
+	if err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) == 0 {
+		t.Fatal("expected at least one finding from Confluence page")
+	}
+	if findings[0].SourceType != "recon:confluence" {
+		t.Fatalf("expected recon:confluence, got %s", findings[0].SourceType)
+	}
+	expected := srv.URL + "/display/TEAM/API+Configuration"
+	if findings[0].Source != expected {
+		t.Fatalf("expected %s, got %s", expected, findings[0].Source)
+	}
+}
diff --git a/pkg/recon/sources/googledocs.go b/pkg/recon/sources/googledocs.go
new file mode 100644
index 0000000..96251a3
--- /dev/null
+++ b/pkg/recon/sources/googledocs.go
@@ -0,0 +1,150 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// GoogleDocsSource searches publicly shared Google Docs for leaked API keys.
+// Google Docs shared with "anyone with the link" are indexable by search
+// engines. This source uses a dorking approach to discover public docs and
+// then fetches their plain-text export for credential scanning.
+type GoogleDocsSource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+var _ recon.ReconSource = (*GoogleDocsSource)(nil)
+
+func (s *GoogleDocsSource) Name() string { return "googledocs" }
+func (s *GoogleDocsSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
+func (s *GoogleDocsSource) Burst() int { return 2 }
+func (s *GoogleDocsSource) RespectsRobots() bool { return true }
+func (s *GoogleDocsSource) Enabled(_ recon.Config) bool { return true }
+
+// googleDocsSearchResponse represents dork search results for Google Docs.
+type googleDocsSearchResponse struct {
+	Results []googleDocsSearchResult `json:"results"`
+}
+
+type googleDocsSearchResult struct {
+	URL   string `json:"url"`
+	Title string `json:"title"`
+}
+
+func (s *GoogleDocsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://search.googledocs.dev"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "googledocs")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		// Search for public Google Docs via dorking.
+		searchURL := fmt.Sprintf("%s/search?q=%s&format=json",
+			base, url.QueryEscape("site:docs.google.com "+q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			continue
+		}
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			continue
+		}
+		// Error pages are not valid search results; skip non-200 responses.
+		if resp.StatusCode != http.StatusOK {
+			_ = resp.Body.Close()
+			continue
+		}
+
+		body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
+		_ = resp.Body.Close()
+		if err != nil {
+			continue
+		}
+
+		var results googleDocsSearchResponse
+		if err := json.Unmarshal(body, &results); err != nil {
+			continue
+		}
+
+		// Fetch each discovered doc's plain-text export.
+		for _, result := range results.Results {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+
+			if s.Limiters != nil {
+				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+					return err
+				}
+			}
+
+			exportURL := result.URL + "/export?format=txt"
+			docReq, err := http.NewRequestWithContext(ctx, http.MethodGet, exportURL, nil)
+			if err != nil {
+				continue
+			}
+
+			docResp, err := client.Do(ctx, docReq)
+			if err != nil {
+				continue
+			}
+			// Private/deleted docs return non-200; don't scan error bodies.
+			if docResp.StatusCode != http.StatusOK {
+				_ = docResp.Body.Close()
+				continue
+			}
+
+			docBody, err := io.ReadAll(io.LimitReader(docResp.Body, 256*1024))
+			_ = docResp.Body.Close()
+			if err != nil {
+				continue
+			}
+
+			if ciLogKeyPattern.Match(docBody) {
+				// NOTE(review): q is the search query string, not a provider name — confirm consumers expect this.
+				out <- recon.Finding{
+					ProviderName: q,
+					Source:       result.URL,
+					SourceType:   "recon:googledocs",
+					Confidence:   "medium",
+					DetectedAt:   time.Now(),
+				}
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/recon/sources/googledocs_test.go b/pkg/recon/sources/googledocs_test.go
new file mode 100644
index 0000000..4d5fac6
--- /dev/null
+++ b/pkg/recon/sources/googledocs_test.go
@@ -0,0 +1,79 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func TestGoogleDocs_Name(t *testing.T) {
+	s := &GoogleDocsSource{}
+	if s.Name() != "googledocs" {
+		t.Fatalf("expected googledocs, got %s", s.Name())
+	}
+}
+
+func TestGoogleDocs_Enabled(t *testing.T) {
+	s := &GoogleDocsSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("GoogleDocsSource should always be enabled (credentialless)")
+	}
+}
+
+func TestGoogleDocs_Sweep(t *testing.T) {
+	mux := http.NewServeMux()
+
+	// Mock search endpoint returning a doc URL.
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"results":[{"url":"` + "http://" + r.Host + `/doc/d/1a2b3c","title":"Setup Guide"}]}`))
+	})
+
+	// Mock plain-text export with a leaked key.
+	mux.HandleFunc("/doc/d/1a2b3c/export", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		_, _ = w.Write([]byte(`Setup Instructions
+Step 1: Set your API key
+auth_token = sk-proj-ABCDEF1234567890abcdef
+Step 2: Run the service`))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+
+	reg := providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+
+	s := &GoogleDocsSource{
+		BaseURL:  srv.URL,
+		Registry: reg,
+		Client:   NewClient(),
+	}
+
+	out := make(chan recon.Finding, 10)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	err := s.Sweep(ctx, "", out)
+	close(out)
+	if err != nil {
+		t.Fatalf("Sweep error: %v", err)
+	}
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) == 0 {
+		t.Fatal("expected at least one finding from Google Docs export")
+	}
+	if findings[0].SourceType != "recon:googledocs" {
+		t.Fatalf("expected recon:googledocs, got %s", findings[0].SourceType)
+	}
+}