From 0137dc57b170b507350a8336ab2700a5cfd6d0bd Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Mon, 6 Apr 2026 01:15:49 +0300
Subject: [PATCH] feat(10-03): add GitLabSource for /api/v4/search blobs

- Implements recon.ReconSource against GitLab Search API
- PRIVATE-TOKEN header auth; rate.Every(30ms) burst 5 (~2000/min)
- Disabled when token empty; Sweep returns nil without calls
- Emits Finding per blob with Source=/projects//-/blob//
- 401 wrapped as ErrUnauthorized; ctx cancellation honored
- httptest coverage: enabled gating, happy path, 401, ctx cancel, iface assert
---
 pkg/recon/sources/gitlab.go      | 175 +++++++++++++++++++++++
 pkg/recon/sources/gitlab_test.go | 229 +++++++++++++++++++++++++++++++
 2 files changed, 404 insertions(+)
 create mode 100644 pkg/recon/sources/gitlab.go
 create mode 100644 pkg/recon/sources/gitlab_test.go

diff --git a/pkg/recon/sources/gitlab.go b/pkg/recon/sources/gitlab.go
new file mode 100644
index 0000000..18d4d9e
--- /dev/null
+++ b/pkg/recon/sources/gitlab.go
@@ -0,0 +1,175 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// GitLabSource implements recon.ReconSource against GitLab's Search API
+// (/api/v4/search?scope=blobs). It honors PRIVATE-TOKEN header auth and the
+// published 2000 req/min rate limit. Sweep iterates BuildQueries(reg, "gitlab")
+// — one request per keyword-derived query — and emits one Finding per returned
+// blob, with Source pointing at the blob's project/ref/path URL.
+//
+// RECON-CODE-02.
+type GitLabSource struct {
+	// Token is the GitLab personal access token. When blank (empty or only
+	// whitespace) the source is disabled: Enabled is false, Sweep is a no-op.
+	Token string
+	// BaseURL is the GitLab instance root; empty defaults to https://gitlab.com.
+	BaseURL string
+	// Registry drives query generation via BuildQueries and provider name
+	// mapping for emitted findings.
+	Registry *providers.Registry
+	// Limiters is the shared per-source rate-limiter registry; nil makes Sweep create one.
+	Limiters *recon.LimiterRegistry
+
+	// client is the retry-aware HTTP wrapper. A nil client is replaced with
+	// NewClient() lazily inside Sweep so zero-value construction works.
+	client *Client
+}
+
+// Compile-time check that *GitLabSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*GitLabSource)(nil)
+
+// Name returns the stable source identifier, "gitlab".
+func (s *GitLabSource) Name() string { return "gitlab" }
+
+// RateLimit returns one token every 30ms, i.e. roughly 2000 requests/minute.
+func (s *GitLabSource) RateLimit() rate.Limit { return rate.Every(30 * time.Millisecond) }
+
+// Burst returns the limiter burst size: up to 5 back-to-back requests.
+func (s *GitLabSource) Burst() int { return 5 }
+
+// RespectsRobots returns false: this source uses an authenticated REST API,
+// not HTML scraping.
+func (s *GitLabSource) RespectsRobots() bool { return false }
+
+// Enabled reports whether Token is non-blank; the recon.Config argument is unused.
+func (s *GitLabSource) Enabled(_ recon.Config) bool { return strings.TrimSpace(s.Token) != "" }
+
+// glBlob is the subset of a GitLab Search API blob hit that gets decoded.
+type glBlob struct {
+	Basename  string `json:"basename"`   // decoded but not read by Sweep
+	Data      string `json:"data"`       // decoded but not read by Sweep
+	Path      string `json:"path"`       // file path segment of Finding.Source
+	ProjectID int    `json:"project_id"` // project segment of Finding.Source
+	Ref       string `json:"ref"`        // ref segment of Finding.Source
+	Startline int    `json:"startline"`  // copied to Finding.LineNumber
+}
+
+// Sweep runs the GitLab blob search for every keyword-derived query and emits
+// one Finding per blob. It returns nil when the source is disabled (empty
+// token) so callers can safely skip without special-casing.
+func (s *GitLabSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	if !s.Enabled(recon.Config{}) { // blank token: nothing to do, and not an error
+		return nil
+	}
+	if s.client == nil {
+		s.client = NewClient()
+	}
+	base := strings.TrimRight(s.BaseURL, "/")
+	if base == "" {
+		base = "https://gitlab.com"
+	}
+	limiters := s.Limiters
+	if limiters == nil {
+		limiters = recon.NewLimiterRegistry()
+	}
+
+	queries := BuildQueries(s.Registry, "gitlab")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	kwIndex := gitlabKeywordIndex(s.Registry)
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if err := limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+			return err
+		}
+
+		endpoint := fmt.Sprintf("%s/api/v4/search?scope=blobs&search=%s&per_page=20",
+			base, url.QueryEscape(q)) // only this single page of up to 20 hits is read per query
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("gitlab: build request: %w", err)
+		}
+		req.Header.Set("PRIVATE-TOKEN", s.Token)
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := s.client.Do(ctx, req)
+		if err != nil {
+			return fmt.Errorf("gitlab: %w", err)
+		}
+
+		var blobs []glBlob
+		decErr := json.NewDecoder(resp.Body).Decode(&blobs)
+		_ = resp.Body.Close() // close error is not actionable; the decode error is checked next
+		if decErr != nil {
+			return fmt.Errorf("gitlab: decode: %w", decErr)
+		}
+
+		// For "gitlab", BuildQueries emits bare keywords, so a direct map
+		// lookup recovers the provider name for each query.
+		provName := kwIndex[q]
+		if provName == "" {
+			provName = "unknown"
+		}
+
+		for _, b := range blobs {
+			// Path-escape so '#', '?' or spaces in Ref/Path cannot truncate the link; '/' is kept.
+			u := url.URL{Path: fmt.Sprintf("/projects/%d/-/blob/%s/%s", b.ProjectID, b.Ref, b.Path)}
+			finding := recon.Finding{
+				ProviderName: provName,
+				Confidence:   "low",
+				Source:       base + u.EscapedPath(),
+				SourceType:   "recon:gitlab",
+				LineNumber:   b.Startline,
+				DetectedAt:   time.Now().UTC(),
+			}
+			select {
+			case out <- finding:
+			case <-ctx.Done(): // receiver gone or deadline hit: stop instead of blocking on out
+				return ctx.Err()
+			}
+		}
+	}
+
+	return nil
+}
+
+// gitlabKeywordIndex maps each non-empty provider keyword to its provider name
+// for Finding.ProviderName; on duplicates the first provider listed wins and a
+// nil registry yields an empty map. The "gitlab" prefix avoids colliding with
+// the shared keywordIndex helper introduced by peer sources (github.go).
+func gitlabKeywordIndex(reg *providers.Registry) map[string]string {
+	idx := make(map[string]string)
+	if reg == nil {
+		return idx
+	}
+	for _, p := range reg.List() {
+		for _, kw := range p.Keywords {
+			if kw == "" {
+				continue
+			}
+			if _, exists := idx[kw]; !exists {
+				idx[kw] = p.Name
+			}
+		}
+	}
+	return idx
+}
diff --git a/pkg/recon/sources/gitlab_test.go b/pkg/recon/sources/gitlab_test.go
new file mode 100644
index 0000000..6bb9a4e
--- /dev/null
+++ b/pkg/recon/sources/gitlab_test.go
@@ -0,0 +1,229 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// gitlabTestRegistry returns a synthetic registry with two providers whose
+// keywords drive the query loop. Keywords are chosen so BuildQueries output is
+// deterministic and map lookups are unambiguous.
+func gitlabTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{
+			Name:     "openai",
+			Keywords: []string{"sk-test"},
+			Patterns: []providers.Pattern{{Regex: "sk-test[A-Za-z0-9]+", Confidence: "high"}},
+		},
+		{
+			Name:     "demo",
+			Keywords: []string{"ghkey"},
+			Patterns: []providers.Pattern{{Regex: "ghkey[A-Za-z0-9]+", Confidence: "low"}},
+		},
+	})
+}
+
+type gitlabBlobFixture struct { // same JSON field tags as glBlob; encoded by the fake server
+	Basename  string `json:"basename"`
+	Data      string `json:"data"`
+	Path      string `json:"path"`
+	ProjectID int    `json:"project_id"`
+	Ref       string `json:"ref"`
+	Startline int    `json:"startline"`
+}
+
+func TestGitLabSource_EnabledFalseWhenTokenEmpty(t *testing.T) {
+	s := &GitLabSource{Token: "", Registry: gitlabTestRegistry(), Limiters: recon.NewLimiterRegistry()}
+	if s.Enabled(recon.Config{}) {
+		t.Fatalf("expected Enabled=false when token empty")
+	}
+	s2 := &GitLabSource{Token: "glpat-xxx", Registry: gitlabTestRegistry(), Limiters: recon.NewLimiterRegistry()}
+	if !s2.Enabled(recon.Config{}) {
+		t.Fatalf("expected Enabled=true when token set")
+	}
+	if s.Name() != "gitlab" {
+		t.Fatalf("expected Name=gitlab, got %q", s.Name())
+	}
+	if s.RespectsRobots() {
+		t.Fatalf("expected RespectsRobots=false for REST API source")
+	}
+}
+
+func TestGitLabSource_EmptyToken_NoCallsNoError(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1) // any request at all fails the zero-calls assertion below
+		w.WriteHeader(200)
+		_, _ = w.Write([]byte("[]"))
+	}))
+	defer srv.Close()
+
+	s := &GitLabSource{
+		Token:    "",
+		BaseURL:  srv.URL,
+		Registry: gitlabTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+	}
+	out := make(chan recon.Finding, 4)
+	if err := s.Sweep(context.Background(), "", out); err != nil {
+		t.Fatalf("expected nil err on empty token, got %v", err)
+	}
+	close(out) // Sweep has returned, so nothing can send on out any more
+	if atomic.LoadInt32(&calls) != 0 {
+		t.Fatalf("expected zero HTTP calls, got %d", calls)
+	}
+	if len(out) != 0 { // len of a buffered channel is the number of queued findings
+		t.Fatalf("expected zero findings, got %d", len(out))
+	}
+}
+
+func TestGitLabSource_Sweep_EmitsFindings(t *testing.T) {
+	var gotToken string
+	var gotScopes []string
+	var gotSearches []string
+
+	blobs := []gitlabBlobFixture{
+		{Basename: "config.env", Data: "API_KEY=sk-testABCDEF", Path: "app/config.env", ProjectID: 42, Ref: "main", Startline: 3},
+		{Basename: "README.md", Data: "use ghkeyXYZ", Path: "docs/README.md", ProjectID: 99, Ref: "master", Startline: 10},
+	}
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/v4/search" {
+			http.Error(w, "not found", 404)
+			return
+		}
+		gotToken = r.Header.Get("PRIVATE-TOKEN")
+		gotScopes = append(gotScopes, r.URL.Query().Get("scope"))
+		gotSearches = append(gotSearches, r.URL.Query().Get("search"))
+		w.Header().Set("Content-Type", "application/json")
+		_ = json.NewEncoder(w).Encode(blobs) // the same two blobs are returned for every query
+	}))
+	defer srv.Close()
+
+	s := &GitLabSource{
+		Token:    "glpat-secret",
+		BaseURL:  srv.URL,
+		Registry: gitlabTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+	}
+
+	out := make(chan recon.Finding, 32) // nothing reads out while Sweep runs, so the buffer must hold every finding
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	if err := s.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("sweep err: %v", err)
+	}
+	close(out) // lets gitlabDrain's range loop terminate
+
+	if gotToken != "glpat-secret" {
+		t.Fatalf("expected PRIVATE-TOKEN header, got %q", gotToken)
+	}
+	for _, sc := range gotScopes {
+		if sc != "blobs" {
+			t.Fatalf("expected scope=blobs, got %q", sc)
+		}
+	}
+	// Two providers → two queries → two requests → 4 findings (2 blobs each).
+	if len(gotSearches) != 2 {
+		t.Fatalf("expected 2 search calls, got %d: %v", len(gotSearches), gotSearches)
+	}
+
+	findings := gitlabDrain(out)
+	if len(findings) != 4 {
+		t.Fatalf("expected 4 findings (2 blobs × 2 queries), got %d", len(findings))
+	}
+
+	var sawP42, sawP99 bool
+	for _, f := range findings {
+		if f.SourceType != "recon:gitlab" {
+			t.Errorf("bad SourceType: %q", f.SourceType)
+		}
+		if f.Confidence != "low" {
+			t.Errorf("bad confidence: %q", f.Confidence)
+		}
+		if strings.Contains(f.Source, "/projects/42/-/blob/main/app/config.env") {
+			sawP42 = true
+		}
+		if strings.Contains(f.Source, "/projects/99/-/blob/master/docs/README.md") {
+			sawP99 = true
+		}
+	}
+	if !sawP42 || !sawP99 {
+		t.Fatalf("expected both project URLs in Source fields: p42=%v p99=%v", sawP42, sawP99)
+	}
+}
+
+func TestGitLabSource_Unauthorized(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(401)
+		_, _ = w.Write([]byte(`{"message":"401 Unauthorized"}`))
+	}))
+	defer srv.Close()
+
+	s := &GitLabSource{
+		Token:    "bad",
+		BaseURL:  srv.URL,
+		Registry: gitlabTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+	}
+	out := make(chan recon.Finding, 4)
+	err := s.Sweep(context.Background(), "", out)
+	close(out)
+	if err == nil {
+		t.Fatalf("expected error, got nil")
+	}
+	if !errors.Is(err, ErrUnauthorized) { // errors.Is, because Sweep wraps the client error with %w
+		t.Fatalf("expected ErrUnauthorized, got %v", err)
+	}
+}
+
+func TestGitLabSource_CtxCancellation(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		select {
+		case <-r.Context().Done():
+			return
+		case <-time.After(2 * time.Second): // far longer than the 50ms deadline set below
+			w.WriteHeader(200)
+			_, _ = w.Write([]byte("[]"))
+		}
+	}))
+	defer srv.Close()
+
+	s := &GitLabSource{
+		Token:    "glpat-x",
+		BaseURL:  srv.URL,
+		Registry: gitlabTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
+	defer cancel()
+	out := make(chan recon.Finding, 4)
+	err := s.Sweep(ctx, "", out)
+	close(out)
+	if err == nil { // only non-nil is required; the error's kind is not asserted
+		t.Fatalf("expected ctx error, got nil")
+	}
+}
+
+func TestGitLabSource_InterfaceAssertion(t *testing.T) {
+	var _ recon.ReconSource = (*GitLabSource)(nil) // checked at compile time; nothing runs
+}
+
+func gitlabDrain(ch <-chan recon.Finding) []recon.Finding { // reads ch until it is closed
+	var out []recon.Finding
+	for f := range ch {
+		out = append(out, f)
+	}
+	return out
+}