package sources import ( "context" "encoding/json" "fmt" "net/http" "net/url" "strings" "time" "golang.org/x/time/rate" "github.com/salvacybersec/keyhunter/pkg/providers" "github.com/salvacybersec/keyhunter/pkg/recon" ) // GitLabSource implements recon.ReconSource against GitLab's Search API // (/api/v4/search?scope=blobs). It honors PRIVATE-TOKEN header auth and the // published 2000 req/min rate limit. Sweep iterates BuildQueries(reg, "gitlab") // — one request per keyword-derived query — and emits one Finding per returned // blob, with Source pointing at the blob's project/ref/path URL. // // RECON-CODE-02. type GitLabSource struct { // Token is the GitLab personal access token. When empty the source is // disabled (Enabled returns false, Sweep is a no-op). Token string // BaseURL is the GitLab instance root. Defaults to https://gitlab.com. BaseURL string // Registry drives query generation via BuildQueries and provider name // mapping for emitted findings. Registry *providers.Registry // Limiters is the shared per-source rate-limiter registry. Limiters *recon.LimiterRegistry // client is the retry-aware HTTP wrapper. A nil client is replaced with // NewClient() lazily inside Sweep so zero-value construction works. client *Client } // Compile-time interface assertion. var _ recon.ReconSource = (*GitLabSource)(nil) // Name returns the stable source identifier. func (s *GitLabSource) Name() string { return "gitlab" } // RateLimit returns ~2000 req/min (one token every 30ms). func (s *GitLabSource) RateLimit() rate.Limit { return rate.Every(30 * time.Millisecond) } // Burst allows short bursts of 5 requests. func (s *GitLabSource) Burst() int { return 5 } // RespectsRobots returns false: this source uses an authenticated REST API, // not HTML scraping. func (s *GitLabSource) RespectsRobots() bool { return false } // Enabled reports whether a token is configured. func (s *GitLabSource) Enabled(_ recon.Config) bool { return strings.TrimSpace(s.Token) != "" } // glBlob is the subset of the GitLab Search API blob response we consume. type glBlob struct { Basename string `json:"basename"` Data string `json:"data"` Path string `json:"path"` ProjectID int `json:"project_id"` Ref string `json:"ref"` Startline int `json:"startline"` } // Sweep runs the GitLab blob search for every keyword-derived query and emits // one Finding per blob. It returns nil when the source is disabled (empty // token) so callers can safely skip without special-casing. func (s *GitLabSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { if !s.Enabled(recon.Config{}) { return nil } if s.client == nil { s.client = NewClient() } base := strings.TrimRight(s.BaseURL, "/") if base == "" { base = "https://gitlab.com" } limiters := s.Limiters if limiters == nil { limiters = recon.NewLimiterRegistry() } queries := BuildQueries(s.Registry, "gitlab") if len(queries) == 0 { return nil } kwIndex := gitlabKeywordIndex(s.Registry) for _, q := range queries { if err := ctx.Err(); err != nil { return err } if err := limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } endpoint := fmt.Sprintf("%s/api/v4/search?scope=blobs&search=%s&per_page=20", base, url.QueryEscape(q)) req, err := http.NewRequest(http.MethodGet, endpoint, nil) if err != nil { return fmt.Errorf("gitlab: build request: %w", err) } req.Header.Set("PRIVATE-TOKEN", s.Token) req.Header.Set("Accept", "application/json") resp, err := s.client.Do(ctx, req) if err != nil { return fmt.Errorf("gitlab: %w", err) } var blobs []glBlob decErr := json.NewDecoder(resp.Body).Decode(&blobs) _ = resp.Body.Close() if decErr != nil { return fmt.Errorf("gitlab: decode: %w", decErr) } // For "gitlab", BuildQueries emits bare keywords, so a direct map // lookup recovers the provider name for each query. provName := kwIndex[q] if provName == "" { provName = "unknown" } for _, b := range blobs { sourceURL := fmt.Sprintf("%s/projects/%d/-/blob/%s/%s", base, b.ProjectID, b.Ref, b.Path) finding := recon.Finding{ ProviderName: provName, Confidence: "low", Source: sourceURL, SourceType: "recon:gitlab", LineNumber: b.Startline, DetectedAt: time.Now().UTC(), } select { case out <- finding: case <-ctx.Done(): return ctx.Err() } } } return nil } // gitlabKeywordIndex maps each provider keyword back to its provider name for // Finding.ProviderName population. A name prefixed with "gitlab" avoids // colliding with the shared keywordIndex helper introduced by peer sources // (github.go) in the same package. func gitlabKeywordIndex(reg *providers.Registry) map[string]string { idx := make(map[string]string) if reg == nil { return idx } for _, p := range reg.List() { for _, kw := range p.Keywords { if kw == "" { continue } if _, exists := idx[kw]; !exists { idx[kw] = p.Name } } } return idx }