From fb6cb53975e8db70de851f10d621e2dda299d8b0 Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 01:14:52 +0300 Subject: [PATCH] feat(10-02): implement GitHubSource recon.ReconSource --- pkg/recon/sources/github.go | 199 ++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 pkg/recon/sources/github.go diff --git a/pkg/recon/sources/github.go b/pkg/recon/sources/github.go new file mode 100644 index 0000000..d6192cc --- /dev/null +++ b/pkg/recon/sources/github.go @@ -0,0 +1,199 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// GitHubSource implements recon.ReconSource against the GitHub Code Search +// REST API. It iterates provider keyword queries (via BuildQueries) and emits +// a recon.Finding for every /search/code item returned. +// +// RECON-CODE-01: refactors the logic from pkg/dorks/github.go (Phase 8's +// GitHubExecutor) into the Phase 10 recon framework. Retries/429/5xx handling +// is delegated to the shared sources.Client; per-source rate limiting is +// delegated to recon.LimiterRegistry. Registration with recon.Engine happens +// in Plan 10-09. +// +// A missing token disables the source — Sweep returns nil and Enabled reports +// false. This keeps GitHub optional in sweeps that only need credential-free +// sources (Codeberg, Replit, etc.). +// +// Token is sent as the Bearer credential on each request. BaseURL overrides +// the API root and Sweep falls back to https://api.github.com when it is +// empty. Registry supplies the provider keywords; Limiters, when non-nil, is +// waited on before every request. +type GitHubSource struct { + Token string + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + client *Client +} + +// NewGitHubSource constructs a GitHubSource pointing at api.github.com with +// the shared retry/backoff HTTP client. 
+func NewGitHubSource(token string, reg *providers.Registry, lim *recon.LimiterRegistry) *GitHubSource { + return &GitHubSource{ + Token: token, + BaseURL: "https://api.github.com", + Registry: reg, + Limiters: lim, + client: NewClient(), + } +} + +// Name returns the stable source identifier used by the engine and limiter +// registry. +func (s *GitHubSource) Name() string { return "github" } + +// RateLimit returns 1 request every 2 seconds — GitHub's authenticated code +// search endpoint allows 30 req/min. +func (s *GitHubSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } + +// Burst returns 1; GitHub code search does not tolerate bursts. +func (s *GitHubSource) Burst() int { return 1 } + +// RespectsRobots is false — this source talks to an authenticated REST API, +// not a scraped website. +func (s *GitHubSource) RespectsRobots() bool { return false } + +// Enabled returns true only when a token is configured. A missing token is not +// an error — the source is simply skipped. +func (s *GitHubSource) Enabled(_ recon.Config) bool { return s.Token != "" } + +// Sweep issues one /search/code request per provider keyword and emits a +// Finding for every item returned. The `query` parameter is ignored because +// GitHubSource builds its own queries from the provider registry; this keeps +// recon.Engine's common signature. 
+func (s *GitHubSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if s.Token == "" { + return nil + } + base := s.BaseURL + if base == "" { + base = "https://api.github.com" + } + + queries := BuildQueries(s.Registry, "github") + kwIndex := githubKeywordIndex(s.Registry) + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + endpoint := fmt.Sprintf("%s/search/code?q=%s&per_page=30", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return fmt.Errorf("github: build request: %w", err) + } + req.Header.Set("Accept", "application/vnd.github.v3.text-match+json") + req.Header.Set("Authorization", "Bearer "+s.Token) + req.Header.Set("User-Agent", "keyhunter-recon") + + resp, err := s.client.Do(ctx, req) + if err != nil { + // 401 → unauthorized, abort the whole sweep (token is bad). + if errors.Is(err, ErrUnauthorized) { + return err + } + // Ctx cancellation during retry backoff also aborts. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + // Transient failures: log-and-continue per Phase 10 context + // (sources downgrade, not abort the whole sweep). 
+ continue + } + + var parsed ghSearchResponse + decErr := json.NewDecoder(resp.Body).Decode(&parsed) + _ = resp.Body.Close() + if decErr != nil { + continue + } + + provName := kwIndex[strings.ToLower(extractGitHubKeyword(q))] + for _, it := range parsed.Items { + f := recon.Finding{ + ProviderName: provName, + Confidence: "low", + Source: it.HTMLURL, + SourceType: "recon:github", + DetectedAt: time.Now(), + } + select { + case out <- f: + case <-ctx.Done(): + return ctx.Err() + } + } + } + return nil +} + +// ghSearchResponse mirrors the subset of GitHub's /search/code JSON response +// that this source consumes. It is kept private to avoid cross-package +// coupling with pkg/dorks (which carries its own identical shapes). +type ghSearchResponse struct { + Items []ghCodeItem `json:"items"` +} + +type ghCodeItem struct { + HTMLURL string `json:"html_url"` + Repository ghRepository `json:"repository"` + TextMatches []ghTextMatchEntry `json:"text_matches"` +} + +type ghRepository struct { + FullName string `json:"full_name"` +} + +type ghTextMatchEntry struct { + Fragment string `json:"fragment"` +} + +// githubKeywordIndex maps every provider keyword (lowercased) to its provider +// name for Finding.ProviderName population. A nil registry returns an empty +// map. Named distinctly from any sibling source's helper to avoid symbol +// collisions across the sources package. +func githubKeywordIndex(reg *providers.Registry) map[string]string { + m := make(map[string]string) + if reg == nil { + return m + } + for _, p := range reg.List() { + for _, k := range p.Keywords { + kl := strings.ToLower(strings.TrimSpace(k)) + if kl == "" { + continue + } + if _, exists := m[kl]; !exists { + m[kl] = p.Name + } + } + } + return m +} + +// extractGitHubKeyword reverses BuildQueries("github", k) — it strips the +// surrounding quotes and the trailing ` in:file` qualifier. For any other +// shape it falls back to the trimmed input so new formats degrade safely. 
func extractGitHubKeyword(q string) string {
	// Trim first so padding around the query cannot hide the qualifier
	// suffix from the TrimSuffix below.
	s := strings.TrimSpace(q)
	s = strings.TrimSuffix(s, " in:file")
	s = strings.TrimSpace(s)

	// Each quote is stripped independently, so a half-quoted or unquoted
	// keyword still comes back in a usable form.
	s = strings.TrimPrefix(s, "\"")
	s = strings.TrimSuffix(s, "\"")

	// The keyword index stores its keys whitespace-trimmed; trimming here as
	// well keeps a padded keyword from silently missing its provider.
	return strings.TrimSpace(s)
}