- GistSource implements recon.ReconSource (RECON-CODE-04) - Lists /gists/public?per_page=100, fetches each file's raw content, scans against provider keyword set, emits one Finding per matching gist - Disabled when GitHub token empty - Rate: rate.Every(2s), burst 1 (30 req/min GitHub limit) - 256KB read cap per file; skips gists without keyword matches - httptest coverage: enable gating, sweep match, no-match, 401, ctx cancel
185 lines
4.7 KiB
Go
185 lines
4.7 KiB
Go
package sources
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"sort"
	"strings"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)
|
|
|
|
// GistSource scans recent public GitHub Gists for provider keyword leaks
|
|
// (RECON-CODE-04).
|
|
//
|
|
// GitHub does not expose a dedicated /search/gists endpoint, so this source
|
|
// enumerates /gists/public (most-recent page) and fetches each file's raw URL
|
|
// to scan its content against the provider keyword set. Keep Phase 10 minimal:
|
|
// only the first page is walked; broader sweeps are a future optimization.
|
|
//
|
|
// Auth: GitHub token via Bearer header. Rate: 30 req/min (shared with GitHub
|
|
// search limits) → rate.Every(2s), burst 1.
|
|
type GistSource struct {
|
|
Token string
|
|
BaseURL string
|
|
Registry *providers.Registry
|
|
Limiters *recon.LimiterRegistry
|
|
|
|
client *Client
|
|
}
|
|
|
|
// Compile-time check that *GistSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*GistSource)(nil)
|
|
|
|
// Name returns the stable source identifier.
|
|
func (s *GistSource) Name() string {
	// Fixed identifier; Sweep also passes it to the limiter registry.
	const id = "gist"
	return id
}
|
|
|
|
// RateLimit reports the per-source token bucket rate (30/min).
|
|
func (s *GistSource) RateLimit() rate.Limit {
	// One token every two seconds, i.e. 30 requests per minute.
	const interval = 2 * time.Second
	return rate.Every(interval)
}
|
|
|
|
// Burst reports the token bucket burst capacity.
|
|
func (s *GistSource) Burst() int {
	// Bucket capacity of a single token.
	const capacity = 1
	return capacity
}
|
|
|
|
// RespectsRobots reports whether robots.txt applies (REST API → false).
|
|
func (s *GistSource) RespectsRobots() bool {
	// The source talks to a REST API, so robots.txt does not apply.
	const respects = false
	return respects
}
|
|
|
|
// Enabled reports whether the source runs. Requires a GitHub token.
|
|
func (s *GistSource) Enabled(_ recon.Config) bool {
	// The config argument is not consulted; only a non-empty token matters.
	return len(s.Token) > 0
}
|
|
|
|
// gistListEntry is the subset of one element of the /gists/public JSON array
// that Sweep decodes.
type gistListEntry struct {
	// HTMLURL is the gist's html_url; Sweep reports it as the Finding source.
	HTMLURL string `json:"html_url"`
	// Files mirrors the gist's "files" object, one entry per key.
	Files map[string]struct {
		Filename string `json:"filename"`
		// RawURL is fetched to obtain the file content; entries where it is
		// empty are skipped by Sweep.
		RawURL string `json:"raw_url"`
	} `json:"files"`
}
|
|
|
|
// Sweep fetches /gists/public, scans each file's raw content against the
|
|
// keyword set from the registry, and emits one Finding per gist that matches
|
|
// any keyword (not one per file — gists often split a single leak across
|
|
// helper files).
|
|
func (s *GistSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
|
if s.client == nil {
|
|
s.client = NewClient()
|
|
}
|
|
base := s.BaseURL
|
|
if base == "" {
|
|
base = "https://api.github.com"
|
|
}
|
|
|
|
keywords := s.keywordSet()
|
|
if len(keywords) == 0 {
|
|
return nil
|
|
}
|
|
|
|
if s.Limiters != nil {
|
|
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
listReq, err := http.NewRequest(http.MethodGet, base+"/gists/public?per_page=100", nil)
|
|
if err != nil {
|
|
return fmt.Errorf("gist: build list request: %w", err)
|
|
}
|
|
listReq.Header.Set("Authorization", "Bearer "+s.Token)
|
|
listReq.Header.Set("Accept", "application/vnd.github+json")
|
|
|
|
listResp, err := s.client.Do(ctx, listReq)
|
|
if err != nil {
|
|
return fmt.Errorf("gist: list: %w", err)
|
|
}
|
|
var gists []gistListEntry
|
|
dec := json.NewDecoder(listResp.Body)
|
|
decodeErr := dec.Decode(&gists)
|
|
_ = listResp.Body.Close()
|
|
if decodeErr != nil {
|
|
return fmt.Errorf("gist: decode list: %w", decodeErr)
|
|
}
|
|
|
|
for _, g := range gists {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
matched := false
|
|
var matchedProvider string
|
|
|
|
fileLoop:
|
|
for _, f := range g.Files {
|
|
if f.RawURL == "" {
|
|
continue
|
|
}
|
|
if s.Limiters != nil {
|
|
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
rawReq, err := http.NewRequest(http.MethodGet, f.RawURL, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("gist: build raw request: %w", err)
|
|
}
|
|
rawReq.Header.Set("Authorization", "Bearer "+s.Token)
|
|
rawResp, err := s.client.Do(ctx, rawReq)
|
|
if err != nil {
|
|
return fmt.Errorf("gist: fetch raw: %w", err)
|
|
}
|
|
// Cap read to 256KB to avoid pathological gists.
|
|
body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
|
|
_ = rawResp.Body.Close()
|
|
if readErr != nil {
|
|
return fmt.Errorf("gist: read raw: %w", readErr)
|
|
}
|
|
|
|
content := string(body)
|
|
for kw, provName := range keywords {
|
|
if strings.Contains(content, kw) {
|
|
matched = true
|
|
matchedProvider = provName
|
|
break fileLoop
|
|
}
|
|
}
|
|
}
|
|
|
|
if matched {
|
|
select {
|
|
case out <- recon.Finding{
|
|
ProviderName: matchedProvider,
|
|
Source: g.HTMLURL,
|
|
SourceType: "recon:gist",
|
|
DetectedAt: time.Now().UTC(),
|
|
}:
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// keywordSet flattens the registry into a keyword→providerName map for
|
|
// content scanning. Empty keywords are skipped.
|
|
func (s *GistSource) keywordSet() map[string]string {
|
|
out := make(map[string]string)
|
|
if s.Registry == nil {
|
|
return out
|
|
}
|
|
for _, p := range s.Registry.List() {
|
|
for _, k := range p.Keywords {
|
|
if k == "" {
|
|
continue
|
|
}
|
|
if _, ok := out[k]; !ok {
|
|
out[k] = p.Name
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|