Files
keyhunter/pkg/recon/sources/register.go
salvacybersec c5332454b0 feat(14-02): add WaybackMachine + CommonCrawl recon sources
- WaybackMachineSource queries CDX API for historical snapshots
- CommonCrawlSource queries CC Index API for matching pages
- Both credentialless, rate-limited at 1 req/5s, RespectsRobots=true
- RegisterAll extended to 42 sources (40 Phase 10-13 + 2 Phase 14)
- Full httptest-based test coverage for both sources
2026-04-06 13:16:13 +03:00

237 lines
6.5 KiB
Go

package sources
import (
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// SourcesConfig bundles the per-source credentials and the shared
// dependencies that cmd/recon.go reads from viper/env and passes on to
// RegisterAll.
//
// Every field originates from an environment variable (GITHUB_TOKEN,
// GITLAB_TOKEN, ...) or a viper config key (recon.github.token, ...). Leaving
// a value empty is allowed: the matching source is still registered on the
// engine, but its Enabled() reports false, so SweepAll skips it cleanly.
type SourcesConfig struct {
	// GitHubToken is used by both the GitHub and the Gist source.
	GitHubToken string

	// GitLabToken is a GitLab personal access token.
	GitLabToken string

	// BitbucketToken is a Bitbucket Cloud app password or OAuth token;
	// BitbucketWorkspace is the required workspace slug that goes with it.
	BitbucketToken, BitbucketWorkspace string

	// CodebergToken is an optional Codeberg (Gitea) token; when present it
	// raises the rate limit.
	CodebergToken string

	// HuggingFaceToken is an optional HuggingFace Hub token; when present it
	// raises the rate limit.
	HuggingFaceToken string

	// KaggleUser and KaggleKey are the Kaggle Basic-auth username and API key.
	KaggleUser, KaggleKey string

	// GoogleAPIKey and GoogleCX are the Google Custom Search API key and the
	// search engine ID (CX).
	GoogleAPIKey, GoogleCX string

	// BingAPIKey is the Bing Web Search API subscription key.
	BingAPIKey string

	// YandexUser and YandexAPIKey are the Yandex XML Search user and API key.
	YandexUser, YandexAPIKey string

	// BraveAPIKey is the Brave Search API subscription token.
	BraveAPIKey string

	// Phase 12: IoT scanner API credentials.

	// ShodanAPIKey is handed to the Shodan source.
	ShodanAPIKey string
	// CensysAPIId and CensysAPISecret are handed to the Censys source.
	CensysAPIId, CensysAPISecret string
	// ZoomEyeAPIKey is handed to the ZoomEye source.
	ZoomEyeAPIKey string
	// FOFAEmail and FOFAAPIKey are handed to the FOFA source.
	FOFAEmail, FOFAAPIKey string
	// NetlasAPIKey is handed to the Netlas source.
	NetlasAPIKey string
	// BinaryEdgeAPIKey is handed to the BinaryEdge source.
	BinaryEdgeAPIKey string

	// Registry drives query generation for every source via BuildQueries.
	Registry *providers.Registry

	// Limiters is the shared per-source rate-limiter registry.
	Limiters *recon.LimiterRegistry
}
// RegisterAll attaches the complete source catalog to engine, 42 sources in
// total: Phase 10 code hosting, Phase 11 search engines and paste sites,
// Phase 12 IoT scanners and cloud storage, Phase 13 package registries,
// containers and IaC, and Phase 14 web archives.
//
// Registration is unconditional so that cmd/recon.go can show the whole
// catalog through `keyhunter recon list` no matter which credentials are set.
// A source lacking its required credentials returns Enabled()==false, which
// lets SweepAll skip it without erroring.
//
// Passing a nil engine is a no-op rather than an error, so callers on a
// broken init path do not panic.
func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
	if engine == nil {
		// Nothing to register on.
		return
	}

	// Shared dependencies handed to every source.
	registry, limiters := cfg.Registry, cfg.Limiters

	// API sources built through their constructors.
	engine.Register(NewGitHubSource(cfg.GitHubToken, registry, limiters))
	engine.Register(NewKaggleSource(cfg.KaggleUser, cfg.KaggleKey, registry, limiters))
	hfCfg := HuggingFaceConfig{
		Registry: registry,
		Limiters: limiters,
		Token:    cfg.HuggingFaceToken,
	}
	engine.Register(NewHuggingFaceSource(hfCfg))

	// API sources built as struct literals (no New* constructor in Wave 2).
	engine.Register(&GitLabSource{
		Registry: registry,
		Limiters: limiters,
		Token:    cfg.GitLabToken,
	})
	engine.Register(&BitbucketSource{
		Registry:  registry,
		Limiters:  limiters,
		Token:     cfg.BitbucketToken,
		Workspace: cfg.BitbucketWorkspace,
	})
	// Gist reuses the GitHub token.
	engine.Register(&GistSource{
		Registry: registry,
		Limiters: limiters,
		Token:    cfg.GitHubToken,
	})
	engine.Register(&CodebergSource{
		Registry: registry,
		Limiters: limiters,
		Token:    cfg.CodebergToken,
	})

	// Credentialless scraping sources.
	engine.Register(&ReplitSource{Registry: registry, Limiters: limiters})
	engine.Register(&CodeSandboxSource{Registry: registry, Limiters: limiters})
	engine.Register(&SandboxesSource{Registry: registry, Limiters: limiters})

	// Phase 11: search engine dorking.
	engine.Register(&GoogleDorkSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.GoogleAPIKey,
		CX:       cfg.GoogleCX,
	})
	engine.Register(&BingDorkSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.BingAPIKey,
	})
	engine.Register(&DuckDuckGoSource{Registry: registry, Limiters: limiters})
	engine.Register(&YandexSource{
		Registry: registry,
		Limiters: limiters,
		User:     cfg.YandexUser,
		APIKey:   cfg.YandexAPIKey,
	})
	engine.Register(&BraveSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.BraveAPIKey,
	})

	// Phase 11: paste sites.
	engine.Register(&PastebinSource{Registry: registry, Limiters: limiters})
	engine.Register(&GistPasteSource{Registry: registry, Limiters: limiters})
	engine.Register(&PasteSitesSource{Registry: registry, Limiters: limiters})

	// Phase 12: IoT scanners.
	engine.Register(&ShodanSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.ShodanAPIKey,
	})
	engine.Register(&CensysSource{
		Registry:  registry,
		Limiters:  limiters,
		APIId:     cfg.CensysAPIId,
		APISecret: cfg.CensysAPISecret,
	})
	engine.Register(&ZoomEyeSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.ZoomEyeAPIKey,
	})
	engine.Register(&FOFASource{
		Registry: registry,
		Limiters: limiters,
		Email:    cfg.FOFAEmail,
		APIKey:   cfg.FOFAAPIKey,
	})
	engine.Register(&NetlasSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.NetlasAPIKey,
	})
	engine.Register(&BinaryEdgeSource{
		Registry: registry,
		Limiters: limiters,
		APIKey:   cfg.BinaryEdgeAPIKey,
	})

	// Phase 12: credentialless cloud storage scanners.
	engine.Register(&S3Scanner{Registry: registry, Limiters: limiters})
	engine.Register(&GCSScanner{Registry: registry, Limiters: limiters})
	engine.Register(&AzureBlobScanner{Registry: registry, Limiters: limiters})
	engine.Register(&DOSpacesScanner{Registry: registry, Limiters: limiters})

	// Phase 13: credentialless package registries.
	engine.Register(&NpmSource{Registry: registry, Limiters: limiters})
	engine.Register(&PyPISource{Registry: registry, Limiters: limiters})
	engine.Register(&CratesIOSource{Registry: registry, Limiters: limiters})
	engine.Register(&RubyGemsSource{Registry: registry, Limiters: limiters})
	engine.Register(&MavenSource{Registry: registry, Limiters: limiters})
	engine.Register(&NuGetSource{Registry: registry, Limiters: limiters})
	engine.Register(&GoProxySource{Registry: registry, Limiters: limiters})
	engine.Register(&PackagistSource{Registry: registry, Limiters: limiters})

	// Phase 13: credentialless container and IaC sources.
	engine.Register(&DockerHubSource{Registry: registry, Limiters: limiters})
	engine.Register(&KubernetesSource{Registry: registry, Limiters: limiters})
	engine.Register(&TerraformSource{Registry: registry, Limiters: limiters})
	engine.Register(&HelmSource{Registry: registry, Limiters: limiters})

	// Phase 14: credentialless web archive sources.
	engine.Register(&WaybackMachineSource{Registry: registry, Limiters: limiters})
	engine.Register(&CommonCrawlSource{Registry: registry, Limiters: limiters})
}