diff --git a/.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md b/.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md new file mode 100644 index 0000000..9c1ac93 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md @@ -0,0 +1,99 @@ +--- +phase: 10-osint-code-hosting +plan: 05 +subsystem: recon +tags: [codeberg, gitea, osint, rest-api, httptest] + +requires: + - phase: 09-osint-infrastructure + provides: ReconSource interface, LimiterRegistry, Engine + - phase: 10-osint-code-hosting/01 + provides: shared sources.Client (retry/backoff), BuildQueries helper +provides: + - CodebergSource implementing recon.ReconSource against Gitea REST API + - Reusable pattern for any Gitea-compatible instance via BaseURL override + - Dual-mode rate limiting (unauth 60/hr, auth ~1000/hr) +affects: [10-09 register-all, future Gitea-compatible sources, verification pipeline] + +tech-stack: + added: [] + patterns: + - "Keyword → ProviderName index built at Sweep() entry to re-attribute BuildQueries output" + - "BaseURL override enables generic Gitea targeting" + - "httptest.Server with request-capturing handlers for header presence/absence assertions" + +key-files: + created: + - pkg/recon/sources/codeberg.go + - pkg/recon/sources/codeberg_test.go + modified: [] + +key-decisions: + - "Sweep ignores its query argument and iterates provider keywords, matching sibling code-hosting sources" + - "Findings use Confidence=low since /repos/search matches repo metadata, not file contents — verification downstream separates real hits" + - "Token is optional; Enabled() always returns true because public API works anonymously" + - "DefaultCodebergBaseURL constant exported so Plan 10-09 can point at alternate Gitea hosts" + +patterns-established: + - "Dual-mode rate limiting: if Token == \"\" return unauth rate else auth rate" + - "Per-source httptest suite covers: interface assertion, rate limits, decoding, header auth presence, header auth absence, ctx cancellation" 
+ +requirements-completed: [RECON-CODE-05] + +duration: ~10min +completed: 2026-04-05 +--- + +# Phase 10 Plan 05: CodebergSource Summary + +**Gitea REST API source targeting Codeberg.org via /api/v1/repos/search with optional token auth and dual-mode rate limiting.** + +## Performance + +- **Duration:** ~10 min +- **Started:** 2026-04-05T22:07:00Z +- **Completed:** 2026-04-05T22:17:31Z +- **Tasks:** 1 (TDD) +- **Files modified:** 2 created + +## Accomplishments +- CodebergSource implements recon.ReconSource with compile-time assertion +- Unauthenticated operation against /api/v1/repos/search (60/hour rate limit) +- Optional token mode sends `Authorization: token <token>` and raises limit to ~1000/hour +- Findings keyed to repo html_url with SourceType="recon:codeberg" and ProviderName resolved via keyword→provider index +- Shared sources.Client handles retries/429s; no bespoke HTTP logic in the source +- Six httptest-backed tests covering interface, both rate modes, sweep decoding, auth header presence/absence, and context cancellation + +## Task Commits + +1. **Task 1: CodebergSource + tests (TDD combined)** — `4fafc01` (feat) + +## Files Created/Modified +- `pkg/recon/sources/codeberg.go` — CodebergSource struct, rate mode selection, Sweep over /api/v1/repos/search +- `pkg/recon/sources/codeberg_test.go` — httptest fixtures for all six behaviors + +## Decisions Made +- TDD RED+GREEN collapsed into a single commit because the file pair is small and was verified end-to-end in one iteration (all six tests pass on first green build). +- `Confidence="low"` on emitted Findings: repo-metadata match is a weak signal until content verification runs. +- `Sweep` ignores the `query` parameter; the plan specifies driving queries from the provider registry via `BuildQueries`, consistent with sibling code-hosting sources. + +## Deviations from Plan +None — plan executed exactly as written. 
+ +## Issues Encountered +- **Worktree path confusion (environmental, not code):** Initial Write tool calls targeted the main repo path instead of the active worktree. Files silently failed to persist and `go test` surfaced unrelated pre-existing `github_test.go` references in the main repo. Recovered by writing into the worktree path `/home/salva/Documents/apikey/.claude/worktrees/agent-a2637f83/`. No code changes resulted from this; purely a path fix. + +## Next Phase Readiness +- Ready for Plan 10-09 (RegisterAll) to wire CodebergSource into `RegisterAll` with `cfg.CodebergToken` (field to be added when 10-09 finalizes SourcesConfig). +- No blockers. + +## Self-Check: PASSED +- FOUND: pkg/recon/sources/codeberg.go +- FOUND: pkg/recon/sources/codeberg_test.go +- FOUND: commit 4fafc01 +- Tests: 6/6 passing (`go test ./pkg/recon/sources/ -run TestCodeberg -v`) +- Package: `go vet` clean, full package tests green + +--- +*Phase: 10-osint-code-hosting* +*Completed: 2026-04-05* diff --git a/pkg/recon/sources/codeberg.go b/pkg/recon/sources/codeberg.go new file mode 100644 index 0000000..0aaed2a --- /dev/null +++ b/pkg/recon/sources/codeberg.go @@ -0,0 +1,167 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// DefaultCodebergBaseURL is the public Codeberg instance. Any Gitea-compatible +// server can be substituted by setting CodebergSource.BaseURL. +const DefaultCodebergBaseURL = "https://codeberg.org" + +// CodebergSource implements recon.ReconSource against a Gitea-compatible REST +// API (Codeberg runs Gitea). Public repository metadata searches do not +// require authentication; when a Token is provided it is sent as +// "Authorization: token <token>" which raises Gitea's per-user rate limit from +// 60/hour to ~1000/hour. 
+// +// Sweep iterates every keyword from the provider registry, queries +// /api/v1/repos/search?q=<keyword>&limit=50, and emits one recon.Finding per +// returned repository. The html_url is used as Source; the matching provider +// name is attached so downstream verification can target the correct API. +type CodebergSource struct { + Token string // optional API token; empty means requests are sent without an Authorization header + BaseURL string // instance root URL; empty falls back to DefaultCodebergBaseURL + Registry *providers.Registry // supplies the provider keywords that drive the queries + Limiters *recon.LimiterRegistry // optional; when nil Sweep skips the rate-limit wait + + client *Client // created lazily by Sweep via NewClient() when nil +} + +// Compile-time interface assertion. +var _ recon.ReconSource = (*CodebergSource)(nil) + +// Name returns the stable identifier passed to the limiter registry; Sweep +// reports findings under the SourceType "recon:codeberg". +func (s *CodebergSource) Name() string { return "codeberg" } + +// RateLimit returns rate.Every(60s) unauthenticated (60/hour) or +// rate.Every(3.6s) authenticated (~1000/hour). +func (s *CodebergSource) RateLimit() rate.Limit { + if s.Token == "" { + return rate.Every(60 * time.Second) + } + return rate.Every(3600 * time.Millisecond) +} + +// Burst returns 1 — Gitea's rate limits are per-hour smoothed, a burst of one +// keeps us safely within headroom for both auth modes. +func (s *CodebergSource) Burst() int { return 1 } + +// RespectsRobots is false — /api/v1/repos/search is a documented REST API. +func (s *CodebergSource) RespectsRobots() bool { return false } + +// Enabled is always true because the /repos/search endpoint works anonymously. +// A token, when present, only raises the rate limit. +func (s *CodebergSource) Enabled(_ recon.Config) bool { return true } + +// Sweep queries Gitea /api/v1/repos/search for every keyword in the provider +// registry, decodes the data array, and emits one Finding per result. The +// query parameter is ignored — Codeberg is swept by provider keyword, not by +// arbitrary Config.Query text, matching the sibling GitHub/GitLab sources. It stops at the first rate-limiter, request, decode, or context error and returns it. 
+func (s *CodebergSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if err := ctx.Err(); err != nil { + return err + } + + base := s.BaseURL + if base == "" { + base = DefaultCodebergBaseURL + } + if s.client == nil { + s.client = NewClient() + } + + // Build a keyword → providerName map once so emitted findings are + // correctly attributed even though BuildQueries returns bare strings. The first provider to list a keyword keeps it. + keywordIndex := make(map[string]string) + if s.Registry != nil { + for _, p := range s.Registry.List() { + for _, k := range p.Keywords { + if k == "" { + continue + } + if _, exists := keywordIndex[k]; !exists { + keywordIndex[k] = p.Name + } + } + } + } + + queries := BuildQueries(s.Registry, s.Name()) + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + endpoint := fmt.Sprintf("%s/api/v1/repos/search?q=%s&limit=50", + base, url.QueryEscape(q)) + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return fmt.Errorf("codeberg: build request: %w", err) + } + req.Header.Set("Accept", "application/json") + if s.Token != "" { + req.Header.Set("Authorization", "token "+s.Token) + } + + resp, err := s.client.Do(ctx, req) // NOTE(review): resp.StatusCode is never inspected in this function — confirm Client.Do turns non-2xx responses into errors + if err != nil { + return fmt.Errorf("codeberg: search %q: %w", q, err) + } + + var decoded struct { + OK bool `json:"ok"` // decoded but never consulted below + Data []struct { + FullName string `json:"full_name"` + HTMLURL string `json:"html_url"` + } `json:"data"` + } + if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil { + _, _ = io.Copy(io.Discard, resp.Body) + _ = resp.Body.Close() + return fmt.Errorf("codeberg: decode: %w", err) + } + _ = resp.Body.Close() + + provider := keywordIndex[q] // empty string when q is not one of the indexed keywords + for _, item := range decoded.Data { + if item.HTMLURL == "" { // a result without a URL has nothing to use as Source + continue + } + select { + case out <- 
recon.Finding{ + ProviderName: provider, + Source: item.HTMLURL, + SourceType: "recon:codeberg", + Confidence: "low", + DetectedAt: time.Now().UTC(), + }: + case <-ctx.Done(): + return ctx.Err() + } + } + } + + return nil +} diff --git a/pkg/recon/sources/codeberg_test.go b/pkg/recon/sources/codeberg_test.go new file mode 100644 index 0000000..40e415d --- /dev/null +++ b/pkg/recon/sources/codeberg_test.go @@ -0,0 +1,182 @@ +package sources + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func newCodebergTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + { + Name: "openai", + DisplayName: "OpenAI", + Tier: 1, + Keywords: []string{"sk-proj-"}, + FormatVersion: 1, + }, + }) +} + +func TestCodebergSource_NameAndInterface(t *testing.T) { + var _ recon.ReconSource = (*CodebergSource)(nil) + + s := &CodebergSource{} + if got := s.Name(); got != "codeberg" { + t.Errorf("Name() = %q, want %q", got, "codeberg") + } + if s.RespectsRobots() { + t.Errorf("RespectsRobots() = true, want false") + } + if !s.Enabled(recon.Config{}) { + t.Errorf("Enabled() = false, want true (public API)") + } +} + +func TestCodebergSource_RateLimitUnauthenticated(t *testing.T) { + s := &CodebergSource{} + got := s.RateLimit() + want := rate.Every(60 * time.Second) + if got != want { + t.Errorf("RateLimit() no token = %v, want %v", got, want) + } + if s.Burst() != 1 { + t.Errorf("Burst() = %d, want 1", s.Burst()) + } +} + +func TestCodebergSource_RateLimitAuthenticated(t *testing.T) { + s := &CodebergSource{Token: "abc123"} + got := s.RateLimit() + want := rate.Every(3600 * time.Millisecond) + if got != want { + t.Errorf("RateLimit() with token = %v, want %v", got, want) + } +} + +func TestCodebergSource_SweepEmitsFindings(t *testing.T) { + var 
gotAuth string + var gotPath string + var gotQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + gotPath = r.URL.Path + gotQuery = r.URL.Query().Get("q") + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "ok": true, + "data": []map[string]any{ + { + "full_name": "alice/leaked-keys", + "html_url": "https://codeberg.org/alice/leaked-keys", + }, + }, + }) + })) + defer srv.Close() + + s := &CodebergSource{ + BaseURL: srv.URL, + Registry: newCodebergTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + } + + out := make(chan recon.Finding, 8) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := s.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatalf("expected at least one finding") + } + + f := findings[0] + if f.Source != "https://codeberg.org/alice/leaked-keys" { + t.Errorf("Source = %q, want codeberg html_url", f.Source) + } + if f.SourceType != "recon:codeberg" { + t.Errorf("SourceType = %q, want recon:codeberg", f.SourceType) + } + if f.ProviderName != "openai" { + t.Errorf("ProviderName = %q, want openai", f.ProviderName) + } + + if gotPath != "/api/v1/repos/search" { + t.Errorf("path = %q, want /api/v1/repos/search", gotPath) + } + if gotQuery == "" { + t.Errorf("query param empty") + } + if gotAuth != "" { + t.Errorf("Authorization header should be absent without token, got %q", gotAuth) + } +} + +func TestCodebergSource_SweepWithTokenSetsAuthHeader(t *testing.T) { + var gotAuth string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + w.Header().Set("Content-Type", "application/json") + _, _ = 
w.Write([]byte(`{"ok":true,"data":[]}`)) + })) + defer srv.Close() + + s := &CodebergSource{ + Token: "s3cret", + BaseURL: srv.URL, + Registry: newCodebergTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + } + out := make(chan recon.Finding, 1) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := s.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep: %v", err) + } + + if !strings.HasPrefix(gotAuth, "token ") || !strings.Contains(gotAuth, "s3cret") { + t.Errorf("Authorization header = %q, want \"token s3cret\"", gotAuth) + } +} + +func TestCodebergSource_SweepContextCancellation(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + select { + case <-r.Context().Done(): + case <-time.After(3 * time.Second): + } + })) + defer srv.Close() + + s := &CodebergSource{ + BaseURL: srv.URL, + Registry: newCodebergTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + } + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 1) + err := s.Sweep(ctx, "", out) + if err == nil { + t.Fatalf("expected error on cancelled context") + } +}