From 35fa4ad1746076ed67fcf64a381bf376e117a519 Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 16:45:23 +0300 Subject: [PATCH] feat(16-01): add URLhaus recon source - URLhausSource searches abuse.ch URLhaus API for malicious URLs with API keys - Credentialless source (Enabled always true, no API key needed) - Tag lookup with payload endpoint fallback - ciLogKeyPattern used for content matching - Tests with httptest mocks for happy path and empty results Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/recon/sources/urlhaus.go | 152 ++++++++++++++++++++++++++++++ pkg/recon/sources/urlhaus_test.go | 119 +++++++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 pkg/recon/sources/urlhaus.go create mode 100644 pkg/recon/sources/urlhaus_test.go diff --git a/pkg/recon/sources/urlhaus.go b/pkg/recon/sources/urlhaus.go new file mode 100644 index 0000000..a0e128a --- /dev/null +++ b/pkg/recon/sources/urlhaus.go @@ -0,0 +1,152 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// URLhausSource searches the abuse.ch URLhaus API for malicious URLs that +// contain API key patterns. Threat actors often embed stolen API keys in +// malware C2 URLs, phishing pages, and credential-harvesting infrastructure. +// URLhaus is free and unauthenticated — no API key required. +type URLhausSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*URLhausSource)(nil) + +func (s *URLhausSource) Name() string { return "urlhaus" } +func (s *URLhausSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) } +func (s *URLhausSource) Burst() int { return 2 } +func (s *URLhausSource) RespectsRobots() bool { return false } +func (s *URLhausSource) Enabled(_ recon.Config) bool { return true } + +// urlhausResponse represents the URLhaus API response for tag/payload lookups. +type urlhausResponse struct { + QueryStatus string `json:"query_status"` + URLs []urlhausEntry `json:"urls"` +} + +// urlhausEntry is a single URL record from URLhaus. +type urlhausEntry struct { + URL string `json:"url"` + URLStatus string `json:"url_status"` + Tags []string `json:"tags"` + Reporter string `json:"reporter"` +} + +func (s *URLhausSource) Sweep(ctx context.Context, query string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://urlhaus-api.abuse.ch/v1" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "urlhaus") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + // Try tag lookup first. + tagURL := fmt.Sprintf("%s/tag/%s/", base, url.PathEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, tagURL, nil) + if err != nil { + continue + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + + resp, err := client.Do(ctx, req) + if err != nil { + // Fallback to payload endpoint on tag lookup failure. + resp, err = s.payloadFallback(ctx, client, base, q) + if err != nil { + continue + } + } + + data, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024)) + _ = resp.Body.Close() + if err != nil { + continue + } + + var result urlhausResponse + if err := json.Unmarshal(data, &result); err != nil { + continue + } + + // If tag lookup returned no results, try payload fallback. + if result.QueryStatus != "ok" || len(result.URLs) == 0 { + resp, err = s.payloadFallback(ctx, client, base, q) + if err != nil { + continue + } + + data, err = io.ReadAll(io.LimitReader(resp.Body, 512*1024)) + _ = resp.Body.Close() + if err != nil { + continue + } + + if err := json.Unmarshal(data, &result); err != nil { + continue + } + } + + for _, entry := range result.URLs { + // Stringify the record and check for key patterns. + record := fmt.Sprintf("url=%s status=%s tags=%v reporter=%s", + entry.URL, entry.URLStatus, entry.Tags, entry.Reporter) + if ciLogKeyPattern.MatchString(record) || ciLogKeyPattern.MatchString(entry.URL) { + out <- recon.Finding{ + ProviderName: q, + Source: entry.URL, + SourceType: "recon:urlhaus", + Confidence: "medium", + DetectedAt: time.Now(), + } + } + } + } + return nil +} + +// payloadFallback tries the URLhaus payload endpoint as a secondary search method. +func (s *URLhausSource) payloadFallback(ctx context.Context, client *Client, base, tag string) (*http.Response, error) { + payloadURL := fmt.Sprintf("%s/payload/", base) + body := fmt.Sprintf("md5_hash=&sha256_hash=&tag=%s", url.QueryEscape(tag)) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, payloadURL, strings.NewReader(body)) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + return client.Do(ctx, req) +} diff --git a/pkg/recon/sources/urlhaus_test.go b/pkg/recon/sources/urlhaus_test.go new file mode 100644 index 0000000..ad287fc --- /dev/null +++ b/pkg/recon/sources/urlhaus_test.go @@ -0,0 +1,119 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func TestURLhaus_Name(t *testing.T) { + s := &URLhausSource{} + if s.Name() != "urlhaus" { + t.Fatalf("expected urlhaus, got %s", s.Name()) + } +} + +func TestURLhaus_Enabled(t *testing.T) { + s := &URLhausSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("URLhausSource should always be enabled (credentialless)") + } +} + +func TestURLhaus_Sweep(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/tag/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "query_status": "ok", + "urls": [{ + "url": "https://evil.example.com/exfil?token=sk-proj-ABCDEF1234567890abcdef", + "url_status": "online", + "tags": ["malware", "api_key"], + "reporter": "abuse_ch" + }] + }`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &URLhausSource{ + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) == 0 { + t.Fatal("expected at least one finding from URLhaus") + } + if findings[0].SourceType != "recon:urlhaus" { + t.Fatalf("expected recon:urlhaus, got %s", findings[0].SourceType) + } +} + +func TestURLhaus_Sweep_Empty(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/tag/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"query_status": "no_results", "urls": []}`)) + }) + mux.HandleFunc("/payload/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"query_status": "no_results", "urls": []}`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + + s := &URLhausSource{ + BaseURL: srv.URL, + Registry: reg, + Client: NewClient(), + } + + out := make(chan recon.Finding, 10) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := s.Sweep(ctx, "", out) + close(out) + if err != nil { + t.Fatalf("Sweep error: %v", err) + } + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 0 { + t.Fatalf("expected no findings, got %d", len(findings)) + } +}