From 0373931490e3d3b38fe89570630d485804c7f0fe Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Mon, 6 Apr 2026 00:42:33 +0300
Subject: [PATCH] feat(09-04): implement RobotsCache with 1h per-host TTL

- Parses robots.txt via temoto/robotstxt
- Caches per host for 1 hour; second call within TTL skips HTTP fetch
- Default-allow on network/parse/4xx/5xx errors
- Matches 'keyhunter' user-agent against disallowed paths
- Client field allows httptest injection

Satisfies RECON-INFRA-07.
---
 pkg/recon/robots.go | 95 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 pkg/recon/robots.go

diff --git a/pkg/recon/robots.go b/pkg/recon/robots.go
new file mode 100644
index 0000000..74c6515
--- /dev/null
+++ b/pkg/recon/robots.go

package recon

import (
	"context"
	"io"
	"net/http"
	"net/url"
	"sync"
	"time"

	"github.com/temoto/robotstxt"
)

const (
	// robotsTTL is how long a fetched robots.txt result is trusted
	// before it is re-fetched for a host.
	robotsTTL = 1 * time.Hour
	// robotsUA is the user-agent token matched against robots.txt
	// group records when testing whether a path is allowed.
	robotsUA = "keyhunter"
)

// robotsEntry is one cached robots.txt parse result for a host,
// together with the time it was fetched (used for TTL expiry).
type robotsEntry struct {
	data    *robotstxt.RobotsData
	fetched time.Time
}

// RobotsCache fetches and caches per-host robots.txt for 1 hour.
// Sources whose RespectsRobots() returns true should call Allowed before each request.
//
// On fetch or parse failure, Allowed returns true (default-allow) to avoid silently
// disabling recon sources when a target site has a broken robots endpoint.
type RobotsCache struct {
	// mu guards cache; Allowed reads and writes entries under it.
	mu    sync.Mutex
	cache map[string]robotsEntry

	// Client is the HTTP client used to fetch robots.txt. If nil,
	// http.DefaultClient is used. Tests inject httptest.Server.Client().
	Client *http.Client
}

// NewRobotsCache returns an empty cache ready for use.
func NewRobotsCache() *RobotsCache {
	return &RobotsCache{cache: make(map[string]robotsEntry)}
}

// Allowed reports whether the `keyhunter` user-agent may fetch rawURL per the
// host's robots.txt. Results are cached per-host for 1 hour.
+func (rc *RobotsCache) Allowed(ctx context.Context, rawURL string) (bool, error) { + u, err := url.Parse(rawURL) + if err != nil || u.Host == "" { + return true, nil + } + host := u.Host + + rc.mu.Lock() + entry, ok := rc.cache[host] + rc.mu.Unlock() + if ok && time.Since(entry.fetched) < robotsTTL { + return entry.data.TestAgent(u.Path, robotsUA), nil + } + + client := rc.Client + if client == nil { + client = http.DefaultClient + } + + robotsURL := u.Scheme + "://" + host + "/robots.txt" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil) + if err != nil { + return true, nil + } + + resp, err := client.Do(req) + if err != nil { + return true, nil + } + defer resp.Body.Close() + + if resp.StatusCode >= 400 { + return true, nil + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return true, nil + } + + data, err := robotstxt.FromBytes(body) + if err != nil { + return true, nil + } + + rc.mu.Lock() + rc.cache[host] = robotsEntry{data: data, fetched: time.Now()} + rc.mu.Unlock() + + return data.TestAgent(u.Path, robotsUA), nil +}