package recon

import (
	"context"
	"io"
	"net/http"
	"net/url"
	"sync"
	"time"

	"github.com/temoto/robotstxt"
)

const (
	// robotsTTL is how long a stored robots.txt is reused before Allowed
	// fetches it again.
	robotsTTL = 1 * time.Hour
	// robotsUA is the user-agent name that robots.txt rules are evaluated for.
	robotsUA = "keyhunter"
)

// robotsEntry is one cached robots.txt: the parsed rules together with the
// time at which they were stored in the cache.
type robotsEntry struct {
	data    *robotstxt.RobotsData
	fetched time.Time
}

// RobotsCache fetches and caches per-host robots.txt for 1 hour.
// Sources whose RespectsRobots() returns true should call Allowed before each request.
//
// On fetch or parse failure, Allowed returns true (default-allow) to avoid silently
// disabling recon sources when a target site has a broken robots endpoint.
//
// Use NewRobotsCache to obtain a cache whose internal map is initialized.
type RobotsCache struct {
	// mu guards cache, which maps a URL's host component to the robots.txt
	// entry most recently stored for that host.
	mu    sync.Mutex
	cache map[string]robotsEntry
	// Client is the HTTP client used to fetch robots.txt. If nil,
	// http.DefaultClient is used. Tests inject httptest.Server.Client().
	Client *http.Client
}

// NewRobotsCache returns an empty cache ready for use.
// The Client field is left nil, so http.DefaultClient is used unless the
// caller sets Client afterwards.
func NewRobotsCache() *RobotsCache {
	return &RobotsCache{cache: make(map[string]robotsEntry)}
}

// Allowed reports whether the `keyhunter` user-agent may fetch rawURL per the
// host's robots.txt. Results are cached per-host for 1 hour.
func (rc *RobotsCache) Allowed(ctx context.Context, rawURL string) (bool, error) { u, err := url.Parse(rawURL) if err != nil || u.Host == "" { return true, nil } host := u.Host rc.mu.Lock() entry, ok := rc.cache[host] rc.mu.Unlock() if ok && time.Since(entry.fetched) < robotsTTL { return entry.data.TestAgent(u.Path, robotsUA), nil } client := rc.Client if client == nil { client = http.DefaultClient } robotsURL := u.Scheme + "://" + host + "/robots.txt" req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil) if err != nil { return true, nil } resp, err := client.Do(req) if err != nil { return true, nil } defer resp.Body.Close() if resp.StatusCode >= 400 { return true, nil } body, err := io.ReadAll(resp.Body) if err != nil { return true, nil } data, err := robotstxt.FromBytes(body) if err != nil { return true, nil } rc.mu.Lock() rc.cache[host] = robotsEntry{data: data, fetched: time.Now()} rc.mu.Unlock() return data.TestAgent(u.Path, robotsUA), nil }