feat(09-04): implement RobotsCache with 1h per-host TTL

- Parses robots.txt via temoto/robotstxt
- Caches per host for 1 hour; second call within TTL skips HTTP fetch
- Default-allow on network/parse/4xx/5xx errors
- Matches 'keyhunter' user-agent against disallowed paths
- Client field allows httptest injection

Satisfies RECON-INFRA-07.
This commit is contained in:
salvacybersec
2026-04-06 00:42:33 +03:00
parent 1d5d12740c
commit 0373931490

95
pkg/recon/robots.go Normal file
View File

@@ -0,0 +1,95 @@
package recon
import (
"context"
"io"
"net/http"
"net/url"
"sync"
"time"
"github.com/temoto/robotstxt"
)
const (
	// robotsTTL is how long a fetched robots.txt result is cached per host.
	robotsTTL = 1 * time.Hour
	// robotsUA is the user-agent token matched against robots.txt rules.
	robotsUA = "keyhunter"
)
// robotsEntry is one cached, parsed robots.txt for a single host.
type robotsEntry struct {
	data    *robotstxt.RobotsData // parsed robots rules for the host
	fetched time.Time             // fetch time; the entry is stale once robotsTTL has elapsed
}
// RobotsCache fetches and caches per-host robots.txt for 1 hour.
// Sources whose RespectsRobots() returns true should call Allowed before each request.
//
// On fetch or parse failure, Allowed returns true (default-allow) to avoid silently
// disabling recon sources when a target site has a broken robots endpoint.
//
// The zero value is not usable; construct with NewRobotsCache.
type RobotsCache struct {
	mu    sync.Mutex             // guards cache
	cache map[string]robotsEntry // parsed robots.txt keyed by host (host:port as parsed from the URL)
	// Client is the HTTP client used to fetch robots.txt. If nil,
	// http.DefaultClient is used. Tests inject httptest.Server.Client().
	Client *http.Client
}
// NewRobotsCache returns an empty cache ready for use.
func NewRobotsCache() *RobotsCache {
	rc := &RobotsCache{}
	rc.cache = make(map[string]robotsEntry)
	return rc
}
// maxRobotsBody caps how much of a robots.txt response body is read,
// guarding against huge or hostile responses.
const maxRobotsBody = 512 << 10 // 512 KiB

// Allowed reports whether the `keyhunter` user-agent may fetch rawURL per the
// host's robots.txt. Results are cached per-host for 1 hour.
//
// Fetch, parse, and HTTP (4xx/5xx) failures all default-allow so a broken
// robots endpoint does not silently disable a recon source. Failures are not
// cached: a host whose robots.txt recovers is respected on the next call.
func (rc *RobotsCache) Allowed(ctx context.Context, rawURL string) (bool, error) {
	u, err := url.Parse(rawURL)
	if err != nil || u.Host == "" {
		// Unparseable or host-less URLs cannot be matched against any
		// robots.txt; default-allow.
		return true, nil
	}

	// Robots rules match against the path component; an empty path
	// (e.g. "http://host") means the site root.
	path := u.Path
	if path == "" {
		path = "/"
	}

	if data, ok := rc.lookup(u.Host); ok {
		return data.TestAgent(path, robotsUA), nil
	}

	data := rc.fetch(ctx, u.Scheme, u.Host)
	if data == nil {
		// Fetch or parse failed; default-allow and do not cache, so
		// the next call retries the fetch.
		return true, nil
	}

	rc.mu.Lock()
	rc.cache[u.Host] = robotsEntry{data: data, fetched: time.Now()}
	rc.mu.Unlock()
	return data.TestAgent(path, robotsUA), nil
}

// lookup returns the cached robots data for host, if present and fresher
// than robotsTTL.
func (rc *RobotsCache) lookup(host string) (*robotstxt.RobotsData, bool) {
	rc.mu.Lock()
	defer rc.mu.Unlock()
	entry, ok := rc.cache[host]
	if !ok || time.Since(entry.fetched) >= robotsTTL {
		return nil, false
	}
	return entry.data, true
}

// fetch retrieves and parses scheme://host/robots.txt. It returns nil on any
// failure: request construction, network error, 4xx/5xx status, unreadable
// body, or parse error.
func (rc *RobotsCache) fetch(ctx context.Context, scheme, host string) *robotstxt.RobotsData {
	client := rc.Client
	if client == nil {
		client = http.DefaultClient
	}
	robotsURL := scheme + "://" + host + "/robots.txt"
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
	if err != nil {
		return nil
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		// Best-effort drain so the transport can reuse the connection.
		io.Copy(io.Discard, io.LimitReader(resp.Body, maxRobotsBody)) //nolint:errcheck
		return nil
	}
	// Cap the read: a hostile or misconfigured host must not force us to
	// buffer an unbounded body.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxRobotsBody))
	if err != nil {
		return nil
	}
	data, err := robotstxt.FromBytes(body)
	if err != nil {
		return nil
	}
	return data
}