feat(09-04): implement RobotsCache with 1h per-host TTL
Parses robots.txt via temoto/robotstxt. Caches results per host for 1 hour; a second call within the TTL skips the HTTP fetch. Default-allows on network, parse, and 4xx/5xx errors. Matches the 'keyhunter' user-agent against disallowed paths. The Client field allows httptest injection. Satisfies RECON-INFRA-07.
This commit is contained in:
95
pkg/recon/robots.go
Normal file
95
pkg/recon/robots.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package recon
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/temoto/robotstxt"
|
||||
)
|
||||
|
||||
const (
	// robotsTTL is how long a fetched robots.txt ruleset is cached per host.
	robotsTTL = 1 * time.Hour
	// robotsUA is the user-agent token matched against robots.txt groups.
	robotsUA = "keyhunter"
)
|
||||
|
||||
// robotsEntry is one cached robots.txt ruleset for a single host.
type robotsEntry struct {
	data    *robotstxt.RobotsData // parsed rules (stored only after a successful parse)
	fetched time.Time             // fetch timestamp, compared against robotsTTL for expiry
}
|
||||
|
||||
// RobotsCache fetches and caches per-host robots.txt for 1 hour.
// Sources whose RespectsRobots() returns true should call Allowed before each request.
//
// On fetch or parse failure, Allowed returns true (default-allow) to avoid silently
// disabling recon sources when a target site has a broken robots endpoint.
//
// The zero value is not usable (Allowed writes into the map); construct with
// NewRobotsCache.
type RobotsCache struct {
	mu    sync.Mutex             // guards cache
	cache map[string]robotsEntry // keyed by URL host (including port, if any)

	// Client is the HTTP client used to fetch robots.txt. If nil,
	// http.DefaultClient is used. Tests inject httptest.Server.Client().
	Client *http.Client
}
|
||||
|
||||
// NewRobotsCache returns an empty cache ready for use.
|
||||
func NewRobotsCache() *RobotsCache {
|
||||
return &RobotsCache{cache: make(map[string]robotsEntry)}
|
||||
}
|
||||
|
||||
// Allowed reports whether the `keyhunter` user-agent may fetch rawURL per the
|
||||
// host's robots.txt. Results are cached per-host for 1 hour.
|
||||
func (rc *RobotsCache) Allowed(ctx context.Context, rawURL string) (bool, error) {
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil || u.Host == "" {
|
||||
return true, nil
|
||||
}
|
||||
host := u.Host
|
||||
|
||||
rc.mu.Lock()
|
||||
entry, ok := rc.cache[host]
|
||||
rc.mu.Unlock()
|
||||
if ok && time.Since(entry.fetched) < robotsTTL {
|
||||
return entry.data.TestAgent(u.Path, robotsUA), nil
|
||||
}
|
||||
|
||||
client := rc.Client
|
||||
if client == nil {
|
||||
client = http.DefaultClient
|
||||
}
|
||||
|
||||
robotsURL := u.Scheme + "://" + host + "/robots.txt"
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
|
||||
if err != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return true, nil
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
data, err := robotstxt.FromBytes(body)
|
||||
if err != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
rc.mu.Lock()
|
||||
rc.cache[host] = robotsEntry{data: data, fetched: time.Now()}
|
||||
rc.mu.Unlock()
|
||||
|
||||
return data.TestAgent(u.Path, robotsUA), nil
|
||||
}
|
||||
Reference in New Issue
Block a user