- Parses robots.txt via temoto/robotstxt
- Caches per host for 1 hour; a second call within the TTL skips the HTTP fetch
- Default-allow on network/parse/4xx/5xx errors
- Matches the 'keyhunter' user-agent against disallowed paths
- Client field allows httptest injection

Satisfies RECON-INFRA-07.
96 lines
2.1 KiB
Go
96 lines
2.1 KiB
Go
package recon
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/temoto/robotstxt"
|
|
)
|
|
|
|
const (
	// robotsTTL is how long a fetched robots.txt ruleset is cached per host
	// before Allowed refetches it.
	robotsTTL = 1 * time.Hour
	// robotsUA is the user-agent token matched against robots.txt rules.
	robotsUA = "keyhunter"
)
|
|
|
|
// robotsEntry is one cached robots.txt ruleset together with its fetch time,
// which Allowed uses to enforce the per-host TTL.
type robotsEntry struct {
	data    *robotstxt.RobotsData // parsed ruleset for one host
	fetched time.Time             // fetch time; the entry is stale once time.Since(fetched) >= robotsTTL
}
|
|
|
|
// RobotsCache fetches and caches per-host robots.txt for 1 hour.
// Sources whose RespectsRobots() returns true should call Allowed before each request.
//
// On fetch or parse failure, Allowed returns true (default-allow) to avoid silently
// disabling recon sources when a target site has a broken robots endpoint.
type RobotsCache struct {
	mu    sync.Mutex             // guards cache
	cache map[string]robotsEntry // keyed by URL host (including port, if present)

	// Client is the HTTP client used to fetch robots.txt. If nil,
	// http.DefaultClient is used. Tests inject httptest.Server.Client().
	Client *http.Client
}
|
|
|
|
// NewRobotsCache returns an empty cache ready for use.
|
|
func NewRobotsCache() *RobotsCache {
|
|
return &RobotsCache{cache: make(map[string]robotsEntry)}
|
|
}
|
|
|
|
// Allowed reports whether the `keyhunter` user-agent may fetch rawURL per the
|
|
// host's robots.txt. Results are cached per-host for 1 hour.
|
|
func (rc *RobotsCache) Allowed(ctx context.Context, rawURL string) (bool, error) {
|
|
u, err := url.Parse(rawURL)
|
|
if err != nil || u.Host == "" {
|
|
return true, nil
|
|
}
|
|
host := u.Host
|
|
|
|
rc.mu.Lock()
|
|
entry, ok := rc.cache[host]
|
|
rc.mu.Unlock()
|
|
if ok && time.Since(entry.fetched) < robotsTTL {
|
|
return entry.data.TestAgent(u.Path, robotsUA), nil
|
|
}
|
|
|
|
client := rc.Client
|
|
if client == nil {
|
|
client = http.DefaultClient
|
|
}
|
|
|
|
robotsURL := u.Scheme + "://" + host + "/robots.txt"
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
|
|
if err != nil {
|
|
return true, nil
|
|
}
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return true, nil
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 400 {
|
|
return true, nil
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return true, nil
|
|
}
|
|
|
|
data, err := robotstxt.FromBytes(body)
|
|
if err != nil {
|
|
return true, nil
|
|
}
|
|
|
|
rc.mu.Lock()
|
|
rc.cache[host] = robotsEntry{data: data, fetched: time.Now()}
|
|
rc.mu.Unlock()
|
|
|
|
return data.TestAgent(u.Path, robotsUA), nil
|
|
}
|