package sources

import (
	"bufio"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
// Index Server API. It queries the configured index for pages matching
// provider keywords.
//
// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
// original URL discovered in the crawl. The source is credentialless and
// always enabled.
//
// The zero value is usable: Sweep substitutes defaults for an empty BaseURL,
// an empty IndexName and a nil Client, and skips rate limiting when Limiters
// is nil.
type CommonCrawlSource struct {
	// BaseURL is the index server root. When empty, Sweep uses
	// https://index.commoncrawl.org. Tests override with an httptest URL.
	BaseURL string
	// IndexName selects the crawl to search. When empty, Sweep uses the
	// hard-coded CC-MAIN-2024-10; set it explicitly to search another crawl.
	IndexName string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry. When nil, Sweep issues
	// requests without waiting on a limiter.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, Sweep creates one with
	// NewClient.
	Client *Client
}

// Compile-time assertion that *CommonCrawlSource satisfies recon.ReconSource:
// the build fails on this line if a required method is missing or has the
// wrong signature. The typed nil pointer is never used at run time.
var _ recon.ReconSource = (*CommonCrawlSource)(nil)

// Name returns the identifier of this source, "commoncrawl".
func (s *CommonCrawlSource) Name() string {
	return "commoncrawl"
}

// RateLimit returns the pacing for this source: one request every five
// seconds.
func (s *CommonCrawlSource) RateLimit() rate.Limit {
	const interval = 5 * time.Second
	return rate.Every(interval)
}

// Burst returns the limiter burst size for this source, which is 1.
func (s *CommonCrawlSource) Burst() int {
	return 1
}

// RespectsRobots reports true for this source.
func (s *CommonCrawlSource) RespectsRobots() bool {
	return true
}

// Enabled reports whether this source may run. The configuration argument is
// ignored and the result is always true, because the CommonCrawl index is
// unauthenticated.
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool {
	return true
}

// Sweep iterates provider keywords, queries the CC index for each, and emits
// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
// object per line) with fields like url, timestamp, status, mime, etc.
//
// The second parameter is ignored. Findings are sent on out; Sweep never
// closes the channel.
//
// Sweep returns nil when every keyword has been processed (or when there are
// no keywords). It returns a non-nil error when ctx is cancelled, when the
// rate limiter wait fails, or when a request cannot be constructed. Request
// failures and unreadable response lines that are not caused by cancellation
// are treated as transient: the affected keyword or line is skipped.
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		base = "https://index.commoncrawl.org"
	}
	idx := s.IndexName
	if idx == "" {
		idx = "CC-MAIN-2024-10"
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}

	queries := BuildQueries(s.Registry, "commoncrawl")
	if len(queries) == 0 {
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}

		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		// CC Index API: output=json returns NDJSON, limit=50 bounds the response.
		endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s",
			base, idx, url.QueryEscape(q))
		if err := s.sweepQuery(ctx, client, endpoint, out); err != nil {
			return err
		}
	}
	return nil
}

// sweepQuery performs one index request against endpoint and emits a Finding
// on out for every NDJSON record that carries a non-empty URL. It returns nil
// for transient failures so the caller can move on to the next keyword, and a
// non-nil error only for request construction failures or context
// cancellation.
func (s *CommonCrawlSource) sweepQuery(ctx context.Context, client *Client, endpoint string, out chan<- recon.Finding) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return fmt.Errorf("commoncrawl: build req: %w", err)
	}
	req.Header.Set("Accept", "application/json")

	resp, err := client.Do(ctx, req)
	if err != nil {
		// Cancellation is not transient: report it now instead of relying on
		// the caller's next loop iteration, which does not exist for the
		// final keyword.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}
		// Non-fatal: skip this keyword on transient errors.
		return nil
	}
	// A single deferred close covers every exit path below. The close error
	// is ignored because nothing more is read from the body afterwards.
	defer func() { _ = resp.Body.Close() }()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 {
			continue
		}

		var rec ccIndexRecord
		if err := json.Unmarshal(line, &rec); err != nil {
			// Tolerate malformed lines; the rest of the response may be fine.
			continue
		}
		if rec.URL == "" {
			continue
		}

		f := recon.Finding{
			ProviderName: "",
			Source:       rec.URL,
			SourceType:   "recon:commoncrawl",
			Confidence:   "low",
			DetectedAt:   time.Now(),
		}
		select {
		case out <- f:
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	if err := scanner.Err(); err != nil {
		// A read aborted by cancellation must surface as the context error.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}
		// Any other read failure (truncated body, over-long line) is handled
		// like a transport error: keep what was emitted and move on.
	}
	return nil
}

// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index
// API that this source decodes. JSON keys without a matching field here (mime,
// digest, etc.) are dropped by encoding/json, which keeps the decoder tolerant
// of extra or newly added fields.
type ccIndexRecord struct {
	// URL is the crawled page address; Sweep skips records where it is empty.
	URL       string `json:"url"`
	// Timestamp is decoded from the "timestamp" key; Sweep does not read it.
	Timestamp string `json:"timestamp"`
	// Status is decoded from the "status" key; Sweep does not read it.
	Status    string `json:"status"`
}