package articles

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"unicode/utf8"

	"golang.org/x/net/html"
)

// ContentScore represents the score and metadata for a content node.
type ContentScore struct {
	Node            *html.Node
	Score           float64
	TextLength      int
	LinkDensity     float64
	ParagraphCount  int
	AncestorDepth   int
	ConfidenceLevel float64
}

// Scorer implements Readability-style heuristic scoring for content extraction.
type Scorer struct {
	linkDensityWeight   float64
	classWeightPositive float64
	classWeightNegative float64
	paragraphWeight     float64
	ancestorDecayFactor float64
	minContentLength    int
	minScore            float64
	positivePattern     *regexp.Regexp
	negativePattern     *regexp.Regexp
	unlikelyPattern     *regexp.Regexp
}

// NewScorer creates a new Scorer with default Readability.js-inspired weights.
func NewScorer() *Scorer {
	return &Scorer{
		linkDensityWeight:   -1.0,
		classWeightPositive: 25.0,
		classWeightNegative: -25.0,
		paragraphWeight:     1.0,
		ancestorDecayFactor: 0.5,
		minContentLength:    140,
		minScore:            20.0,
		positivePattern:     regexp.MustCompile(`(?i)(article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story)`),
		negativePattern:     regexp.MustCompile(`(?i)(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|ad-|advertisement|breadcrumb|hidden|nav|menu|header)`),
		unlikelyPattern:     regexp.MustCompile(`(?i)(banner|cookie|popup|modal)`),
	}
}

// ScoreNode calculates a content score for the given node based on multiple heuristics.
// This implements the core Readability scoring algorithm.
func (s *Scorer) ScoreNode(node *html.Node) *ContentScore {
	if node == nil || node.Type != html.ElementNode {
		return nil
	}

	score := &ContentScore{
		Node:          node,
		Score:         0.0,
		AncestorDepth: s.calculateDepth(node),
	}

	score.Score = s.getTagScore(node.Data)
	score.Score += s.getClassIdScore(node)

	score.TextLength = s.calculateTextLength(node)
	score.LinkDensity = s.calculateLinkDensity(node)
	score.ParagraphCount = s.countParagraphs(node)

	score.Score += score.LinkDensity * s.linkDensityWeight
	score.Score += float64(score.ParagraphCount) * s.paragraphWeight
	score.Score += s.getTextLengthScore(score.TextLength)

	score.ConfidenceLevel = s.calculateConfidence(score)
	return score
}

// getTagScore returns a base score based on the HTML tag type.
// Some tags are more likely to contain main content than others.
func (s *Scorer) getTagScore(tagName string) float64 {
	switch strings.ToLower(tagName) {
	case "main":
		return 40.0
	case "article":
		return 30.0
	case "section":
		return 15.0
	case "div":
		return 5.0
	case "p", "pre", "td", "blockquote":
		return 3.0
	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
		return -3.0
	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
		return -5.0
	default:
		return 0.0
	}
}

// getClassIdScore analyzes class and ID attributes for positive/negative indicators.
// Returns a positive score for content-like names, negative for navigation/ads.
func (s *Scorer) getClassIdScore(node *html.Node) float64 {
	score := 0.0
	classID := s.getClassAndID(node)
	if classID == "" {
		return 0.0
	}

	// Nodes that look like page chrome (banners, cookie bars, modals) are
	// penalized heavily and skip the normal positive/negative weighting.
	if s.unlikelyPattern.MatchString(classID) {
		return -50.0
	}
	if s.negativePattern.MatchString(classID) {
		score += s.classWeightNegative
	}
	if s.positivePattern.MatchString(classID) {
		score += s.classWeightPositive
	}
	return score
}
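// Worked example (illustrative only, assuming the default weights from
// NewScorer above): for <div class="article-content sidebar">,
// getTagScore("div") contributes +5; the class string matches both
// positivePattern ("article", "content", +25) and negativePattern
// ("sidebar", -25), so getClassIdScore nets 0 and the running score is 5
// before the paragraph, link-density, and text-length terms are added
// in ScoreNode.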
// getClassAndID concatenates class and ID attributes for pattern matching.
func (s *Scorer) getClassAndID(node *html.Node) string {
	var parts []string
	for _, attr := range node.Attr {
		if attr.Key == "class" || attr.Key == "id" {
			parts = append(parts, attr.Val)
		}
	}
	return strings.Join(parts, " ")
}

// calculateTextLength returns the total text length within the node,
// counted in runes so multi-byte scripts are not over-weighted.
func (s *Scorer) calculateTextLength(node *html.Node) int {
	text := s.getInnerText(node)
	return utf8.RuneCountInString(strings.TrimSpace(text))
}

// calculateLinkDensity calculates the ratio of link text to total text.
// Higher link density indicates navigation or related links, not main content.
func (s *Scorer) calculateLinkDensity(node *html.Node) float64 {
	totalLen := utf8.RuneCountInString(strings.TrimSpace(s.getInnerText(node)))
	linkLen := utf8.RuneCountInString(strings.TrimSpace(s.getLinkText(node)))
	if totalLen == 0 {
		return 0.0
	}
	return float64(linkLen) / float64(totalLen)
}

// getInnerText extracts all text content from a node and its descendants.
func (s *Scorer) getInnerText(node *html.Node) string {
	var buf strings.Builder
	s.extractText(node, &buf)
	return buf.String()
}

// extractText recursively extracts text from a node tree, skipping script,
// style, and noscript subtrees. A space is appended after each text node so
// that text from adjacent elements does not run together.
func (s *Scorer) extractText(node *html.Node, buf *strings.Builder) {
	if node == nil {
		return
	}
	if node.Type == html.TextNode {
		buf.WriteString(node.Data)
		buf.WriteString(" ")
		return
	}
	if node.Type == html.ElementNode {
		tag := strings.ToLower(node.Data)
		if tag == "script" || tag == "style" || tag == "noscript" {
			return
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		s.extractText(child, buf)
	}
}

// getLinkText extracts text from anchor tags only.
func (s *Scorer) getLinkText(node *html.Node) string {
	var buf strings.Builder
	s.extractLinkText(node, &buf)
	return buf.String()
}

// extractLinkText recursively extracts text from anchor tags.
func (s *Scorer) extractLinkText(node *html.Node, buf *strings.Builder) {
	if node == nil {
		return
	}
	if node.Type == html.ElementNode && strings.ToLower(node.Data) == "a" {
		s.extractText(node, buf)
		return
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		s.extractLinkText(child, buf)
	}
}

// countParagraphs counts paragraph elements within the node.
func (s *Scorer) countParagraphs(node *html.Node) int {
	count := 0
	s.walkParagraphs(node, &count)
	return count
}

// walkParagraphs recursively counts paragraph elements.
func (s *Scorer) walkParagraphs(node *html.Node, count *int) {
	if node == nil {
		return
	}
	if node.Type == html.ElementNode && strings.ToLower(node.Data) == "p" {
		*count++
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		s.walkParagraphs(child, count)
	}
}

// getTextLengthScore provides a logarithmic bonus for nodes with substantial
// text content, so very long pages do not dominate purely on length.
func (s *Scorer) getTextLengthScore(textLen int) float64 {
	if textLen < 25 {
		return 0.0
	}
	return math.Log10(float64(textLen)) * 2.0
}

// calculateDepth calculates how deep in the DOM tree this node is.
func (s *Scorer) calculateDepth(node *html.Node) int {
	depth := 0
	for n := node.Parent; n != nil; n = n.Parent {
		depth++
	}
	return depth
}
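// Worked example (illustrative only): a node whose trimmed inner text is 600
// runes, 120 of which sit inside <a> tags, has a link density of
// 120/600 = 0.2; with linkDensityWeight = -1.0 that subtracts 0.2 from the
// score, while getTextLengthScore(600) adds log10(600)*2 ≈ 5.56.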
// ScoreAncestors propagates scores up the DOM tree with decay.
// This implements the Readability algorithm's ancestor scoring: a strong
// node lifts its parent by decay^1, its grandparent by decay^2, and so on,
// for up to five element ancestors.
func (s *Scorer) ScoreAncestors(scores map[*html.Node]*ContentScore, node *html.Node, baseScore float64) {
	if node == nil || baseScore <= 0 {
		return
	}

	level := 0
	for parent := node.Parent; parent != nil && level < 5; parent = parent.Parent {
		if parent.Type != html.ElementNode {
			continue
		}
		if _, exists := scores[parent]; !exists {
			parentScore := s.ScoreNode(parent)
			if parentScore == nil {
				// Don't store a nil entry: a later visit would otherwise
				// find it in the map and dereference it.
				continue
			}
			scores[parent] = parentScore
		}
		decayed := baseScore * math.Pow(s.ancestorDecayFactor, float64(level+1))
		scores[parent].Score += decayed
		level++
	}
}

// FindTopCandidates identifies the n highest-scoring content candidates.
func (s *Scorer) FindTopCandidates(root *html.Node, n int) []*ContentScore {
	if root == nil || n <= 0 {
		return nil
	}

	scores := make(map[*html.Node]*ContentScore)
	s.scoreTree(root, scores)

	var candidates []*ContentScore
	for _, score := range scores {
		if score.Score >= s.minScore && score.TextLength >= s.minContentLength {
			candidates = append(candidates, score)
		}
	}

	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].Score > candidates[j].Score
	})

	if len(candidates) > n {
		candidates = candidates[:n]
	}
	return candidates
}

// scoreTree recursively scores all nodes in the tree and propagates positive
// scores up to their ancestors.
func (s *Scorer) scoreTree(node *html.Node, scores map[*html.Node]*ContentScore) {
	if node == nil {
		return
	}
	if node.Type == html.ElementNode {
		tag := strings.ToLower(node.Data)
		if tag != "script" && tag != "style" && tag != "noscript" {
			score := s.ScoreNode(node)
			if score != nil && score.Score > 0 {
				scores[node] = score
				s.ScoreAncestors(scores, node, score.Score)
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		s.scoreTree(child, scores)
	}
}

// calculateConfidence estimates how confident we are in this content
// selection, on a scale from 0.0 to 1.0. It rewards a high score, substantial
// text, low link density, and multiple paragraphs.
func (s *Scorer) calculateConfidence(score *ContentScore) float64 {
	if score == nil {
		return 0.0
	}

	confidence := 0.0
	if score.Score > s.minScore*2 {
		confidence += 0.3
	} else if score.Score > s.minScore {
		confidence += 0.15
	}
	if score.TextLength > s.minContentLength*3 {
		confidence += 0.3
	} else if score.TextLength > s.minContentLength {
		confidence += 0.15
	}
	if score.LinkDensity < 0.2 {
		confidence += 0.2
	} else if score.LinkDensity < 0.4 {
		confidence += 0.1
	}
	if score.ParagraphCount >= 3 {
		confidence += 0.2
	} else if score.ParagraphCount >= 1 {
		confidence += 0.1
	}
	if confidence > 1.0 {
		confidence = 1.0
	}
	return confidence
}

// IsProbablyReadable determines if a document is likely to have extractable content.
// This is inspired by Readability.js's isProbablyReaderable function.
func (s *Scorer) IsProbablyReadable(doc *html.Node) bool {
	if doc == nil {
		return false
	}
	paragraphCount := s.countParagraphs(doc)
	textLength := s.calculateTextLength(doc)
	return paragraphCount >= 3 && textLength >= s.minContentLength
}
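// ScoreDocument is a usage sketch, not part of the original API: it wires
// IsProbablyReadable and FindTopCandidates together for a raw HTML string.
// The function name and error handling here are assumptions for illustration.
func ScoreDocument(rawHTML string) []*ContentScore {
	doc, err := html.Parse(strings.NewReader(rawHTML))
	if err != nil {
		return nil // sketch only: real callers would surface the parse error
	}
	scorer := NewScorer()
	if !scorer.IsProbablyReadable(doc) {
		return nil // too little paragraph text to be worth extracting
	}
	// Ask for the three best candidates; callers would typically take the
	// first and fall back to the others if post-validation rejects it.
	return scorer.FindTopCandidates(doc, 3)
}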