package articles import ( "encoding/json" "strings" "github.com/antchfx/htmlquery" "golang.org/x/net/html" ) // MetadataExtractor implements multi-strategy metadata extraction from HTML documents. // It attempts to extract article metadata using OpenGraph, Schema.org, meta tags, // and semantic HTML5 elements, with fallback chains for each field. type MetadataExtractor struct{} // NewMetadataExtractor creates a new metadata extractor. func NewMetadataExtractor() *MetadataExtractor { return &MetadataExtractor{} } // ExtractMetadata extracts all available metadata from an HTML document. // Returns an ExtractionResult with populated metadata fields. func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult { if doc == nil { return &ExtractionResult{} } result := &ExtractionResult{} result.Title = m.ExtractTitle(doc) result.Author = m.ExtractAuthor(doc) result.PublishedDate = m.ExtractPublishedDate(doc) result.SiteName = m.ExtractSiteName(doc) result.Language = m.ExtractLanguage(doc) return result } // ExtractTitle extracts the article title using multiple strategies. // Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag. func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string { if doc == nil { return "" } if title := m.getMetaContent(doc, "property", "og:title"); title != "" { return title } if title := m.getSchemaOrgField(doc, "headline"); title != "" { return title } if title := m.getSchemaOrgField(doc, "name"); title != "" { return title } if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" { return title } if title := m.getMetaContent(doc, "property", "article:title"); title != "" { return title } if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil { if title := htmlquery.InnerText(h1); title != "" { return strings.TrimSpace(title) } } if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil { if title := htmlquery.InnerText(titleNode); title != "" { return strings.TrimSpace(title) } } return "" } // ExtractAuthor extracts the article author using multiple strategies. // Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements. func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string { if doc == nil { return "" } if author := m.getMetaContent(doc, "property", "og:author"); author != "" { return author } if author := m.getSchemaOrgField(doc, "author"); author != "" { return author } if author := m.getMetaContent(doc, "property", "article:author"); author != "" { return author } if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" { return author } if author := m.getMetaContent(doc, "name", "author"); author != "" { return author } if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil { if author := htmlquery.InnerText(authorLink); author != "" { return strings.TrimSpace(author) } } bylineSelectors := []string{ "//span[contains(@class, 'author')]", "//div[contains(@class, 'author')]", "//p[contains(@class, 'byline')]", "//span[contains(@class, 'byline')]", } for _, selector := range bylineSelectors { if node := htmlquery.FindOne(doc, selector); node != nil { if author := htmlquery.InnerText(node); author != "" { return strings.TrimSpace(author) } } } return "" } // ExtractPublishedDate extracts the publication date using multiple strategies. // Tries in order: OpenGraph, Schema.org, article:published_time, time elements. func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string { if doc == nil { return "" } if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" { return date } if date := m.getSchemaOrgField(doc, "datePublished"); date != "" { return date } if date := m.getSchemaOrgField(doc, "publishDate"); date != "" { return date } if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" { return date } if date := m.getMetaContent(doc, "name", "publication_date"); date != "" { return date } if date := m.getMetaContent(doc, "name", "date"); date != "" { return date } if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil { for _, attr := range timeNode.Attr { if attr.Key == "datetime" { return attr.Val } } } return "" } // ExtractSiteName extracts the site name using multiple strategies. // Tries in order: OpenGraph, Schema.org, meta tags. func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string { if doc == nil { return "" } if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" { return siteName } if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" { return publisher } if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" { return siteName } return "" } // ExtractLanguage extracts the document language. // Tries in order: html lang attribute, OpenGraph, meta tags. func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string { if doc == nil { return "" } if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil { for _, attr := range htmlNode.Attr { if attr.Key == "lang" { return attr.Val } } } if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" { return locale } if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" { return lang } return "" } // getMetaContent retrieves the content attribute from a meta tag. // Searches for meta tags with the specified attribute name and value. func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string { if doc == nil { return "" } xpath := "//meta[@" + attrName + "='" + attrValue + "']" metaNode := htmlquery.FindOne(doc, xpath) if metaNode == nil { return "" } for _, attr := range metaNode.Attr { if attr.Key == "content" { return strings.TrimSpace(attr.Val) } } return "" } // getSchemaOrgField extracts a field from Schema.org JSON-LD structured data. func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string { if doc == nil { return "" } scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']") for _, script := range scripts { if script.FirstChild == nil || script.FirstChild.Type != html.TextNode { continue } var data map[string]any if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil { continue } context, hasContext := data["@context"] typeVal, hasType := data["@type"] if !hasContext || !hasType { continue } contextStr, ok := context.(string) if !ok || !strings.Contains(contextStr, "schema.org") { continue } typeStr, ok := typeVal.(string) if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) { continue } if value, exists := data[fieldName]; exists { return m.extractStringValue(value) } } return "" } // extractStringValue extracts a string from various JSON value types. func (m *MetadataExtractor) extractStringValue(value any) string { switch v := value.(type) { case string: return v case map[string]any: if name, exists := v["name"]; exists { if nameStr, ok := name.(string); ok { return nameStr } } case []any: if len(v) > 0 { return m.extractStringValue(v[0]) } } return "" }