cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
at main 305 lines 7.5 kB view raw
1package articles 2 3import ( 4 "encoding/json" 5 "strings" 6 7 "github.com/antchfx/htmlquery" 8 "golang.org/x/net/html" 9) 10 11// MetadataExtractor implements multi-strategy metadata extraction from HTML documents. 12// It attempts to extract article metadata using OpenGraph, Schema.org, meta tags, 13// and semantic HTML5 elements, with fallback chains for each field. 14type MetadataExtractor struct{} 15 16// NewMetadataExtractor creates a new metadata extractor. 17func NewMetadataExtractor() *MetadataExtractor { 18 return &MetadataExtractor{} 19} 20 21// ExtractMetadata extracts all available metadata from an HTML document. 22// Returns an ExtractionResult with populated metadata fields. 23func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult { 24 if doc == nil { 25 return &ExtractionResult{} 26 } 27 28 result := &ExtractionResult{} 29 30 result.Title = m.ExtractTitle(doc) 31 result.Author = m.ExtractAuthor(doc) 32 result.PublishedDate = m.ExtractPublishedDate(doc) 33 result.SiteName = m.ExtractSiteName(doc) 34 result.Language = m.ExtractLanguage(doc) 35 36 return result 37} 38 39// ExtractTitle extracts the article title using multiple strategies. 40// Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag. 
41func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string { 42 if doc == nil { 43 return "" 44 } 45 46 if title := m.getMetaContent(doc, "property", "og:title"); title != "" { 47 return title 48 } 49 50 if title := m.getSchemaOrgField(doc, "headline"); title != "" { 51 return title 52 } 53 54 if title := m.getSchemaOrgField(doc, "name"); title != "" { 55 return title 56 } 57 58 if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" { 59 return title 60 } 61 62 if title := m.getMetaContent(doc, "property", "article:title"); title != "" { 63 return title 64 } 65 66 if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil { 67 if title := htmlquery.InnerText(h1); title != "" { 68 return strings.TrimSpace(title) 69 } 70 } 71 72 if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil { 73 if title := htmlquery.InnerText(titleNode); title != "" { 74 return strings.TrimSpace(title) 75 } 76 } 77 78 return "" 79} 80 81// ExtractAuthor extracts the article author using multiple strategies. 82// Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements. 
83func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string { 84 if doc == nil { 85 return "" 86 } 87 88 if author := m.getMetaContent(doc, "property", "og:author"); author != "" { 89 return author 90 } 91 92 if author := m.getSchemaOrgField(doc, "author"); author != "" { 93 return author 94 } 95 96 if author := m.getMetaContent(doc, "property", "article:author"); author != "" { 97 return author 98 } 99 100 if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" { 101 return author 102 } 103 104 if author := m.getMetaContent(doc, "name", "author"); author != "" { 105 return author 106 } 107 108 if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil { 109 if author := htmlquery.InnerText(authorLink); author != "" { 110 return strings.TrimSpace(author) 111 } 112 } 113 114 bylineSelectors := []string{ 115 "//span[contains(@class, 'author')]", 116 "//div[contains(@class, 'author')]", 117 "//p[contains(@class, 'byline')]", 118 "//span[contains(@class, 'byline')]", 119 } 120 121 for _, selector := range bylineSelectors { 122 if node := htmlquery.FindOne(doc, selector); node != nil { 123 if author := htmlquery.InnerText(node); author != "" { 124 return strings.TrimSpace(author) 125 } 126 } 127 } 128 129 return "" 130} 131 132// ExtractPublishedDate extracts the publication date using multiple strategies. 133// Tries in order: OpenGraph, Schema.org, article:published_time, time elements. 
134func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string { 135 if doc == nil { 136 return "" 137 } 138 139 if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" { 140 return date 141 } 142 143 if date := m.getSchemaOrgField(doc, "datePublished"); date != "" { 144 return date 145 } 146 147 if date := m.getSchemaOrgField(doc, "publishDate"); date != "" { 148 return date 149 } 150 151 if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" { 152 return date 153 } 154 155 if date := m.getMetaContent(doc, "name", "publication_date"); date != "" { 156 return date 157 } 158 159 if date := m.getMetaContent(doc, "name", "date"); date != "" { 160 return date 161 } 162 163 if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil { 164 for _, attr := range timeNode.Attr { 165 if attr.Key == "datetime" { 166 return attr.Val 167 } 168 } 169 } 170 171 return "" 172} 173 174// ExtractSiteName extracts the site name using multiple strategies. 175// Tries in order: OpenGraph, Schema.org, meta tags. 176func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string { 177 if doc == nil { 178 return "" 179 } 180 181 if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" { 182 return siteName 183 } 184 185 if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" { 186 return publisher 187 } 188 189 if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" { 190 return siteName 191 } 192 193 return "" 194} 195 196// ExtractLanguage extracts the document language. 197// Tries in order: html lang attribute, OpenGraph, meta tags. 
198func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string { 199 if doc == nil { 200 return "" 201 } 202 203 if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil { 204 for _, attr := range htmlNode.Attr { 205 if attr.Key == "lang" { 206 return attr.Val 207 } 208 } 209 } 210 211 if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" { 212 return locale 213 } 214 215 if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" { 216 return lang 217 } 218 219 return "" 220} 221 222// getMetaContent retrieves the content attribute from a meta tag. 223// Searches for meta tags with the specified attribute name and value. 224func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string { 225 if doc == nil { 226 return "" 227 } 228 229 xpath := "//meta[@" + attrName + "='" + attrValue + "']" 230 metaNode := htmlquery.FindOne(doc, xpath) 231 232 if metaNode == nil { 233 return "" 234 } 235 236 for _, attr := range metaNode.Attr { 237 if attr.Key == "content" { 238 return strings.TrimSpace(attr.Val) 239 } 240 } 241 242 return "" 243} 244 245// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data. 
246func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string { 247 if doc == nil { 248 return "" 249 } 250 251 scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']") 252 253 for _, script := range scripts { 254 if script.FirstChild == nil || script.FirstChild.Type != html.TextNode { 255 continue 256 } 257 258 var data map[string]any 259 if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil { 260 continue 261 } 262 263 context, hasContext := data["@context"] 264 typeVal, hasType := data["@type"] 265 266 if !hasContext || !hasType { 267 continue 268 } 269 270 contextStr, ok := context.(string) 271 if !ok || !strings.Contains(contextStr, "schema.org") { 272 continue 273 } 274 275 typeStr, ok := typeVal.(string) 276 if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) { 277 continue 278 } 279 280 if value, exists := data[fieldName]; exists { 281 return m.extractStringValue(value) 282 } 283 } 284 285 return "" 286} 287 288// extractStringValue extracts a string from various JSON value types. 289func (m *MetadataExtractor) extractStringValue(value any) string { 290 switch v := value.(type) { 291 case string: 292 return v 293 case map[string]any: 294 if name, exists := v["name"]; exists { 295 if nameStr, ok := name.(string); ok { 296 return nameStr 297 } 298 } 299 case []any: 300 if len(v) > 0 { 301 return m.extractStringValue(v[0]) 302 } 303 } 304 return "" 305}