A browser extension that lets you summarize any webpage and ask questions using AI.
at main 2786 lines 90 kB view raw
1/* 2 * Copyright (c) 2010 Arc90 Inc 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * This code is heavily based on Arc90's readability.js (1.7.1) script 19 * available at: http://code.google.com/p/arc90labs-readability 20 */ 21 22/** 23 * Public constructor. 24 * @param {HTMLDocument} doc The document to parse. 25 * @param {Object} options The options object. 26 */ 27function Readability(doc, options) { 28 // In some older versions, people passed a URI as the first argument. Cope: 29 if (options && options.documentElement) { 30 doc = options; 31 options = arguments[2]; 32 } else if (!doc || !doc.documentElement) { 33 throw new Error( 34 "First argument to Readability constructor should be a document object." 
35 ); 36 } 37 options = options || {}; 38 39 this._doc = doc; 40 this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; 41 this._articleTitle = null; 42 this._articleByline = null; 43 this._articleDir = null; 44 this._articleSiteName = null; 45 this._attempts = []; 46 this._metadata = {}; 47 48 // Configurable options 49 this._debug = !!options.debug; 50 this._maxElemsToParse = 51 options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; 52 this._nbTopCandidates = 53 options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; 54 this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; 55 this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat( 56 options.classesToPreserve || [] 57 ); 58 this._keepClasses = !!options.keepClasses; 59 this._serializer = 60 options.serializer || 61 function (el) { 62 return el.innerHTML; 63 }; 64 this._disableJSONLD = !!options.disableJSONLD; 65 this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; 66 this._linkDensityModifier = options.linkDensityModifier || 0; 67 68 // Start with all flags set 69 this._flags = 70 this.FLAG_STRIP_UNLIKELYS | 71 this.FLAG_WEIGHT_CLASSES | 72 this.FLAG_CLEAN_CONDITIONALLY; 73 74 // Control whether log messages are sent to the console 75 if (this._debug) { 76 let logNode = function (node) { 77 if (node.nodeType == node.TEXT_NODE) { 78 return `${node.nodeName} ("${node.textContent}")`; 79 } 80 let attrPairs = Array.from(node.attributes || [], function (attr) { 81 return `${attr.name}="${attr.value}"`; 82 }).join(" "); 83 return `<${node.localName} ${attrPairs}>`; 84 }; 85 this.log = function () { 86 if (typeof console !== "undefined") { 87 let args = Array.from(arguments, arg => { 88 if (arg && arg.nodeType == this.ELEMENT_NODE) { 89 return logNode(arg); 90 } 91 return arg; 92 }); 93 args.unshift("Reader: (Readability)"); 94 // eslint-disable-next-line no-console 95 console.log(...args); 96 } else if (typeof dump !== "undefined") { 97 /* global dump */ 98 
var msg = Array.prototype.map 99 .call(arguments, function (x) { 100 return x && x.nodeName ? logNode(x) : x; 101 }) 102 .join(" "); 103 dump("Reader: (Readability) " + msg + "\n"); 104 } 105 }; 106 } else { 107 this.log = function () {}; 108 } 109} 110 111Readability.prototype = { 112 FLAG_STRIP_UNLIKELYS: 0x1, 113 FLAG_WEIGHT_CLASSES: 0x2, 114 FLAG_CLEAN_CONDITIONALLY: 0x4, 115 116 // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType 117 ELEMENT_NODE: 1, 118 TEXT_NODE: 3, 119 120 // Max number of nodes supported by this parser. Default: 0 (no limit) 121 DEFAULT_MAX_ELEMS_TO_PARSE: 0, 122 123 // The number of top candidates to consider when analysing how 124 // tight the competition is among candidates. 125 DEFAULT_N_TOP_CANDIDATES: 5, 126 127 // Element tags to score by default. 128 DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre" 129 .toUpperCase() 130 .split(","), 131 132 // The default number of chars an article must have in order to return a result 133 DEFAULT_CHAR_THRESHOLD: 500, 134 135 // All of the regular expressions in use within readability. 136 // Defined up here so we don't instantiate them repeatedly in loops. 137 REGEXPS: { 138 // NOTE: These two regular expressions are duplicated in 139 // Readability-readerable.js. Please keep both copies in sync. 
140 unlikelyCandidates: 141 /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, 142 okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, 143 144 positive: 145 /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, 146 negative: 147 /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i, 148 extraneous: 149 /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, 150 byline: /byline|author|dateline|writtenby|p-author/i, 151 replaceFonts: /<(\/?)font[^>]*>/gi, 152 normalize: /\s{2,}/g, 153 videos: 154 /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, 155 shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, 156 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, 157 prevLink: /(prev|earl|old|new|<|«)/i, 158 tokenize: /\W+/g, 159 whitespace: /^\s*$/, 160 hasContent: /\S$/, 161 hashUrl: /^#.+/, 162 srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, 163 b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, 164 // Commas as used in Latin, Sindhi, Chinese and various other scripts. 
165 // see: https://en.wikipedia.org/wiki/Comma#Comma_variants 166 commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g, 167 // See: https://schema.org/Article 168 jsonLdArticleTypes: 169 /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/, 170 // used to see if a node's content matches words commonly used for ad blocks or loading indicators 171 adWords: 172 /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu, 173 loadingWords: 174 /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu, 175 }, 176 177 UNLIKELY_ROLES: [ 178 "menu", 179 "menubar", 180 "complementary", 181 "navigation", 182 "alert", 183 "alertdialog", 184 "dialog", 185 ], 186 187 DIV_TO_P_ELEMS: new Set([ 188 "BLOCKQUOTE", 189 "DL", 190 "DIV", 191 "IMG", 192 "OL", 193 "P", 194 "PRE", 195 "TABLE", 196 "UL", 197 ]), 198 199 ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"], 200 201 PRESENTATIONAL_ATTRIBUTES: [ 202 "align", 203 "background", 204 "bgcolor", 205 "border", 206 "cellpadding", 207 "cellspacing", 208 "frame", 209 "hspace", 210 "rules", 211 "style", 212 "valign", 213 "vspace", 214 ], 215 216 DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"], 217 218 // The commented out elements qualify as phrasing content but tend to be 219 // removed by readability when put into paragraphs, so we ignore them here. 
220 PHRASING_ELEMS: [ 221 // "CANVAS", "IFRAME", "SVG", "VIDEO", 222 "ABBR", 223 "AUDIO", 224 "B", 225 "BDO", 226 "BR", 227 "BUTTON", 228 "CITE", 229 "CODE", 230 "DATA", 231 "DATALIST", 232 "DFN", 233 "EM", 234 "EMBED", 235 "I", 236 "IMG", 237 "INPUT", 238 "KBD", 239 "LABEL", 240 "MARK", 241 "MATH", 242 "METER", 243 "NOSCRIPT", 244 "OBJECT", 245 "OUTPUT", 246 "PROGRESS", 247 "Q", 248 "RUBY", 249 "SAMP", 250 "SCRIPT", 251 "SELECT", 252 "SMALL", 253 "SPAN", 254 "STRONG", 255 "SUB", 256 "SUP", 257 "TEXTAREA", 258 "TIME", 259 "VAR", 260 "WBR", 261 ], 262 263 // These are the classes that readability sets itself. 264 CLASSES_TO_PRESERVE: ["page"], 265 266 // These are the list of HTML entities that need to be escaped. 267 HTML_ESCAPE_MAP: { 268 lt: "<", 269 gt: ">", 270 amp: "&", 271 quot: '"', 272 apos: "'", 273 }, 274 275 /** 276 * Run any post-process modifications to article content as necessary. 277 * 278 * @param Element 279 * @return void 280 **/ 281 _postProcessContent(articleContent) { 282 // Readability cannot open relative uris so we convert them to absolute uris. 283 this._fixRelativeUris(articleContent); 284 285 this._simplifyNestedElements(articleContent); 286 287 if (!this._keepClasses) { 288 // Remove classes. 289 this._cleanClasses(articleContent); 290 } 291 }, 292 293 /** 294 * Iterates over a NodeList, calls `filterFn` for each node and removes node 295 * if function returned `true`. 296 * 297 * If function is not passed, removes all the nodes in node list. 298 * 299 * @param NodeList nodeList The nodes to operate on 300 * @param Function filterFn the function to use as a filter 301 * @return void 302 */ 303 _removeNodes(nodeList, filterFn) { 304 // Avoid ever operating on live node lists. 
305 if (this._docJSDOMParser && nodeList._isLiveNodeList) { 306 throw new Error("Do not pass live node lists to _removeNodes"); 307 } 308 for (var i = nodeList.length - 1; i >= 0; i--) { 309 var node = nodeList[i]; 310 var parentNode = node.parentNode; 311 if (parentNode) { 312 if (!filterFn || filterFn.call(this, node, i, nodeList)) { 313 parentNode.removeChild(node); 314 } 315 } 316 } 317 }, 318 319 /** 320 * Iterates over a NodeList, and calls _setNodeTag for each node. 321 * 322 * @param NodeList nodeList The nodes to operate on 323 * @param String newTagName the new tag name to use 324 * @return void 325 */ 326 _replaceNodeTags(nodeList, newTagName) { 327 // Avoid ever operating on live node lists. 328 if (this._docJSDOMParser && nodeList._isLiveNodeList) { 329 throw new Error("Do not pass live node lists to _replaceNodeTags"); 330 } 331 for (const node of nodeList) { 332 this._setNodeTag(node, newTagName); 333 } 334 }, 335 336 /** 337 * Iterate over a NodeList, which doesn't natively fully implement the Array 338 * interface. 339 * 340 * For convenience, the current object context is applied to the provided 341 * iterate function. 342 * 343 * @param NodeList nodeList The NodeList. 344 * @param Function fn The iterate function. 345 * @return void 346 */ 347 _forEachNode(nodeList, fn) { 348 Array.prototype.forEach.call(nodeList, fn, this); 349 }, 350 351 /** 352 * Iterate over a NodeList, and return the first node that passes 353 * the supplied test function 354 * 355 * For convenience, the current object context is applied to the provided 356 * test function. 357 * 358 * @param NodeList nodeList The NodeList. 359 * @param Function fn The test function. 360 * @return void 361 */ 362 _findNode(nodeList, fn) { 363 return Array.prototype.find.call(nodeList, fn, this); 364 }, 365 366 /** 367 * Iterate over a NodeList, return true if any of the provided iterate 368 * function calls returns true, false otherwise. 
369 * 370 * For convenience, the current object context is applied to the 371 * provided iterate function. 372 * 373 * @param NodeList nodeList The NodeList. 374 * @param Function fn The iterate function. 375 * @return Boolean 376 */ 377 _someNode(nodeList, fn) { 378 return Array.prototype.some.call(nodeList, fn, this); 379 }, 380 381 /** 382 * Iterate over a NodeList, return true if all of the provided iterate 383 * function calls return true, false otherwise. 384 * 385 * For convenience, the current object context is applied to the 386 * provided iterate function. 387 * 388 * @param NodeList nodeList The NodeList. 389 * @param Function fn The iterate function. 390 * @return Boolean 391 */ 392 _everyNode(nodeList, fn) { 393 return Array.prototype.every.call(nodeList, fn, this); 394 }, 395 396 _getAllNodesWithTag(node, tagNames) { 397 if (node.querySelectorAll) { 398 return node.querySelectorAll(tagNames.join(",")); 399 } 400 return [].concat.apply( 401 [], 402 tagNames.map(function (tag) { 403 var collection = node.getElementsByTagName(tag); 404 return Array.isArray(collection) ? collection : Array.from(collection); 405 }) 406 ); 407 }, 408 409 /** 410 * Removes the class="" attribute from every element in the given 411 * subtree, except those that match CLASSES_TO_PRESERVE and 412 * the classesToPreserve array from the options object. 413 * 414 * @param Element 415 * @return void 416 */ 417 _cleanClasses(node) { 418 var classesToPreserve = this._classesToPreserve; 419 var className = (node.getAttribute("class") || "") 420 .split(/\s+/) 421 .filter(cls => classesToPreserve.includes(cls)) 422 .join(" "); 423 424 if (className) { 425 node.setAttribute("class", className); 426 } else { 427 node.removeAttribute("class"); 428 } 429 430 for (node = node.firstElementChild; node; node = node.nextElementSibling) { 431 this._cleanClasses(node); 432 } 433 }, 434 435 /** 436 * Tests whether a string is a URL or not. 
437 * 438 * @param {string} str The string to test 439 * @return {boolean} true if str is a URL, false if not 440 */ 441 _isUrl(str) { 442 try { 443 new URL(str); 444 return true; 445 } catch { 446 return false; 447 } 448 }, 449 /** 450 * Converts each <a> and <img> uri in the given element to an absolute URI, 451 * ignoring #ref URIs. 452 * 453 * @param Element 454 * @return void 455 */ 456 _fixRelativeUris(articleContent) { 457 var baseURI = this._doc.baseURI; 458 var documentURI = this._doc.documentURI; 459 function toAbsoluteURI(uri) { 460 // Leave hash links alone if the base URI matches the document URI: 461 if (baseURI == documentURI && uri.charAt(0) == "#") { 462 return uri; 463 } 464 465 // Otherwise, resolve against base URI: 466 try { 467 return new URL(uri, baseURI).href; 468 } catch (ex) { 469 // Something went wrong, just return the original: 470 } 471 return uri; 472 } 473 474 var links = this._getAllNodesWithTag(articleContent, ["a"]); 475 this._forEachNode(links, function (link) { 476 var href = link.getAttribute("href"); 477 if (href) { 478 // Remove links with javascript: URIs, since 479 // they won't work after scripts have been removed from the page. 
480 if (href.indexOf("javascript:") === 0) { 481 // if the link only contains simple text content, it can be converted to a text node 482 if ( 483 link.childNodes.length === 1 && 484 link.childNodes[0].nodeType === this.TEXT_NODE 485 ) { 486 var text = this._doc.createTextNode(link.textContent); 487 link.parentNode.replaceChild(text, link); 488 } else { 489 // if the link has multiple children, they should all be preserved 490 var container = this._doc.createElement("span"); 491 while (link.firstChild) { 492 container.appendChild(link.firstChild); 493 } 494 link.parentNode.replaceChild(container, link); 495 } 496 } else { 497 link.setAttribute("href", toAbsoluteURI(href)); 498 } 499 } 500 }); 501 502 var medias = this._getAllNodesWithTag(articleContent, [ 503 "img", 504 "picture", 505 "figure", 506 "video", 507 "audio", 508 "source", 509 ]); 510 511 this._forEachNode(medias, function (media) { 512 var src = media.getAttribute("src"); 513 var poster = media.getAttribute("poster"); 514 var srcset = media.getAttribute("srcset"); 515 516 if (src) { 517 media.setAttribute("src", toAbsoluteURI(src)); 518 } 519 520 if (poster) { 521 media.setAttribute("poster", toAbsoluteURI(poster)); 522 } 523 524 if (srcset) { 525 var newSrcset = srcset.replace( 526 this.REGEXPS.srcsetUrl, 527 function (_, p1, p2, p3) { 528 return toAbsoluteURI(p1) + (p2 || "") + p3; 529 } 530 ); 531 532 media.setAttribute("srcset", newSrcset); 533 } 534 }); 535 }, 536 537 _simplifyNestedElements(articleContent) { 538 var node = articleContent; 539 540 while (node) { 541 if ( 542 node.parentNode && 543 ["DIV", "SECTION"].includes(node.tagName) && 544 !(node.id && node.id.startsWith("readability")) 545 ) { 546 if (this._isElementWithoutContent(node)) { 547 node = this._removeAndGetNext(node); 548 continue; 549 } else if ( 550 this._hasSingleTagInsideElement(node, "DIV") || 551 this._hasSingleTagInsideElement(node, "SECTION") 552 ) { 553 var child = node.children[0]; 554 for (var i = 0; i < 
node.attributes.length; i++) { 555 child.setAttributeNode(node.attributes[i].cloneNode()); 556 } 557 node.parentNode.replaceChild(child, node); 558 node = child; 559 continue; 560 } 561 } 562 563 node = this._getNextNode(node); 564 } 565 }, 566 567 /** 568 * Get the article title as an H1. 569 * 570 * @return string 571 **/ 572 _getArticleTitle() { 573 var doc = this._doc; 574 var curTitle = ""; 575 var origTitle = ""; 576 577 try { 578 curTitle = origTitle = doc.title.trim(); 579 580 // If they had an element with id "title" in their HTML 581 if (typeof curTitle !== "string") { 582 curTitle = origTitle = this._getInnerText( 583 doc.getElementsByTagName("title")[0] 584 ); 585 } 586 } catch (e) { 587 /* ignore exceptions setting the title. */ 588 } 589 590 var titleHadHierarchicalSeparators = false; 591 function wordCount(str) { 592 return str.split(/\s+/).length; 593 } 594 595 // If there's a separator in the title, first remove the final part 596 if (/ [\|\-\\\/>»] /.test(curTitle)) { 597 titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); 598 let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi)); 599 curTitle = origTitle.substring(0, allSeparators.pop().index); 600 601 // If the resulting title is too short, remove the first part instead: 602 if (wordCount(curTitle) < 3) { 603 curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, ""); 604 } 605 } else if (curTitle.includes(": ")) { 606 // Check if we have an heading containing this exact string, so we 607 // could assume it's the full title. 608 var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]); 609 var trimmedTitle = curTitle.trim(); 610 var match = this._someNode(headings, function (heading) { 611 return heading.textContent.trim() === trimmedTitle; 612 }); 613 614 // If we don't, let's extract the title out of the original title string. 
615 if (!match) { 616 curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); 617 618 // If the title is now too short, try the first colon instead: 619 if (wordCount(curTitle) < 3) { 620 curTitle = origTitle.substring(origTitle.indexOf(":") + 1); 621 // But if we have too many words before the colon there's something weird 622 // with the titles and the H tags so let's just use the original title instead 623 } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { 624 curTitle = origTitle; 625 } 626 } 627 } else if (curTitle.length > 150 || curTitle.length < 15) { 628 var hOnes = doc.getElementsByTagName("h1"); 629 630 if (hOnes.length === 1) { 631 curTitle = this._getInnerText(hOnes[0]); 632 } 633 } 634 635 curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); 636 // If we now have 4 words or fewer as our title, and either no 637 // 'hierarchical' separators (\, /, > or ») were found in the original 638 // title or we decreased the number of words by more than 1 word, use 639 // the original title. 640 var curTitleWordCount = wordCount(curTitle); 641 if ( 642 curTitleWordCount <= 4 && 643 (!titleHadHierarchicalSeparators || 644 curTitleWordCount != 645 wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1) 646 ) { 647 curTitle = origTitle; 648 } 649 650 return curTitle; 651 }, 652 653 /** 654 * Prepare the HTML document for readability to scrape it. 655 * This includes things like stripping javascript, CSS, and handling terrible markup. 656 * 657 * @return void 658 **/ 659 _prepDocument() { 660 var doc = this._doc; 661 662 // Remove all style tags in head 663 this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); 664 665 if (doc.body) { 666 this._replaceBrs(doc.body); 667 } 668 669 this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); 670 }, 671 672 /** 673 * Finds the next node, starting from the given node, and ignoring 674 * whitespace in between. 
If the given node is an element, the same node is 675 * returned. 676 */ 677 _nextNode(node) { 678 var next = node; 679 while ( 680 next && 681 next.nodeType != this.ELEMENT_NODE && 682 this.REGEXPS.whitespace.test(next.textContent) 683 ) { 684 next = next.nextSibling; 685 } 686 return next; 687 }, 688 689 /** 690 * Replaces 2 or more successive <br> elements with a single <p>. 691 * Whitespace between <br> elements are ignored. For example: 692 * <div>foo<br>bar<br> <br><br>abc</div> 693 * will become: 694 * <div>foo<br>bar<p>abc</p></div> 695 */ 696 _replaceBrs(elem) { 697 this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) { 698 var next = br.nextSibling; 699 700 // Whether 2 or more <br> elements have been found and replaced with a 701 // <p> block. 702 var replaced = false; 703 704 // If we find a <br> chain, remove the <br>s until we hit another node 705 // or non-whitespace. This leaves behind the first <br> in the chain 706 // (which will be replaced with a <p> later). 707 while ((next = this._nextNode(next)) && next.tagName == "BR") { 708 replaced = true; 709 var brSibling = next.nextSibling; 710 next.remove(); 711 next = brSibling; 712 } 713 714 // If we removed a <br> chain, replace the remaining <br> with a <p>. Add 715 // all sibling nodes as children of the <p> until we hit another <br> 716 // chain. 717 if (replaced) { 718 var p = this._doc.createElement("p"); 719 br.parentNode.replaceChild(p, br); 720 721 next = p.nextSibling; 722 while (next) { 723 // If we've hit another <br><br>, we're done adding children to this <p>. 724 if (next.tagName == "BR") { 725 var nextElem = this._nextNode(next.nextSibling); 726 if (nextElem && nextElem.tagName == "BR") { 727 break; 728 } 729 } 730 731 if (!this._isPhrasingContent(next)) { 732 break; 733 } 734 735 // Otherwise, make this node a child of the new <p>. 
736 var sibling = next.nextSibling; 737 p.appendChild(next); 738 next = sibling; 739 } 740 741 while (p.lastChild && this._isWhitespace(p.lastChild)) { 742 p.lastChild.remove(); 743 } 744 745 if (p.parentNode.tagName === "P") { 746 this._setNodeTag(p.parentNode, "DIV"); 747 } 748 } 749 }); 750 }, 751 752 _setNodeTag(node, tag) { 753 this.log("_setNodeTag", node, tag); 754 if (this._docJSDOMParser) { 755 node.localName = tag.toLowerCase(); 756 node.tagName = tag.toUpperCase(); 757 return node; 758 } 759 760 var replacement = node.ownerDocument.createElement(tag); 761 while (node.firstChild) { 762 replacement.appendChild(node.firstChild); 763 } 764 node.parentNode.replaceChild(replacement, node); 765 if (node.readability) { 766 replacement.readability = node.readability; 767 } 768 769 for (var i = 0; i < node.attributes.length; i++) { 770 replacement.setAttributeNode(node.attributes[i].cloneNode()); 771 } 772 return replacement; 773 }, 774 775 /** 776 * Prepare the article node for display. Clean out any inline styles, 777 * iframes, forms, strip extraneous <p> tags, etc. 778 * 779 * @param Element 780 * @return void 781 **/ 782 _prepArticle(articleContent) { 783 this._cleanStyles(articleContent); 784 785 // Check for data tables before we continue, to avoid removing items in 786 // those tables, which will often be isolated even though they're 787 // visually linked to other content-ful elements (text, images, etc.). 
788 this._markDataTables(articleContent); 789 790 this._fixLazyImages(articleContent); 791 792 // Clean out junk from the article content 793 this._cleanConditionally(articleContent, "form"); 794 this._cleanConditionally(articleContent, "fieldset"); 795 this._clean(articleContent, "object"); 796 this._clean(articleContent, "embed"); 797 this._clean(articleContent, "footer"); 798 this._clean(articleContent, "link"); 799 this._clean(articleContent, "aside"); 800 801 // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, 802 // which means we don't remove the top candidates even they have "share". 803 804 var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; 805 806 this._forEachNode(articleContent.children, function (topCandidate) { 807 this._cleanMatchedNodes(topCandidate, function (node, matchString) { 808 return ( 809 this.REGEXPS.shareElements.test(matchString) && 810 node.textContent.length < shareElementThreshold 811 ); 812 }); 813 }); 814 815 this._clean(articleContent, "iframe"); 816 this._clean(articleContent, "input"); 817 this._clean(articleContent, "textarea"); 818 this._clean(articleContent, "select"); 819 this._clean(articleContent, "button"); 820 this._cleanHeaders(articleContent); 821 822 // Do these last as the previous stuff may have removed junk 823 // that will affect these 824 this._cleanConditionally(articleContent, "table"); 825 this._cleanConditionally(articleContent, "ul"); 826 this._cleanConditionally(articleContent, "div"); 827 828 // replace H1 with H2 as H1 should be only title that is displayed separately 829 this._replaceNodeTags( 830 this._getAllNodesWithTag(articleContent, ["h1"]), 831 "h2" 832 ); 833 834 // Remove extra paragraphs 835 this._removeNodes( 836 this._getAllNodesWithTag(articleContent, ["p"]), 837 function (paragraph) { 838 // At this point, nasty iframes have been removed; only embedded video 839 // ones remain. 
840 var contentElementCount = this._getAllNodesWithTag(paragraph, [ 841 "img", 842 "embed", 843 "object", 844 "iframe", 845 ]).length; 846 return ( 847 contentElementCount === 0 && !this._getInnerText(paragraph, false) 848 ); 849 } 850 ); 851 852 this._forEachNode( 853 this._getAllNodesWithTag(articleContent, ["br"]), 854 function (br) { 855 var next = this._nextNode(br.nextSibling); 856 if (next && next.tagName == "P") { 857 br.remove(); 858 } 859 } 860 ); 861 862 // Remove single-cell tables 863 this._forEachNode( 864 this._getAllNodesWithTag(articleContent, ["table"]), 865 function (table) { 866 var tbody = this._hasSingleTagInsideElement(table, "TBODY") 867 ? table.firstElementChild 868 : table; 869 if (this._hasSingleTagInsideElement(tbody, "TR")) { 870 var row = tbody.firstElementChild; 871 if (this._hasSingleTagInsideElement(row, "TD")) { 872 var cell = row.firstElementChild; 873 cell = this._setNodeTag( 874 cell, 875 this._everyNode(cell.childNodes, this._isPhrasingContent) 876 ? "P" 877 : "DIV" 878 ); 879 table.parentNode.replaceChild(cell, table); 880 } 881 } 882 } 883 ); 884 }, 885 886 /** 887 * Initialize a node with the readability object. Also checks the 888 * className/id for special names to add to its score. 
889 * 890 * @param Element 891 * @return void 892 **/ 893 _initializeNode(node) { 894 node.readability = { contentScore: 0 }; 895 896 switch (node.tagName) { 897 case "DIV": 898 node.readability.contentScore += 5; 899 break; 900 901 case "PRE": 902 case "TD": 903 case "BLOCKQUOTE": 904 node.readability.contentScore += 3; 905 break; 906 907 case "ADDRESS": 908 case "OL": 909 case "UL": 910 case "DL": 911 case "DD": 912 case "DT": 913 case "LI": 914 case "FORM": 915 node.readability.contentScore -= 3; 916 break; 917 918 case "H1": 919 case "H2": 920 case "H3": 921 case "H4": 922 case "H5": 923 case "H6": 924 case "TH": 925 node.readability.contentScore -= 5; 926 break; 927 } 928 929 node.readability.contentScore += this._getClassWeight(node); 930 }, 931 932 _removeAndGetNext(node) { 933 var nextNode = this._getNextNode(node, true); 934 node.remove(); 935 return nextNode; 936 }, 937 938 /** 939 * Traverse the DOM from node to node, starting at the node passed in. 940 * Pass true for the second parameter to indicate this node itself 941 * (and its kids) are going away, and we want the next node over. 942 * 943 * Calling this in a loop will traverse the DOM depth-first. 944 * 945 * @param {Element} node 946 * @param {boolean} ignoreSelfAndKids 947 * @return {Element} 948 */ 949 _getNextNode(node, ignoreSelfAndKids) { 950 // First check for kids if those aren't being ignored 951 if (!ignoreSelfAndKids && node.firstElementChild) { 952 return node.firstElementChild; 953 } 954 // Then for siblings... 955 if (node.nextElementSibling) { 956 return node.nextElementSibling; 957 } 958 // And finally, move up the parent chain *and* find a sibling 959 // (because this is depth-first traversal, we will have already 960 // seen the parent nodes themselves). 
961 do { 962 node = node.parentNode; 963 } while (node && !node.nextElementSibling); 964 return node && node.nextElementSibling; 965 }, 966 967 // compares second text to first one 968 // 1 = same text, 0 = completely different text 969 // works the way that it splits both texts into words and then finds words that are unique in second text 970 // the result is given by the lower length of unique parts 971 _textSimilarity(textA, textB) { 972 var tokensA = textA 973 .toLowerCase() 974 .split(this.REGEXPS.tokenize) 975 .filter(Boolean); 976 var tokensB = textB 977 .toLowerCase() 978 .split(this.REGEXPS.tokenize) 979 .filter(Boolean); 980 if (!tokensA.length || !tokensB.length) { 981 return 0; 982 } 983 var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); 984 var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; 985 return 1 - distanceB; 986 }, 987 988 /** 989 * Checks whether an element node contains a valid byline 990 * 991 * @param node {Element} 992 * @param matchString {string} 993 * @return boolean 994 */ 995 _isValidByline(node, matchString) { 996 var rel = node.getAttribute("rel"); 997 var itemprop = node.getAttribute("itemprop"); 998 var bylineLength = node.textContent.trim().length; 999 1000 return ( 1001 (rel === "author" || 1002 (itemprop && itemprop.includes("author")) || 1003 this.REGEXPS.byline.test(matchString)) && 1004 !!bylineLength && 1005 bylineLength < 100 1006 ); 1007 }, 1008 1009 _getNodeAncestors(node, maxDepth) { 1010 maxDepth = maxDepth || 0; 1011 var i = 0, 1012 ancestors = []; 1013 while (node.parentNode) { 1014 ancestors.push(node.parentNode); 1015 if (maxDepth && ++i === maxDepth) { 1016 break; 1017 } 1018 node = node.parentNode; 1019 } 1020 return ancestors; 1021 }, 1022 1023 /*** 1024 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 1025 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 
*
 * The extraction is retried in a loop: when the extracted text is shorter
 * than the configured character threshold, the page HTML is restored from a
 * cached copy, one of the cleaning flags is switched off and the whole
 * pass runs again.
 *
 * @param page a document to run upon. Needs to be a full document, complete with body.
 *        When falsy, the body of the parsed document is used.
 * @return Element containing the article, or null when there is no body or
 *         no attempt produced any text
 **/
/* eslint-disable-next-line complexity */
_grabArticle(page) {
  this.log("**** grabArticle ****");
  var doc = this._doc;
  // NOTE(review): only an explicit null disables paging here; an omitted
  // (undefined) argument still yields isPaging === true.
  var isPaging = page !== null;
  page = page ? page : this._doc.body;

  // We can't grab an article if we don't have a page!
  if (!page) {
    this.log("No body found in document. Abort.");
    return null;
  }

  // Snapshot of the page markup, used to undo our DOM changes before a retry.
  var pageCacheHtml = page.innerHTML;

  while (true) {
    this.log("Starting grabArticle loop");
    var stripUnlikelyCandidates = this._flagIsActive(
      this.FLAG_STRIP_UNLIKELYS
    );

    // First, node prepping. Trash nodes that look cruddy (like ones with the
    // class name "comment", etc), and turn divs into P tags where they have been
    // used inappropriately (as in, where they contain no other block level elements.)
    var elementsToScore = [];
    var node = this._doc.documentElement;

    // Only the first header that duplicates the title is removed per pass.
    let shouldRemoveTitleHeader = true;

    while (node) {
      if (node.tagName === "HTML") {
        this._articleLang = node.getAttribute("lang");
      }

      var matchString = node.className + " " + node.id;

      if (!this._isProbablyVisible(node)) {
        this.log("Removing hidden node - " + matchString);
        node = this._removeAndGetNext(node);
        continue;
      }

      // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
      if (
        node.getAttribute("aria-modal") == "true" &&
        node.getAttribute("role") == "dialog"
      ) {
        node = this._removeAndGetNext(node);
        continue;
      }

      // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
      if (
        !this._articleByline &&
        !this._metadata.byline &&
        this._isValidByline(node, matchString)
      ) {
        // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
        var endOfSearchMarkerNode = this._getNextNode(node, true);
        var next = this._getNextNode(node);
        var itemPropNameNode = null;
        while (next && next != endOfSearchMarkerNode) {
          var itemprop = next.getAttribute("itemprop");
          if (itemprop && itemprop.includes("name")) {
            itemPropNameNode = next;
            break;
          } else {
            next = this._getNextNode(next);
          }
        }
        this._articleByline = (itemPropNameNode ?? node).textContent.trim();
        node = this._removeAndGetNext(node);
        continue;
      }

      if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
        this.log(
          "Removing header: ",
          node.textContent.trim(),
          this._articleTitle.trim()
        );
        shouldRemoveTitleHeader = false;
        node = this._removeAndGetNext(node);
        continue;
      }

      // Remove unlikely candidates
      if (stripUnlikelyCandidates) {
        if (
          this.REGEXPS.unlikelyCandidates.test(matchString) &&
          !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
          !this._hasAncestorTag(node, "table") &&
          !this._hasAncestorTag(node, "code") &&
          node.tagName !== "BODY" &&
          node.tagName !== "A"
        ) {
          this.log("Removing unlikely candidate - " + matchString);
          node = this._removeAndGetNext(node);
          continue;
        }

        if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
          this.log(
            "Removing content with role " +
              node.getAttribute("role") +
              " - " +
              matchString
          );
          node = this._removeAndGetNext(node);
          continue;
        }
      }

      // Remove DIV, SECTION, HEADER and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
      if (
        (node.tagName === "DIV" ||
          node.tagName === "SECTION" ||
          node.tagName === "HEADER" ||
          node.tagName === "H1" ||
          node.tagName === "H2" ||
          node.tagName === "H3" ||
          node.tagName === "H4" ||
          node.tagName === "H5" ||
          node.tagName === "H6") &&
        this._isElementWithoutContent(node)
      ) {
        node = this._removeAndGetNext(node);
        continue;
      }

      if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
        elementsToScore.push(node);
      }

      // Turn all divs that don't have children block level elements into p's
      if (node.tagName === "DIV") {
        // Put phrasing content into paragraphs.
        var p = null;
        var childNode = node.firstChild;
        while (childNode) {
          // Remember the sibling first: childNode may be moved into a new <p> below.
          var nextSibling = childNode.nextSibling;
          if (this._isPhrasingContent(childNode)) {
            if (p !== null) {
              p.appendChild(childNode);
            } else if (!this._isWhitespace(childNode)) {
              p = doc.createElement("p");
              node.replaceChild(p, childNode);
              p.appendChild(childNode);
            }
          } else if (p !== null) {
            // A non-phrasing child closes the current paragraph; drop its trailing whitespace.
            while (p.lastChild && this._isWhitespace(p.lastChild)) {
              p.lastChild.remove();
            }
            p = null;
          }
          childNode = nextSibling;
        }

        // Sites like http://mobile.slate.com enclose each paragraph with a DIV
        // element. DIVs with only a P element inside and no text content can be
        // safely converted into plain P elements to avoid confusing the scoring
        // algorithm with DIVs which are, in practice, paragraphs.
        if (
          this._hasSingleTagInsideElement(node, "P") &&
          this._getLinkDensity(node) < 0.25
        ) {
          var newNode = node.children[0];
          node.parentNode.replaceChild(newNode, node);
          node = newNode;
          elementsToScore.push(node);
        } else if (!this._hasChildBlockElement(node)) {
          node = this._setNodeTag(node, "P");
          elementsToScore.push(node);
        }
      }
      node = this._getNextNode(node);
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    var candidates = [];
    this._forEachNode(elementsToScore, function (elementToScore) {
      if (
        !elementToScore.parentNode ||
        typeof elementToScore.parentNode.tagName === "undefined"
      ) {
        return;
      }

      // If this paragraph is less than 25 characters, don't even count it.
      var innerText = this._getInnerText(elementToScore);
      if (innerText.length < 25) {
        return;
      }

      // Exclude nodes with no ancestor.
      var ancestors = this._getNodeAncestors(elementToScore, 5);
      if (ancestors.length === 0) {
        return;
      }

      var contentScore = 0;

      // Add a point for the paragraph itself as a base.
      contentScore += 1;

      // Add points for any commas within this paragraph.
      // (split() yields one more segment than there are separators.)
      contentScore += innerText.split(this.REGEXPS.commas).length;

      // For every 100 characters in this paragraph, add another point. Up to 3 points.
      contentScore += Math.min(Math.floor(innerText.length / 100), 3);

      // Initialize and score ancestors.
      this._forEachNode(ancestors, function (ancestor, level) {
        if (
          !ancestor.tagName ||
          !ancestor.parentNode ||
          typeof ancestor.parentNode.tagName === "undefined"
        ) {
          return;
        }

        if (typeof ancestor.readability === "undefined") {
          this._initializeNode(ancestor);
          candidates.push(ancestor);
        }

        // Node score divider:
        // - parent:             1 (no division)
        // - grandparent:        2
        // - great grandparent+: ancestor level * 3
        if (level === 0) {
          var scoreDivider = 1;
        } else if (level === 1) {
          scoreDivider = 2;
        } else {
          scoreDivider = level * 3;
        }
        ancestor.readability.contentScore += contentScore / scoreDivider;
      });
    });

    // After we've calculated scores, loop through all of the possible
    // candidate nodes we found and find the one with the highest score.
    var topCandidates = [];
    for (var c = 0, cl = candidates.length; c < cl; c += 1) {
      var candidate = candidates[c];

      // Scale the final candidates score based on link density. Good content
      // should have a relatively small link density (5% or less) and be mostly
      // unaffected by this operation.
      var candidateScore =
        candidate.readability.contentScore *
        (1 - this._getLinkDensity(candidate));
      candidate.readability.contentScore = candidateScore;

      this.log("Candidate:", candidate, "with score " + candidateScore);

      // Insert into topCandidates, which is kept sorted by descending score
      // and capped at this._nbTopCandidates entries.
      for (var t = 0; t < this._nbTopCandidates; t++) {
        var aTopCandidate = topCandidates[t];

        if (
          !aTopCandidate ||
          candidateScore > aTopCandidate.readability.contentScore
        ) {
          topCandidates.splice(t, 0, candidate);
          if (topCandidates.length > this._nbTopCandidates) {
            topCandidates.pop();
          }
          break;
        }
      }
    }

    var topCandidate = topCandidates[0] || null;
    var neededToCreateTopCandidate = false;
    var parentOfTopCandidate;

    // If we still have no top candidate, just use the body as a last resort.
    // We also have to copy the body node so it is something we can modify.
    if (topCandidate === null || topCandidate.tagName === "BODY") {
      // Move all of the page's children into topCandidate
      topCandidate = doc.createElement("DIV");
      neededToCreateTopCandidate = true;
      // Move everything (not just elements, also text nodes etc.) into the container
      // so we even include text directly in the body:
      while (page.firstChild) {
        this.log("Moving child out:", page.firstChild);
        topCandidate.appendChild(page.firstChild);
      }

      page.appendChild(topCandidate);

      this._initializeNode(topCandidate);
    } else if (topCandidate) {
      // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
      // and whose scores are quite close to the current `topCandidate` node.
      var alternativeCandidateAncestors = [];
      for (var i = 1; i < topCandidates.length; i++) {
        if (
          topCandidates[i].readability.contentScore /
            topCandidate.readability.contentScore >=
          0.75
        ) {
          alternativeCandidateAncestors.push(
            this._getNodeAncestors(topCandidates[i])
          );
        }
      }
      var MINIMUM_TOPCANDIDATES = 3;
      if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
        parentOfTopCandidate = topCandidate.parentNode;
        while (parentOfTopCandidate.tagName !== "BODY") {
          // Count how many of the alternative candidates sit below this ancestor.
          var listsContainingThisAncestor = 0;
          for (
            var ancestorIndex = 0;
            ancestorIndex < alternativeCandidateAncestors.length &&
            listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
            ancestorIndex++
          ) {
            listsContainingThisAncestor += Number(
              alternativeCandidateAncestors[ancestorIndex].includes(
                parentOfTopCandidate
              )
            );
          }
          if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
            topCandidate = parentOfTopCandidate;
            break;
          }
          parentOfTopCandidate = parentOfTopCandidate.parentNode;
        }
      }
      if (!topCandidate.readability) {
        this._initializeNode(topCandidate);
      }

      // Because of our bonus system, parents of candidates might have scores
      // themselves. They get half of the node. There won't be nodes with higher
      // scores than our topCandidate, but if we see the score going *up* in the first
      // few steps up the tree, that's a decent sign that there might be more content
      // lurking in other places that we want to unify in. The sibling stuff
      // below does some of that - but only if we've looked high enough up the DOM
      // tree.
      parentOfTopCandidate = topCandidate.parentNode;
      var lastScore = topCandidate.readability.contentScore;
      // The scores shouldn't get too low.
      var scoreThreshold = lastScore / 3;
      while (parentOfTopCandidate.tagName !== "BODY") {
        // Ancestors that were never scored are skipped, not treated as score 0.
        if (!parentOfTopCandidate.readability) {
          parentOfTopCandidate = parentOfTopCandidate.parentNode;
          continue;
        }
        var parentScore = parentOfTopCandidate.readability.contentScore;
        if (parentScore < scoreThreshold) {
          break;
        }
        if (parentScore > lastScore) {
          // Alright! We found a better parent to use.
          topCandidate = parentOfTopCandidate;
          break;
        }
        lastScore = parentOfTopCandidate.readability.contentScore;
        parentOfTopCandidate = parentOfTopCandidate.parentNode;
      }

      // If the top candidate is the only child, use parent instead. This will help sibling
      // joining logic when adjacent content is actually located in parent's sibling node.
      parentOfTopCandidate = topCandidate.parentNode;
      while (
        parentOfTopCandidate.tagName != "BODY" &&
        parentOfTopCandidate.children.length == 1
      ) {
        topCandidate = parentOfTopCandidate;
        parentOfTopCandidate = topCandidate.parentNode;
      }
      if (!topCandidate.readability) {
        this._initializeNode(topCandidate);
      }
    }

    // Now that we have the top candidate, look through its siblings for content
    // that might also be related. Things like preambles, content split by ads
    // that we removed, etc.
    var articleContent = doc.createElement("DIV");
    if (isPaging) {
      articleContent.id = "readability-content";
    }

    var siblingScoreThreshold = Math.max(
      10,
      topCandidate.readability.contentScore * 0.2
    );
    // Keep potential top candidate's parent node to try to get text direction of it later.
    parentOfTopCandidate = topCandidate.parentNode;
    var siblings = parentOfTopCandidate.children;

    for (var s = 0, sl = siblings.length; s < sl; s++) {
      var sibling = siblings[s];
      var append = false;

      this.log(
        "Looking at sibling node:",
        sibling,
        sibling.readability
          ? "with score " + sibling.readability.contentScore
          : ""
      );
      this.log(
        "Sibling has score",
        sibling.readability ? sibling.readability.contentScore : "Unknown"
      );

      if (sibling === topCandidate) {
        append = true;
      } else {
        var contentBonus = 0;

        // Give a bonus if sibling nodes and top candidates have the exact same classname
        if (
          sibling.className === topCandidate.className &&
          topCandidate.className !== ""
        ) {
          contentBonus += topCandidate.readability.contentScore * 0.2;
        }

        if (
          sibling.readability &&
          sibling.readability.contentScore + contentBonus >=
            siblingScoreThreshold
        ) {
          append = true;
        } else if (sibling.nodeName === "P") {
          var linkDensity = this._getLinkDensity(sibling);
          var nodeContent = this._getInnerText(sibling);
          var nodeLength = nodeContent.length;

          // Long paragraphs with few links are kept; short ones only when they
          // contain no links at all and look like a sentence (". " or a final ".").
          if (nodeLength > 80 && linkDensity < 0.25) {
            append = true;
          } else if (
            nodeLength < 80 &&
            nodeLength > 0 &&
            linkDensity === 0 &&
            nodeContent.search(/\.( |$)/) !== -1
          ) {
            append = true;
          }
        }
      }

      if (append) {
        this.log("Appending node:", sibling);

        if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
          // We have a node that isn't a common block level element, like a form or td tag.
          // Turn it into a div so it doesn't get filtered out later by accident.
          this.log("Altering sibling:", sibling, "to div.");

          sibling = this._setNodeTag(sibling, "DIV");
        }

        articleContent.appendChild(sibling);
        // Fetch children again to make it compatible
        // with DOM parsers without live collection support.
        siblings = parentOfTopCandidate.children;
        // siblings is a reference to the children array, and
        // sibling is removed from the array when we call appendChild().
        // As a result, we must revisit this index since the nodes
        // have been shifted.
        s -= 1;
        sl -= 1;
      }
    }

    if (this._debug) {
      this.log("Article content pre-prep: " + articleContent.innerHTML);
    }
    // So we have all of the content that we need. Now we clean it up for presentation.
    this._prepArticle(articleContent);
    if (this._debug) {
      this.log("Article content post-prep: " + articleContent.innerHTML);
    }

    if (neededToCreateTopCandidate) {
      // We already created a fake div thing, and there wouldn't have been any siblings left
      // for the previous loop, so there's no point trying to create a new div, and then
      // move all the children over. Just assign IDs and class names here. No need to append
      // because that already happened anyway.
      topCandidate.id = "readability-page-1";
      topCandidate.className = "page";
    } else {
      var div = doc.createElement("DIV");
      div.id = "readability-page-1";
      div.className = "page";
      while (articleContent.firstChild) {
        div.appendChild(articleContent.firstChild);
      }
      articleContent.appendChild(div);
    }

    if (this._debug) {
      this.log("Article content after paging: " + articleContent.innerHTML);
    }

    var parseSuccessful = true;

    // Now that we've gone through the full algorithm, check to see if
    // we got any meaningful content. If we didn't, we may need to re-run
    // grabArticle with different flags set. This gives us a higher likelihood of
    // finding the content, and the sieve approach gives us a higher likelihood of
    // finding the -right- content.
    var textLength = this._getInnerText(articleContent, true).length;
    if (textLength < this._charThreshold) {
      parseSuccessful = false;
      // Undo every DOM change of this pass before trying again.
      // eslint-disable-next-line no-unsanitized/property
      page.innerHTML = pageCacheHtml;

      this._attempts.push({
        articleContent,
        textLength,
      });

      // Relax one flag per retry, in this fixed order.
      if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
        this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
      } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
        this._removeFlag(this.FLAG_WEIGHT_CLASSES);
      } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
        this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
      } else {
        // No luck after removing flags, just return the longest text we found during the different loops
        this._attempts.sort(function (a, b) {
          return b.textLength - a.textLength;
        });

        // But first check if we actually have something
        if (!this._attempts[0].textLength) {
          return null;
        }

        articleContent = this._attempts[0].articleContent;
        parseSuccessful = true;
      }
    }

    if (parseSuccessful) {
      // Find out text direction from ancestors of final top candidate.
      var ancestors = [parentOfTopCandidate, topCandidate].concat(
        this._getNodeAncestors(parentOfTopCandidate)
      );
      this._someNode(ancestors, function (ancestor) {
        if (!ancestor.tagName) {
          return false;
        }
        var articleDir = ancestor.getAttribute("dir");
        if (articleDir) {
          this._articleDir = articleDir;
          return true;
        }
        return false;
      });
      return articleContent;
    }
  }
},

/**
 * Converts some of the common HTML entities in string to their corresponding characters.
 *
 * @param str {string} - a string to unescape.
1603 * @return string without HTML entity. 1604 */ 1605 _unescapeHtmlEntities(str) { 1606 if (!str) { 1607 return str; 1608 } 1609 1610 var htmlEscapeMap = this.HTML_ESCAPE_MAP; 1611 return str 1612 .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) { 1613 return htmlEscapeMap[tag]; 1614 }) 1615 .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) { 1616 var num = parseInt(hex || numStr, hex ? 16 : 10); 1617 1618 // these character references are replaced by a conforming HTML parser 1619 if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) { 1620 num = 0xfffd; 1621 } 1622 1623 return String.fromCodePoint(num); 1624 }); 1625 }, 1626 1627 /** 1628 * Try to extract metadata from JSON-LD object. 1629 * For now, only Schema.org objects of type Article or its subtypes are supported. 1630 * @return Object with any metadata that could be extracted (possibly none) 1631 */ 1632 _getJSONLD(doc) { 1633 var scripts = this._getAllNodesWithTag(doc, ["script"]); 1634 1635 var metadata; 1636 1637 this._forEachNode(scripts, function (jsonLdElement) { 1638 if ( 1639 !metadata && 1640 jsonLdElement.getAttribute("type") === "application/ld+json" 1641 ) { 1642 try { 1643 // Strip CDATA markers if present 1644 var content = jsonLdElement.textContent.replace( 1645 /^\s*<!\[CDATA\[|\]\]>\s*$/g, 1646 "" 1647 ); 1648 var parsed = JSON.parse(content); 1649 1650 if (Array.isArray(parsed)) { 1651 parsed = parsed.find(it => { 1652 return ( 1653 it["@type"] && 1654 it["@type"].match(this.REGEXPS.jsonLdArticleTypes) 1655 ); 1656 }); 1657 if (!parsed) { 1658 return; 1659 } 1660 } 1661 1662 var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/; 1663 var matches = 1664 (typeof parsed["@context"] === "string" && 1665 parsed["@context"].match(schemaDotOrgRegex)) || 1666 (typeof parsed["@context"] === "object" && 1667 typeof parsed["@context"]["@vocab"] == "string" && 1668 parsed["@context"]["@vocab"].match(schemaDotOrgRegex)); 1669 1670 if (!matches) { 1671 return; 
1672 } 1673 1674 if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { 1675 parsed = parsed["@graph"].find(it => { 1676 return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes); 1677 }); 1678 } 1679 1680 if ( 1681 !parsed || 1682 !parsed["@type"] || 1683 !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) 1684 ) { 1685 return; 1686 } 1687 1688 metadata = {}; 1689 1690 if ( 1691 typeof parsed.name === "string" && 1692 typeof parsed.headline === "string" && 1693 parsed.name !== parsed.headline 1694 ) { 1695 // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz 1696 // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either 1697 // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. 1698 1699 var title = this._getArticleTitle(); 1700 var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; 1701 var headlineMatches = 1702 this._textSimilarity(parsed.headline, title) > 0.75; 1703 1704 if (headlineMatches && !nameMatches) { 1705 metadata.title = parsed.headline; 1706 } else { 1707 metadata.title = parsed.name; 1708 } 1709 } else if (typeof parsed.name === "string") { 1710 metadata.title = parsed.name.trim(); 1711 } else if (typeof parsed.headline === "string") { 1712 metadata.title = parsed.headline.trim(); 1713 } 1714 if (parsed.author) { 1715 if (typeof parsed.author.name === "string") { 1716 metadata.byline = parsed.author.name.trim(); 1717 } else if ( 1718 Array.isArray(parsed.author) && 1719 parsed.author[0] && 1720 typeof parsed.author[0].name === "string" 1721 ) { 1722 metadata.byline = parsed.author 1723 .filter(function (author) { 1724 return author && typeof author.name === "string"; 1725 }) 1726 .map(function (author) { 1727 return author.name.trim(); 1728 }) 1729 .join(", "); 1730 } 1731 } 1732 if (typeof parsed.description === 
"string") { 1733 metadata.excerpt = parsed.description.trim(); 1734 } 1735 if (parsed.publisher && typeof parsed.publisher.name === "string") { 1736 metadata.siteName = parsed.publisher.name.trim(); 1737 } 1738 if (typeof parsed.datePublished === "string") { 1739 metadata.datePublished = parsed.datePublished.trim(); 1740 } 1741 } catch (err) { 1742 this.log(err.message); 1743 } 1744 } 1745 }); 1746 return metadata ? metadata : {}; 1747 }, 1748 1749 /** 1750 * Attempts to get excerpt and byline metadata for the article. 1751 * 1752 * @param {Object} jsonld — object containing any metadata that 1753 * could be extracted from JSON-LD object. 1754 * 1755 * @return Object with optional "excerpt" and "byline" properties 1756 */ 1757 _getArticleMetadata(jsonld) { 1758 var metadata = {}; 1759 var values = {}; 1760 var metaElements = this._doc.getElementsByTagName("meta"); 1761 1762 // property is a space-separated list of values 1763 var propertyPattern = 1764 /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi; 1765 1766 // name is a single value 1767 var namePattern = 1768 /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i; 1769 1770 // Find description tags. 1771 this._forEachNode(metaElements, function (element) { 1772 var elementName = element.getAttribute("name"); 1773 var elementProperty = element.getAttribute("property"); 1774 var content = element.getAttribute("content"); 1775 if (!content) { 1776 return; 1777 } 1778 var matches = null; 1779 var name = null; 1780 1781 if (elementProperty) { 1782 matches = elementProperty.match(propertyPattern); 1783 if (matches) { 1784 // Convert to lowercase, and remove any whitespace 1785 // so we can match below. 
1786 name = matches[0].toLowerCase().replace(/\s/g, ""); 1787 // multiple authors 1788 values[name] = content.trim(); 1789 } 1790 } 1791 if (!matches && elementName && namePattern.test(elementName)) { 1792 name = elementName; 1793 if (content) { 1794 // Convert to lowercase, remove any whitespace, and convert dots 1795 // to colons so we can match below. 1796 name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); 1797 values[name] = content.trim(); 1798 } 1799 } 1800 }); 1801 1802 // get title 1803 metadata.title = 1804 jsonld.title || 1805 values["dc:title"] || 1806 values["dcterm:title"] || 1807 values["og:title"] || 1808 values["weibo:article:title"] || 1809 values["weibo:webpage:title"] || 1810 values.title || 1811 values["twitter:title"] || 1812 values["parsely-title"]; 1813 1814 if (!metadata.title) { 1815 metadata.title = this._getArticleTitle(); 1816 } 1817 1818 const articleAuthor = 1819 typeof values["article:author"] === "string" && 1820 !this._isUrl(values["article:author"]) 1821 ? 
values["article:author"] 1822 : undefined; 1823 1824 // get author 1825 metadata.byline = 1826 jsonld.byline || 1827 values["dc:creator"] || 1828 values["dcterm:creator"] || 1829 values.author || 1830 values["parsely-author"] || 1831 articleAuthor; 1832 1833 // get description 1834 metadata.excerpt = 1835 jsonld.excerpt || 1836 values["dc:description"] || 1837 values["dcterm:description"] || 1838 values["og:description"] || 1839 values["weibo:article:description"] || 1840 values["weibo:webpage:description"] || 1841 values.description || 1842 values["twitter:description"]; 1843 1844 // get site name 1845 metadata.siteName = jsonld.siteName || values["og:site_name"]; 1846 1847 // get article published time 1848 metadata.publishedTime = 1849 jsonld.datePublished || 1850 values["article:published_time"] || 1851 values["parsely-pub-date"] || 1852 null; 1853 1854 // in many sites the meta value is escaped with HTML entities, 1855 // so here we need to unescape it 1856 metadata.title = this._unescapeHtmlEntities(metadata.title); 1857 metadata.byline = this._unescapeHtmlEntities(metadata.byline); 1858 metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); 1859 metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); 1860 metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime); 1861 1862 return metadata; 1863 }, 1864 1865 /** 1866 * Check if node is image, or if node contains exactly only one image 1867 * whether as a direct child or as its descendants. 1868 * 1869 * @param Element 1870 **/ 1871 _isSingleImage(node) { 1872 while (node) { 1873 if (node.tagName === "IMG") { 1874 return true; 1875 } 1876 if (node.children.length !== 1 || node.textContent.trim() !== "") { 1877 return false; 1878 } 1879 node = node.children[0]; 1880 } 1881 return false; 1882 }, 1883 1884 /** 1885 * Find all <noscript> that are located after <img> nodes, and which contain only one 1886 * <img> element. 
Replace the first image with the image from inside the <noscript> tag, 1887 * and remove the <noscript> tag. This improves the quality of the images we use on 1888 * some sites (e.g. Medium). 1889 * 1890 * @param Element 1891 **/ 1892 _unwrapNoscriptImages(doc) { 1893 // Find img without source or attributes that might contains image, and remove it. 1894 // This is done to prevent a placeholder img is replaced by img from noscript in next step. 1895 var imgs = Array.from(doc.getElementsByTagName("img")); 1896 this._forEachNode(imgs, function (img) { 1897 for (var i = 0; i < img.attributes.length; i++) { 1898 var attr = img.attributes[i]; 1899 switch (attr.name) { 1900 case "src": 1901 case "srcset": 1902 case "data-src": 1903 case "data-srcset": 1904 return; 1905 } 1906 1907 if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { 1908 return; 1909 } 1910 } 1911 1912 img.remove(); 1913 }); 1914 1915 // Next find noscript and try to extract its image 1916 var noscripts = Array.from(doc.getElementsByTagName("noscript")); 1917 this._forEachNode(noscripts, function (noscript) { 1918 // Parse content of noscript and make sure it only contains image 1919 if (!this._isSingleImage(noscript)) { 1920 return; 1921 } 1922 var tmp = doc.createElement("div"); 1923 // We're running in the document context, and using unmodified 1924 // document contents, so doing this should be safe. 1925 // (Also we heavily discourage people from allowing script to 1926 // run at all in this document...) 1927 // eslint-disable-next-line no-unsanitized/property 1928 tmp.innerHTML = noscript.innerHTML; 1929 1930 // If noscript has previous sibling and it only contains image, 1931 // replace it with noscript content. However we also keep old 1932 // attributes that might contains image. 
1933 var prevElement = noscript.previousElementSibling; 1934 if (prevElement && this._isSingleImage(prevElement)) { 1935 var prevImg = prevElement; 1936 if (prevImg.tagName !== "IMG") { 1937 prevImg = prevElement.getElementsByTagName("img")[0]; 1938 } 1939 1940 var newImg = tmp.getElementsByTagName("img")[0]; 1941 for (var i = 0; i < prevImg.attributes.length; i++) { 1942 var attr = prevImg.attributes[i]; 1943 if (attr.value === "") { 1944 continue; 1945 } 1946 1947 if ( 1948 attr.name === "src" || 1949 attr.name === "srcset" || 1950 /\.(jpg|jpeg|png|webp)/i.test(attr.value) 1951 ) { 1952 if (newImg.getAttribute(attr.name) === attr.value) { 1953 continue; 1954 } 1955 1956 var attrName = attr.name; 1957 if (newImg.hasAttribute(attrName)) { 1958 attrName = "data-old-" + attrName; 1959 } 1960 1961 newImg.setAttribute(attrName, attr.value); 1962 } 1963 } 1964 1965 noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement); 1966 } 1967 }); 1968 }, 1969 1970 /** 1971 * Removes script tags from the document. 1972 * 1973 * @param Element 1974 **/ 1975 _removeScripts(doc) { 1976 this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"])); 1977 }, 1978 1979 /** 1980 * Check if this node has only whitespace and a single element with given tag 1981 * Returns false if the DIV node contains non-empty text nodes 1982 * or if it contains no element with given tag or more than 1 element. 
1983 * 1984 * @param Element 1985 * @param string tag of child element 1986 **/ 1987 _hasSingleTagInsideElement(element, tag) { 1988 // There should be exactly 1 element child with given tag 1989 if (element.children.length != 1 || element.children[0].tagName !== tag) { 1990 return false; 1991 } 1992 1993 // And there should be no text nodes with real content 1994 return !this._someNode(element.childNodes, function (node) { 1995 return ( 1996 node.nodeType === this.TEXT_NODE && 1997 this.REGEXPS.hasContent.test(node.textContent) 1998 ); 1999 }); 2000 }, 2001 2002 _isElementWithoutContent(node) { 2003 return ( 2004 node.nodeType === this.ELEMENT_NODE && 2005 !node.textContent.trim().length && 2006 (!node.children.length || 2007 node.children.length == 2008 node.getElementsByTagName("br").length + 2009 node.getElementsByTagName("hr").length) 2010 ); 2011 }, 2012 2013 /** 2014 * Determine whether element has any children block level elements. 2015 * 2016 * @param Element 2017 */ 2018 _hasChildBlockElement(element) { 2019 return this._someNode(element.childNodes, function (node) { 2020 return ( 2021 this.DIV_TO_P_ELEMS.has(node.tagName) || 2022 this._hasChildBlockElement(node) 2023 ); 2024 }); 2025 }, 2026 2027 /*** 2028 * Determine if a node qualifies as phrasing content. 
2029 * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content 2030 **/ 2031 _isPhrasingContent(node) { 2032 return ( 2033 node.nodeType === this.TEXT_NODE || 2034 this.PHRASING_ELEMS.includes(node.tagName) || 2035 ((node.tagName === "A" || 2036 node.tagName === "DEL" || 2037 node.tagName === "INS") && 2038 this._everyNode(node.childNodes, this._isPhrasingContent)) 2039 ); 2040 }, 2041 2042 _isWhitespace(node) { 2043 return ( 2044 (node.nodeType === this.TEXT_NODE && 2045 node.textContent.trim().length === 0) || 2046 (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR") 2047 ); 2048 }, 2049 2050 /** 2051 * Get the inner text of a node - cross browser compatibly. 2052 * This also strips out any excess whitespace to be found. 2053 * 2054 * @param Element 2055 * @param Boolean normalizeSpaces (default: true) 2056 * @return string 2057 **/ 2058 _getInnerText(e, normalizeSpaces) { 2059 normalizeSpaces = 2060 typeof normalizeSpaces === "undefined" ? true : normalizeSpaces; 2061 var textContent = e.textContent.trim(); 2062 2063 if (normalizeSpaces) { 2064 return textContent.replace(this.REGEXPS.normalize, " "); 2065 } 2066 return textContent; 2067 }, 2068 2069 /** 2070 * Get the number of times a string s appears in the node e. 2071 * 2072 * @param Element 2073 * @param string - what to split on. Default is "," 2074 * @return number (integer) 2075 **/ 2076 _getCharCount(e, s) { 2077 s = s || ","; 2078 return this._getInnerText(e).split(s).length - 1; 2079 }, 2080 2081 /** 2082 * Remove the style attribute on every e and under. 2083 * TODO: Test if getElementsByTagName(*) is faster. 
2084 * 2085 * @param Element 2086 * @return void 2087 **/ 2088 _cleanStyles(e) { 2089 if (!e || e.tagName.toLowerCase() === "svg") { 2090 return; 2091 } 2092 2093 // Remove `style` and deprecated presentational attributes 2094 for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { 2095 e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); 2096 } 2097 2098 if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) { 2099 e.removeAttribute("width"); 2100 e.removeAttribute("height"); 2101 } 2102 2103 var cur = e.firstElementChild; 2104 while (cur !== null) { 2105 this._cleanStyles(cur); 2106 cur = cur.nextElementSibling; 2107 } 2108 }, 2109 2110 /** 2111 * Get the density of links as a percentage of the content 2112 * This is the amount of text that is inside a link divided by the total text in the node. 2113 * 2114 * @param Element 2115 * @return number (float) 2116 **/ 2117 _getLinkDensity(element) { 2118 var textLength = this._getInnerText(element).length; 2119 if (textLength === 0) { 2120 return 0; 2121 } 2122 2123 var linkLength = 0; 2124 2125 // XXX implement _reduceNodeList? 2126 this._forEachNode(element.getElementsByTagName("a"), function (linkNode) { 2127 var href = linkNode.getAttribute("href"); 2128 var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; 2129 linkLength += this._getInnerText(linkNode).length * coefficient; 2130 }); 2131 2132 return linkLength / textLength; 2133 }, 2134 2135 /** 2136 * Get an elements class/id weight. Uses regular expressions to tell if this 2137 * element looks good or bad. 
2138 * 2139 * @param Element 2140 * @return number (Integer) 2141 **/ 2142 _getClassWeight(e) { 2143 if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 2144 return 0; 2145 } 2146 2147 var weight = 0; 2148 2149 // Look for a special classname 2150 if (typeof e.className === "string" && e.className !== "") { 2151 if (this.REGEXPS.negative.test(e.className)) { 2152 weight -= 25; 2153 } 2154 2155 if (this.REGEXPS.positive.test(e.className)) { 2156 weight += 25; 2157 } 2158 } 2159 2160 // Look for a special ID 2161 if (typeof e.id === "string" && e.id !== "") { 2162 if (this.REGEXPS.negative.test(e.id)) { 2163 weight -= 25; 2164 } 2165 2166 if (this.REGEXPS.positive.test(e.id)) { 2167 weight += 25; 2168 } 2169 } 2170 2171 return weight; 2172 }, 2173 2174 /** 2175 * Clean a node of all elements of type "tag". 2176 * (Unless it's a youtube/vimeo video. People love movies.) 2177 * 2178 * @param Element 2179 * @param string tag to clean 2180 * @return void 2181 **/ 2182 _clean(e, tag) { 2183 var isEmbed = ["object", "embed", "iframe"].includes(tag); 2184 2185 this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) { 2186 // Allow youtube and vimeo videos through as people usually want to see those. 2187 if (isEmbed) { 2188 // First, check the elements attributes to see if any of them contain youtube or vimeo 2189 for (var i = 0; i < element.attributes.length; i++) { 2190 if (this._allowedVideoRegex.test(element.attributes[i].value)) { 2191 return false; 2192 } 2193 } 2194 2195 // For embed with <object> tag, check inner HTML as well. 2196 if ( 2197 element.tagName === "object" && 2198 this._allowedVideoRegex.test(element.innerHTML) 2199 ) { 2200 return false; 2201 } 2202 } 2203 2204 return true; 2205 }); 2206 }, 2207 2208 /** 2209 * Check if a given node has one of its ancestor tag name matching the 2210 * provided one. 
2211 * @param HTMLElement node 2212 * @param String tagName 2213 * @param Number maxDepth 2214 * @param Function filterFn a filter to invoke to determine whether this node 'counts' 2215 * @return Boolean 2216 */ 2217 _hasAncestorTag(node, tagName, maxDepth, filterFn) { 2218 maxDepth = maxDepth || 3; 2219 tagName = tagName.toUpperCase(); 2220 var depth = 0; 2221 while (node.parentNode) { 2222 if (maxDepth > 0 && depth > maxDepth) { 2223 return false; 2224 } 2225 if ( 2226 node.parentNode.tagName === tagName && 2227 (!filterFn || filterFn(node.parentNode)) 2228 ) { 2229 return true; 2230 } 2231 node = node.parentNode; 2232 depth++; 2233 } 2234 return false; 2235 }, 2236 2237 /** 2238 * Return an object indicating how many rows and columns this table has. 2239 */ 2240 _getRowAndColumnCount(table) { 2241 var rows = 0; 2242 var columns = 0; 2243 var trs = table.getElementsByTagName("tr"); 2244 for (var i = 0; i < trs.length; i++) { 2245 var rowspan = trs[i].getAttribute("rowspan") || 0; 2246 if (rowspan) { 2247 rowspan = parseInt(rowspan, 10); 2248 } 2249 rows += rowspan || 1; 2250 2251 // Now look for column-related info 2252 var columnsInThisRow = 0; 2253 var cells = trs[i].getElementsByTagName("td"); 2254 for (var j = 0; j < cells.length; j++) { 2255 var colspan = cells[j].getAttribute("colspan") || 0; 2256 if (colspan) { 2257 colspan = parseInt(colspan, 10); 2258 } 2259 columnsInThisRow += colspan || 1; 2260 } 2261 columns = Math.max(columns, columnsInThisRow); 2262 } 2263 return { rows, columns }; 2264 }, 2265 2266 /** 2267 * Look for 'data' (as opposed to 'layout') tables, for which we use 2268 * similar checks as 2269 * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 2270 */ 2271 _markDataTables(root) { 2272 var tables = root.getElementsByTagName("table"); 2273 for (var i = 0; i < tables.length; i++) { 2274 var table = tables[i]; 2275 var role = table.getAttribute("role"); 2276 if (role 
== "presentation") { 2277 table._readabilityDataTable = false; 2278 continue; 2279 } 2280 var datatable = table.getAttribute("datatable"); 2281 if (datatable == "0") { 2282 table._readabilityDataTable = false; 2283 continue; 2284 } 2285 var summary = table.getAttribute("summary"); 2286 if (summary) { 2287 table._readabilityDataTable = true; 2288 continue; 2289 } 2290 2291 var caption = table.getElementsByTagName("caption")[0]; 2292 if (caption && caption.childNodes.length) { 2293 table._readabilityDataTable = true; 2294 continue; 2295 } 2296 2297 // If the table has a descendant with any of these tags, consider a data table: 2298 var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"]; 2299 var descendantExists = function (tag) { 2300 return !!table.getElementsByTagName(tag)[0]; 2301 }; 2302 if (dataTableDescendants.some(descendantExists)) { 2303 this.log("Data table because found data-y descendant"); 2304 table._readabilityDataTable = true; 2305 continue; 2306 } 2307 2308 // Nested tables indicate a layout table: 2309 if (table.getElementsByTagName("table")[0]) { 2310 table._readabilityDataTable = false; 2311 continue; 2312 } 2313 2314 var sizeInfo = this._getRowAndColumnCount(table); 2315 2316 if (sizeInfo.columns == 1 || sizeInfo.rows == 1) { 2317 // single colum/row tables are commonly used for page layout purposes. 2318 table._readabilityDataTable = false; 2319 continue; 2320 } 2321 2322 if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { 2323 table._readabilityDataTable = true; 2324 continue; 2325 } 2326 // Now just go by size entirely: 2327 table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10; 2328 } 2329 }, 2330 2331 /* convert images and figures that have properties like data-src into images that can be loaded without JS */ 2332 _fixLazyImages(root) { 2333 this._forEachNode( 2334 this._getAllNodesWithTag(root, ["img", "picture", "figure"]), 2335 function (elem) { 2336 // In some sites (e.g. 
Kotaku), they put 1px square image as base64 data uri in the src attribute. 2337 // So, here we check if the data uri is too short, just might as well remove it. 2338 if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { 2339 // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. 2340 var parts = this.REGEXPS.b64DataUrl.exec(elem.src); 2341 if (parts[1] === "image/svg+xml") { 2342 return; 2343 } 2344 2345 // Make sure this element has other attributes which contains image. 2346 // If it doesn't, then this src is important and shouldn't be removed. 2347 var srcCouldBeRemoved = false; 2348 for (var i = 0; i < elem.attributes.length; i++) { 2349 var attr = elem.attributes[i]; 2350 if (attr.name === "src") { 2351 continue; 2352 } 2353 2354 if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { 2355 srcCouldBeRemoved = true; 2356 break; 2357 } 2358 } 2359 2360 // Here we assume if image is less than 100 bytes (or 133 after encoded to base64) 2361 // it will be too small, therefore it might be placeholder image. 
2362 if (srcCouldBeRemoved) { 2363 var b64starts = parts[0].length; 2364 var b64length = elem.src.length - b64starts; 2365 if (b64length < 133) { 2366 elem.removeAttribute("src"); 2367 } 2368 } 2369 } 2370 2371 // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 2372 if ( 2373 (elem.src || (elem.srcset && elem.srcset != "null")) && 2374 !elem.className.toLowerCase().includes("lazy") 2375 ) { 2376 return; 2377 } 2378 2379 for (var j = 0; j < elem.attributes.length; j++) { 2380 attr = elem.attributes[j]; 2381 if ( 2382 attr.name === "src" || 2383 attr.name === "srcset" || 2384 attr.name === "alt" 2385 ) { 2386 continue; 2387 } 2388 var copyTo = null; 2389 if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { 2390 copyTo = "srcset"; 2391 } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { 2392 copyTo = "src"; 2393 } 2394 if (copyTo) { 2395 //if this is an img or picture, set the attribute directly 2396 if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { 2397 elem.setAttribute(copyTo, attr.value); 2398 } else if ( 2399 elem.tagName === "FIGURE" && 2400 !this._getAllNodesWithTag(elem, ["img", "picture"]).length 2401 ) { 2402 //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure 2403 //see the nytimes-3 testcase for an example 2404 var img = this._doc.createElement("img"); 2405 img.setAttribute(copyTo, attr.value); 2406 elem.appendChild(img); 2407 } 2408 } 2409 } 2410 } 2411 ); 2412 }, 2413 2414 _getTextDensity(e, tags) { 2415 var textLength = this._getInnerText(e, true).length; 2416 if (textLength === 0) { 2417 return 0; 2418 } 2419 var childrenLength = 0; 2420 var children = this._getAllNodesWithTag(e, tags); 2421 this._forEachNode( 2422 children, 2423 child => (childrenLength += this._getInnerText(child, true).length) 2424 ); 2425 return childrenLength / textLength; 2426 }, 2427 2428 /** 2429 * Clean an element of all tags of type "tag" if they look fishy. 
2430 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 2431 * 2432 * @return void 2433 **/ 2434 _cleanConditionally(e, tag) { 2435 if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 2436 return; 2437 } 2438 2439 // Gather counts for other typical elements embedded within. 2440 // Traverse backwards so we can remove nodes at the same time 2441 // without effecting the traversal. 2442 // 2443 // TODO: Consider taking into account original contentScore here. 2444 this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) { 2445 // First check if this node IS data table, in which case don't remove it. 2446 var isDataTable = function (t) { 2447 return t._readabilityDataTable; 2448 }; 2449 2450 var isList = tag === "ul" || tag === "ol"; 2451 if (!isList) { 2452 var listLength = 0; 2453 var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]); 2454 this._forEachNode( 2455 listNodes, 2456 list => (listLength += this._getInnerText(list).length) 2457 ); 2458 isList = listLength / this._getInnerText(node).length > 0.9; 2459 } 2460 2461 if (tag === "table" && isDataTable(node)) { 2462 return false; 2463 } 2464 2465 // Next check if we're inside a data table, in which case don't remove it as well. 
2466 if (this._hasAncestorTag(node, "table", -1, isDataTable)) { 2467 return false; 2468 } 2469 2470 if (this._hasAncestorTag(node, "code")) { 2471 return false; 2472 } 2473 2474 // keep element if it has a data tables 2475 if ( 2476 [...node.getElementsByTagName("table")].some( 2477 tbl => tbl._readabilityDataTable 2478 ) 2479 ) { 2480 return false; 2481 } 2482 2483 var weight = this._getClassWeight(node); 2484 2485 this.log("Cleaning Conditionally", node); 2486 2487 var contentScore = 0; 2488 2489 if (weight + contentScore < 0) { 2490 return true; 2491 } 2492 2493 if (this._getCharCount(node, ",") < 10) { 2494 // If there are not very many commas, and the number of 2495 // non-paragraph elements is more than paragraphs or other 2496 // ominous signs, remove the element. 2497 var p = node.getElementsByTagName("p").length; 2498 var img = node.getElementsByTagName("img").length; 2499 var li = node.getElementsByTagName("li").length - 100; 2500 var input = node.getElementsByTagName("input").length; 2501 var headingDensity = this._getTextDensity(node, [ 2502 "h1", 2503 "h2", 2504 "h3", 2505 "h4", 2506 "h5", 2507 "h6", 2508 ]); 2509 2510 var embedCount = 0; 2511 var embeds = this._getAllNodesWithTag(node, [ 2512 "object", 2513 "embed", 2514 "iframe", 2515 ]); 2516 2517 for (var i = 0; i < embeds.length; i++) { 2518 // If this embed has attribute that matches video regex, don't delete it. 2519 for (var j = 0; j < embeds[i].attributes.length; j++) { 2520 if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) { 2521 return false; 2522 } 2523 } 2524 2525 // For embed with <object> tag, check inner HTML as well. 
2526 if ( 2527 embeds[i].tagName === "object" && 2528 this._allowedVideoRegex.test(embeds[i].innerHTML) 2529 ) { 2530 return false; 2531 } 2532 2533 embedCount++; 2534 } 2535 2536 var innerText = this._getInnerText(node); 2537 2538 // toss any node whose inner text contains nothing but suspicious words 2539 if ( 2540 this.REGEXPS.adWords.test(innerText) || 2541 this.REGEXPS.loadingWords.test(innerText) 2542 ) { 2543 return true; 2544 } 2545 2546 var contentLength = innerText.length; 2547 var linkDensity = this._getLinkDensity(node); 2548 var textishTags = ["SPAN", "LI", "TD"].concat( 2549 Array.from(this.DIV_TO_P_ELEMS) 2550 ); 2551 var textDensity = this._getTextDensity(node, textishTags); 2552 var isFigureChild = this._hasAncestorTag(node, "figure"); 2553 2554 // apply shadiness checks, then check for exceptions 2555 const shouldRemoveNode = () => { 2556 const errs = []; 2557 if (!isFigureChild && img > 1 && p / img < 0.5) { 2558 errs.push(`Bad p to img ratio (img=${img}, p=${p})`); 2559 } 2560 if (!isList && li > p) { 2561 errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`); 2562 } 2563 if (input > Math.floor(p / 3)) { 2564 errs.push(`Too many inputs per p. (input=${input}, p=${p})`); 2565 } 2566 if ( 2567 !isList && 2568 !isFigureChild && 2569 headingDensity < 0.9 && 2570 contentLength < 25 && 2571 (img === 0 || img > 2) && 2572 linkDensity > 0 2573 ) { 2574 errs.push( 2575 `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})` 2576 ); 2577 } 2578 if ( 2579 !isList && 2580 weight < 25 && 2581 linkDensity > 0.2 + this._linkDensityModifier 2582 ) { 2583 errs.push( 2584 `Low weight and a little linky. (linkDensity=${linkDensity})` 2585 ); 2586 } 2587 if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) { 2588 errs.push( 2589 `High weight and mostly links. 
(linkDensity=${linkDensity})` 2590 ); 2591 } 2592 if ((embedCount === 1 && contentLength < 75) || embedCount > 1) { 2593 errs.push( 2594 `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})` 2595 ); 2596 } 2597 if (img === 0 && textDensity === 0) { 2598 errs.push( 2599 `No useful content. (img=${img}, textDensity=${textDensity})` 2600 ); 2601 } 2602 2603 if (errs.length) { 2604 this.log("Checks failed", errs); 2605 return true; 2606 } 2607 2608 return false; 2609 }; 2610 2611 var haveToRemove = shouldRemoveNode(); 2612 2613 // Allow simple lists of images to remain in pages 2614 if (isList && haveToRemove) { 2615 for (var x = 0; x < node.children.length; x++) { 2616 let child = node.children[x]; 2617 // Don't filter in lists with li's that contain more than one child 2618 if (child.children.length > 1) { 2619 return haveToRemove; 2620 } 2621 } 2622 let li_count = node.getElementsByTagName("li").length; 2623 // Only allow the list to remain if every li contains an image 2624 if (img == li_count) { 2625 return false; 2626 } 2627 } 2628 return haveToRemove; 2629 } 2630 return false; 2631 }); 2632 }, 2633 2634 /** 2635 * Clean out elements that match the specified conditions 2636 * 2637 * @param Element 2638 * @param Function determines whether a node should be removed 2639 * @return void 2640 **/ 2641 _cleanMatchedNodes(e, filter) { 2642 var endOfSearchMarkerNode = this._getNextNode(e, true); 2643 var next = this._getNextNode(e); 2644 while (next && next != endOfSearchMarkerNode) { 2645 if (filter.call(this, next, next.className + " " + next.id)) { 2646 next = this._removeAndGetNext(next); 2647 } else { 2648 next = this._getNextNode(next); 2649 } 2650 } 2651 }, 2652 2653 /** 2654 * Clean out spurious headers from an Element. 
2655 * 2656 * @param Element 2657 * @return void 2658 **/ 2659 _cleanHeaders(e) { 2660 let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); 2661 this._removeNodes(headingNodes, function (node) { 2662 let shouldRemove = this._getClassWeight(node) < 0; 2663 if (shouldRemove) { 2664 this.log("Removing header with low class weight:", node); 2665 } 2666 return shouldRemove; 2667 }); 2668 }, 2669 2670 /** 2671 * Check if this node is an H1 or H2 element whose content is mostly 2672 * the same as the article title. 2673 * 2674 * @param Element the node to check. 2675 * @return boolean indicating whether this is a title-like header. 2676 */ 2677 _headerDuplicatesTitle(node) { 2678 if (node.tagName != "H1" && node.tagName != "H2") { 2679 return false; 2680 } 2681 var heading = this._getInnerText(node, false); 2682 this.log("Evaluating similarity of header:", heading, this._articleTitle); 2683 return this._textSimilarity(this._articleTitle, heading) > 0.75; 2684 }, 2685 2686 _flagIsActive(flag) { 2687 return (this._flags & flag) > 0; 2688 }, 2689 2690 _removeFlag(flag) { 2691 this._flags = this._flags & ~flag; 2692 }, 2693 2694 _isProbablyVisible(node) { 2695 // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes. 2696 return ( 2697 (!node.style || node.style.display != "none") && 2698 (!node.style || node.style.visibility != "hidden") && 2699 !node.hasAttribute("hidden") && 2700 //check for "fallback-image" so that wikimedia math images are displayed 2701 (!node.hasAttribute("aria-hidden") || 2702 node.getAttribute("aria-hidden") != "true" || 2703 (node.className && 2704 node.className.includes && 2705 node.className.includes("fallback-image"))) 2706 ); 2707 }, 2708 2709 /** 2710 * Runs readability. 2711 * 2712 * Workflow: 2713 * 1. Prep the document by removing script tags, css, etc. 2714 * 2. Build readability's DOM tree. 2715 * 3. Grab the article content from the current dom tree. 2716 * 4. 
Replace the current DOM tree with the new one. 2717 * 5. Read peacefully. 2718 * 2719 * @return void 2720 **/ 2721 parse() { 2722 // Avoid parsing too large documents, as per configuration option 2723 if (this._maxElemsToParse > 0) { 2724 var numTags = this._doc.getElementsByTagName("*").length; 2725 if (numTags > this._maxElemsToParse) { 2726 throw new Error( 2727 "Aborting parsing document; " + numTags + " elements found" 2728 ); 2729 } 2730 } 2731 2732 // Unwrap image from noscript 2733 this._unwrapNoscriptImages(this._doc); 2734 2735 // Extract JSON-LD metadata before removing scripts 2736 var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); 2737 2738 // Remove script tags from the document. 2739 this._removeScripts(this._doc); 2740 2741 this._prepDocument(); 2742 2743 var metadata = this._getArticleMetadata(jsonLd); 2744 this._metadata = metadata; 2745 this._articleTitle = metadata.title; 2746 2747 var articleContent = this._grabArticle(); 2748 if (!articleContent) { 2749 return null; 2750 } 2751 2752 this.log("Grabbed: " + articleContent.innerHTML); 2753 2754 this._postProcessContent(articleContent); 2755 2756 // If we haven't found an excerpt in the article's metadata, use the article's 2757 // first paragraph as the excerpt. This is used for displaying a preview of 2758 // the article's content. 
2759 if (!metadata.excerpt) { 2760 var paragraphs = articleContent.getElementsByTagName("p"); 2761 if (paragraphs.length) { 2762 metadata.excerpt = paragraphs[0].textContent.trim(); 2763 } 2764 } 2765 2766 var textContent = articleContent.textContent; 2767 return { 2768 title: this._articleTitle, 2769 byline: metadata.byline || this._articleByline, 2770 dir: this._articleDir, 2771 lang: this._articleLang, 2772 content: this._serializer(articleContent), 2773 textContent, 2774 length: textContent.length, 2775 excerpt: metadata.excerpt, 2776 siteName: metadata.siteName || this._articleSiteName, 2777 publishedTime: metadata.publishedTime, 2778 }; 2779 }, 2780}; 2781 2782if (typeof module === "object") { 2783 /* eslint-disable-next-line no-redeclare */ 2784 /* global module */ 2785 module.exports = Readability; 2786}