scripts/Readability.js at main · ellioth.co/summarizer-extension

ellioth.co / summarizer-extension
fork atom
A browser extension that lets you summarize any webpage and ask questions using AI.
fork atom
summarizer-extension / scripts / Readability.js
at main 2786 lines 90 kB view raw
wrap content
ellioth.co refactor: migrate to Mozilla Readability and centralize config 2d ago
ffa94d16
   1/*
   2 * Copyright (c) 2010 Arc90 Inc
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *     http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 */
  16
  17/*
  18 * This code is heavily based on Arc90's readability.js (1.7.1) script
  19 * available at: http://code.google.com/p/arc90labs-readability
  20 */
  21
  22/**
  23 * Public constructor.
  24 * @param {HTMLDocument} doc     The document to parse.
  25 * @param {Object}       options The options object.
  26 */
  27function Readability(doc, options) {
  28  // In some older versions, people passed a URI as the first argument. Cope:
  29  if (options && options.documentElement) {
  30    doc = options;
  31    options = arguments[2];
  32  } else if (!doc || !doc.documentElement) {
  33    throw new Error(
  34      "First argument to Readability constructor should be a document object."
  35    );
  36  }
  37  options = options || {};
  38
  39  this._doc = doc;
  40  this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  41  this._articleTitle = null;
  42  this._articleByline = null;
  43  this._articleDir = null;
  44  this._articleSiteName = null;
  45  this._attempts = [];
  46  this._metadata = {};
  47
  48  // Configurable options
  49  this._debug = !!options.debug;
  50  this._maxElemsToParse =
  51    options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  52  this._nbTopCandidates =
  53    options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  54  this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  55  this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(
  56    options.classesToPreserve || []
  57  );
  58  this._keepClasses = !!options.keepClasses;
  59  this._serializer =
  60    options.serializer ||
  61    function (el) {
  62      return el.innerHTML;
  63    };
  64  this._disableJSONLD = !!options.disableJSONLD;
  65  this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  66  this._linkDensityModifier = options.linkDensityModifier || 0;
  67
  68  // Start with all flags set
  69  this._flags =
  70    this.FLAG_STRIP_UNLIKELYS |
  71    this.FLAG_WEIGHT_CLASSES |
  72    this.FLAG_CLEAN_CONDITIONALLY;
  73
  74  // Control whether log messages are sent to the console
  75  if (this._debug) {
  76    let logNode = function (node) {
  77      if (node.nodeType == node.TEXT_NODE) {
  78        return `${node.nodeName} ("${node.textContent}")`;
  79      }
  80      let attrPairs = Array.from(node.attributes || [], function (attr) {
  81        return `${attr.name}="${attr.value}"`;
  82      }).join(" ");
  83      return `<${node.localName} ${attrPairs}>`;
  84    };
  85    this.log = function () {
  86      if (typeof console !== "undefined") {
  87        let args = Array.from(arguments, arg => {
  88          if (arg && arg.nodeType == this.ELEMENT_NODE) {
  89            return logNode(arg);
  90          }
  91          return arg;
  92        });
  93        args.unshift("Reader: (Readability)");
  94        // eslint-disable-next-line no-console
  95        console.log(...args);
  96      } else if (typeof dump !== "undefined") {
  97        /* global dump */
  98        var msg = Array.prototype.map
  99          .call(arguments, function (x) {
 100            return x && x.nodeName ? logNode(x) : x;
 101          })
 102          .join(" ");
 103        dump("Reader: (Readability) " + msg + "\n");
 104      }
 105    };
 106  } else {
 107    this.log = function () {};
 108  }
 109}
 110
 111Readability.prototype = {
 112  FLAG_STRIP_UNLIKELYS: 0x1,
 113  FLAG_WEIGHT_CLASSES: 0x2,
 114  FLAG_CLEAN_CONDITIONALLY: 0x4,
 115
 116  // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
 117  ELEMENT_NODE: 1,
 118  TEXT_NODE: 3,
 119
 120  // Max number of nodes supported by this parser. Default: 0 (no limit)
 121  DEFAULT_MAX_ELEMS_TO_PARSE: 0,
 122
 123  // The number of top candidates to consider when analysing how
 124  // tight the competition is among candidates.
 125  DEFAULT_N_TOP_CANDIDATES: 5,
 126
 127  // Element tags to score by default.
 128  DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre"
 129    .toUpperCase()
 130    .split(","),
 131
 132  // The default number of chars an article must have in order to return a result
 133  DEFAULT_CHAR_THRESHOLD: 500,
 134
  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  //
  // Several of these literals carry the `g` flag (replaceFonts, normalize,
  // tokenize, srcsetUrl, commas). Global regexes keep state in `lastIndex`
  // when used with `.test()` / `.exec()`, so take care when reusing them
  // outside of `String.prototype.replace` / `split`.
  REGEXPS: {
    // NOTE: These two regular expressions are duplicated in
    // Readability-readerable.js. Please keep both copies in sync.
    unlikelyCandidates:
      /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
    okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,

    positive:
      /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
    negative:
      /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
    extraneous:
      /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
    byline: /byline|author|dateline|writtenby|p-author/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    // Matches runs of two or more whitespace characters.
    normalize: /\s{2,}/g,
    // Default value for the `allowedVideoRegex` option (see the constructor).
    videos:
      /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
    shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    tokenize: /\W+/g,
    // Matches strings that are empty or consist solely of whitespace.
    whitespace: /^\s*$/,
    hasContent: /\S$/,
    hashUrl: /^#.+/,
    // One srcset candidate: (1) the URL, (2) an optional width/density
    // descriptor, (3) the trailing separator (comma or end of string).
    srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
    b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
    // Commas as used in Latin, Sindhi, Chinese and various other scripts.
    // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
    commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
    // See: https://schema.org/Article
    // NOTE(review): the alternation is not wrapped in a group, so `^` binds
    // only to the first alternative ("Article") and `$` only to the last
    // ("APIReference"); every other alternative matches anywhere in the
    // string. Confirm whether whole-string matching was intended.
    jsonLdArticleTypes:
      /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
    // used to see if a node's content matches words commonly used for ad blocks or loading indicators
    adWords:
      /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
    loadingWords:
      /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
  },
 176
 177  UNLIKELY_ROLES: [
 178    "menu",
 179    "menubar",
 180    "complementary",
 181    "navigation",
 182    "alert",
 183    "alertdialog",
 184    "dialog",
 185  ],
 186
 187  DIV_TO_P_ELEMS: new Set([
 188    "BLOCKQUOTE",
 189    "DL",
 190    "DIV",
 191    "IMG",
 192    "OL",
 193    "P",
 194    "PRE",
 195    "TABLE",
 196    "UL",
 197  ]),
 198
 199  ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"],
 200
 201  PRESENTATIONAL_ATTRIBUTES: [
 202    "align",
 203    "background",
 204    "bgcolor",
 205    "border",
 206    "cellpadding",
 207    "cellspacing",
 208    "frame",
 209    "hspace",
 210    "rules",
 211    "style",
 212    "valign",
 213    "vspace",
 214  ],
 215
 216  DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"],
 217
 218  // The commented out elements qualify as phrasing content but tend to be
 219  // removed by readability when put into paragraphs, so we ignore them here.
 220  PHRASING_ELEMS: [
 221    // "CANVAS", "IFRAME", "SVG", "VIDEO",
 222    "ABBR",
 223    "AUDIO",
 224    "B",
 225    "BDO",
 226    "BR",
 227    "BUTTON",
 228    "CITE",
 229    "CODE",
 230    "DATA",
 231    "DATALIST",
 232    "DFN",
 233    "EM",
 234    "EMBED",
 235    "I",
 236    "IMG",
 237    "INPUT",
 238    "KBD",
 239    "LABEL",
 240    "MARK",
 241    "MATH",
 242    "METER",
 243    "NOSCRIPT",
 244    "OBJECT",
 245    "OUTPUT",
 246    "PROGRESS",
 247    "Q",
 248    "RUBY",
 249    "SAMP",
 250    "SCRIPT",
 251    "SELECT",
 252    "SMALL",
 253    "SPAN",
 254    "STRONG",
 255    "SUB",
 256    "SUP",
 257    "TEXTAREA",
 258    "TIME",
 259    "VAR",
 260    "WBR",
 261  ],
 262
 263  // These are the classes that readability sets itself.
 264  CLASSES_TO_PRESERVE: ["page"],
 265
 266  // These are the list of HTML entities that need to be escaped.
 267  HTML_ESCAPE_MAP: {
 268    lt: "<",
 269    gt: ">",
 270    amp: "&",
 271    quot: '"',
 272    apos: "'",
 273  },
 274
 275  /**
 276   * Run any post-process modifications to article content as necessary.
 277   *
 278   * @param Element
 279   * @return void
 280   **/
 281  _postProcessContent(articleContent) {
 282    // Readability cannot open relative uris so we convert them to absolute uris.
 283    this._fixRelativeUris(articleContent);
 284
 285    this._simplifyNestedElements(articleContent);
 286
 287    if (!this._keepClasses) {
 288      // Remove classes.
 289      this._cleanClasses(articleContent);
 290    }
 291  },
 292
 293  /**
 294   * Iterates over a NodeList, calls `filterFn` for each node and removes node
 295   * if function returned `true`.
 296   *
 297   * If function is not passed, removes all the nodes in node list.
 298   *
 299   * @param NodeList nodeList The nodes to operate on
 300   * @param Function filterFn the function to use as a filter
 301   * @return void
 302   */
 303  _removeNodes(nodeList, filterFn) {
 304    // Avoid ever operating on live node lists.
 305    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 306      throw new Error("Do not pass live node lists to _removeNodes");
 307    }
 308    for (var i = nodeList.length - 1; i >= 0; i--) {
 309      var node = nodeList[i];
 310      var parentNode = node.parentNode;
 311      if (parentNode) {
 312        if (!filterFn || filterFn.call(this, node, i, nodeList)) {
 313          parentNode.removeChild(node);
 314        }
 315      }
 316    }
 317  },
 318
 319  /**
 320   * Iterates over a NodeList, and calls _setNodeTag for each node.
 321   *
 322   * @param NodeList nodeList The nodes to operate on
 323   * @param String newTagName the new tag name to use
 324   * @return void
 325   */
 326  _replaceNodeTags(nodeList, newTagName) {
 327    // Avoid ever operating on live node lists.
 328    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 329      throw new Error("Do not pass live node lists to _replaceNodeTags");
 330    }
 331    for (const node of nodeList) {
 332      this._setNodeTag(node, newTagName);
 333    }
 334  },
 335
 336  /**
 337   * Iterate over a NodeList, which doesn't natively fully implement the Array
 338   * interface.
 339   *
 340   * For convenience, the current object context is applied to the provided
 341   * iterate function.
 342   *
 343   * @param  NodeList nodeList The NodeList.
 344   * @param  Function fn       The iterate function.
 345   * @return void
 346   */
 347  _forEachNode(nodeList, fn) {
 348    Array.prototype.forEach.call(nodeList, fn, this);
 349  },
 350
 351  /**
 352   * Iterate over a NodeList, and return the first node that passes
 353   * the supplied test function
 354   *
 355   * For convenience, the current object context is applied to the provided
 356   * test function.
 357   *
 358   * @param  NodeList nodeList The NodeList.
 359   * @param  Function fn       The test function.
 360   * @return void
 361   */
 362  _findNode(nodeList, fn) {
 363    return Array.prototype.find.call(nodeList, fn, this);
 364  },
 365
 366  /**
 367   * Iterate over a NodeList, return true if any of the provided iterate
 368   * function calls returns true, false otherwise.
 369   *
 370   * For convenience, the current object context is applied to the
 371   * provided iterate function.
 372   *
 373   * @param  NodeList nodeList The NodeList.
 374   * @param  Function fn       The iterate function.
 375   * @return Boolean
 376   */
 377  _someNode(nodeList, fn) {
 378    return Array.prototype.some.call(nodeList, fn, this);
 379  },
 380
 381  /**
 382   * Iterate over a NodeList, return true if all of the provided iterate
 383   * function calls return true, false otherwise.
 384   *
 385   * For convenience, the current object context is applied to the
 386   * provided iterate function.
 387   *
 388   * @param  NodeList nodeList The NodeList.
 389   * @param  Function fn       The iterate function.
 390   * @return Boolean
 391   */
 392  _everyNode(nodeList, fn) {
 393    return Array.prototype.every.call(nodeList, fn, this);
 394  },
 395
 396  _getAllNodesWithTag(node, tagNames) {
 397    if (node.querySelectorAll) {
 398      return node.querySelectorAll(tagNames.join(","));
 399    }
 400    return [].concat.apply(
 401      [],
 402      tagNames.map(function (tag) {
 403        var collection = node.getElementsByTagName(tag);
 404        return Array.isArray(collection) ? collection : Array.from(collection);
 405      })
 406    );
 407  },
 408
 409  /**
 410   * Removes the class="" attribute from every element in the given
 411   * subtree, except those that match CLASSES_TO_PRESERVE and
 412   * the classesToPreserve array from the options object.
 413   *
 414   * @param Element
 415   * @return void
 416   */
 417  _cleanClasses(node) {
 418    var classesToPreserve = this._classesToPreserve;
 419    var className = (node.getAttribute("class") || "")
 420      .split(/\s+/)
 421      .filter(cls => classesToPreserve.includes(cls))
 422      .join(" ");
 423
 424    if (className) {
 425      node.setAttribute("class", className);
 426    } else {
 427      node.removeAttribute("class");
 428    }
 429
 430    for (node = node.firstElementChild; node; node = node.nextElementSibling) {
 431      this._cleanClasses(node);
 432    }
 433  },
 434
 435  /**
 436   * Tests whether a string is a URL or not.
 437   *
 438   * @param {string} str The string to test
 439   * @return {boolean} true if str is a URL, false if not
 440   */
 441  _isUrl(str) {
 442    try {
 443      new URL(str);
 444      return true;
 445    } catch {
 446      return false;
 447    }
 448  },
 449  /**
 450   * Converts each <a> and <img> uri in the given element to an absolute URI,
 451   * ignoring #ref URIs.
 452   *
 453   * @param Element
 454   * @return void
 455   */
 456  _fixRelativeUris(articleContent) {
 457    var baseURI = this._doc.baseURI;
 458    var documentURI = this._doc.documentURI;
 459    function toAbsoluteURI(uri) {
 460      // Leave hash links alone if the base URI matches the document URI:
 461      if (baseURI == documentURI && uri.charAt(0) == "#") {
 462        return uri;
 463      }
 464
 465      // Otherwise, resolve against base URI:
 466      try {
 467        return new URL(uri, baseURI).href;
 468      } catch (ex) {
 469        // Something went wrong, just return the original:
 470      }
 471      return uri;
 472    }
 473
 474    var links = this._getAllNodesWithTag(articleContent, ["a"]);
 475    this._forEachNode(links, function (link) {
 476      var href = link.getAttribute("href");
 477      if (href) {
 478        // Remove links with javascript: URIs, since
 479        // they won't work after scripts have been removed from the page.
 480        if (href.indexOf("javascript:") === 0) {
 481          // if the link only contains simple text content, it can be converted to a text node
 482          if (
 483            link.childNodes.length === 1 &&
 484            link.childNodes[0].nodeType === this.TEXT_NODE
 485          ) {
 486            var text = this._doc.createTextNode(link.textContent);
 487            link.parentNode.replaceChild(text, link);
 488          } else {
 489            // if the link has multiple children, they should all be preserved
 490            var container = this._doc.createElement("span");
 491            while (link.firstChild) {
 492              container.appendChild(link.firstChild);
 493            }
 494            link.parentNode.replaceChild(container, link);
 495          }
 496        } else {
 497          link.setAttribute("href", toAbsoluteURI(href));
 498        }
 499      }
 500    });
 501
 502    var medias = this._getAllNodesWithTag(articleContent, [
 503      "img",
 504      "picture",
 505      "figure",
 506      "video",
 507      "audio",
 508      "source",
 509    ]);
 510
 511    this._forEachNode(medias, function (media) {
 512      var src = media.getAttribute("src");
 513      var poster = media.getAttribute("poster");
 514      var srcset = media.getAttribute("srcset");
 515
 516      if (src) {
 517        media.setAttribute("src", toAbsoluteURI(src));
 518      }
 519
 520      if (poster) {
 521        media.setAttribute("poster", toAbsoluteURI(poster));
 522      }
 523
 524      if (srcset) {
 525        var newSrcset = srcset.replace(
 526          this.REGEXPS.srcsetUrl,
 527          function (_, p1, p2, p3) {
 528            return toAbsoluteURI(p1) + (p2 || "") + p3;
 529          }
 530        );
 531
 532        media.setAttribute("srcset", newSrcset);
 533      }
 534    });
 535  },
 536
 537  _simplifyNestedElements(articleContent) {
 538    var node = articleContent;
 539
 540    while (node) {
 541      if (
 542        node.parentNode &&
 543        ["DIV", "SECTION"].includes(node.tagName) &&
 544        !(node.id && node.id.startsWith("readability"))
 545      ) {
 546        if (this._isElementWithoutContent(node)) {
 547          node = this._removeAndGetNext(node);
 548          continue;
 549        } else if (
 550          this._hasSingleTagInsideElement(node, "DIV") ||
 551          this._hasSingleTagInsideElement(node, "SECTION")
 552        ) {
 553          var child = node.children[0];
 554          for (var i = 0; i < node.attributes.length; i++) {
 555            child.setAttributeNode(node.attributes[i].cloneNode());
 556          }
 557          node.parentNode.replaceChild(child, node);
 558          node = child;
 559          continue;
 560        }
 561      }
 562
 563      node = this._getNextNode(node);
 564    }
 565  },
 566
 567  /**
 568   * Get the article title as an H1.
 569   *
 570   * @return string
 571   **/
 572  _getArticleTitle() {
 573    var doc = this._doc;
 574    var curTitle = "";
 575    var origTitle = "";
 576
 577    try {
 578      curTitle = origTitle = doc.title.trim();
 579
 580      // If they had an element with id "title" in their HTML
 581      if (typeof curTitle !== "string") {
 582        curTitle = origTitle = this._getInnerText(
 583          doc.getElementsByTagName("title")[0]
 584        );
 585      }
 586    } catch (e) {
 587      /* ignore exceptions setting the title. */
 588    }
 589
 590    var titleHadHierarchicalSeparators = false;
 591    function wordCount(str) {
 592      return str.split(/\s+/).length;
 593    }
 594
 595    // If there's a separator in the title, first remove the final part
 596    if (/ [\|\-\\\/>»] /.test(curTitle)) {
 597      titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
 598      let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi));
 599      curTitle = origTitle.substring(0, allSeparators.pop().index);
 600
 601      // If the resulting title is too short, remove the first part instead:
 602      if (wordCount(curTitle) < 3) {
 603        curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, "");
 604      }
 605    } else if (curTitle.includes(": ")) {
 606      // Check if we have an heading containing this exact string, so we
 607      // could assume it's the full title.
 608      var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]);
 609      var trimmedTitle = curTitle.trim();
 610      var match = this._someNode(headings, function (heading) {
 611        return heading.textContent.trim() === trimmedTitle;
 612      });
 613
 614      // If we don't, let's extract the title out of the original title string.
 615      if (!match) {
 616        curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
 617
 618        // If the title is now too short, try the first colon instead:
 619        if (wordCount(curTitle) < 3) {
 620          curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
 621          // But if we have too many words before the colon there's something weird
 622          // with the titles and the H tags so let's just use the original title instead
 623        } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
 624          curTitle = origTitle;
 625        }
 626      }
 627    } else if (curTitle.length > 150 || curTitle.length < 15) {
 628      var hOnes = doc.getElementsByTagName("h1");
 629
 630      if (hOnes.length === 1) {
 631        curTitle = this._getInnerText(hOnes[0]);
 632      }
 633    }
 634
 635    curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
 636    // If we now have 4 words or fewer as our title, and either no
 637    // 'hierarchical' separators (\, /, > or ») were found in the original
 638    // title or we decreased the number of words by more than 1 word, use
 639    // the original title.
 640    var curTitleWordCount = wordCount(curTitle);
 641    if (
 642      curTitleWordCount <= 4 &&
 643      (!titleHadHierarchicalSeparators ||
 644        curTitleWordCount !=
 645          wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)
 646    ) {
 647      curTitle = origTitle;
 648    }
 649
 650    return curTitle;
 651  },
 652
 653  /**
 654   * Prepare the HTML document for readability to scrape it.
 655   * This includes things like stripping javascript, CSS, and handling terrible markup.
 656   *
 657   * @return void
 658   **/
 659  _prepDocument() {
 660    var doc = this._doc;
 661
 662    // Remove all style tags in head
 663    this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
 664
 665    if (doc.body) {
 666      this._replaceBrs(doc.body);
 667    }
 668
 669    this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
 670  },
 671
 672  /**
 673   * Finds the next node, starting from the given node, and ignoring
 674   * whitespace in between. If the given node is an element, the same node is
 675   * returned.
 676   */
 677  _nextNode(node) {
 678    var next = node;
 679    while (
 680      next &&
 681      next.nodeType != this.ELEMENT_NODE &&
 682      this.REGEXPS.whitespace.test(next.textContent)
 683    ) {
 684      next = next.nextSibling;
 685    }
 686    return next;
 687  },
 688
 689  /**
 690   * Replaces 2 or more successive <br> elements with a single <p>.
 691   * Whitespace between <br> elements are ignored. For example:
 692   *   <div>foo<br>bar<br> <br><br>abc</div>
 693   * will become:
 694   *   <div>foo<br>bar<p>abc</p></div>
 695   */
 696  _replaceBrs(elem) {
 697    this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
 698      var next = br.nextSibling;
 699
 700      // Whether 2 or more <br> elements have been found and replaced with a
 701      // <p> block.
 702      var replaced = false;
 703
 704      // If we find a <br> chain, remove the <br>s until we hit another node
 705      // or non-whitespace. This leaves behind the first <br> in the chain
 706      // (which will be replaced with a <p> later).
 707      while ((next = this._nextNode(next)) && next.tagName == "BR") {
 708        replaced = true;
 709        var brSibling = next.nextSibling;
 710        next.remove();
 711        next = brSibling;
 712      }
 713
 714      // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
 715      // all sibling nodes as children of the <p> until we hit another <br>
 716      // chain.
 717      if (replaced) {
 718        var p = this._doc.createElement("p");
 719        br.parentNode.replaceChild(p, br);
 720
 721        next = p.nextSibling;
 722        while (next) {
 723          // If we've hit another <br><br>, we're done adding children to this <p>.
 724          if (next.tagName == "BR") {
 725            var nextElem = this._nextNode(next.nextSibling);
 726            if (nextElem && nextElem.tagName == "BR") {
 727              break;
 728            }
 729          }
 730
 731          if (!this._isPhrasingContent(next)) {
 732            break;
 733          }
 734
 735          // Otherwise, make this node a child of the new <p>.
 736          var sibling = next.nextSibling;
 737          p.appendChild(next);
 738          next = sibling;
 739        }
 740
 741        while (p.lastChild && this._isWhitespace(p.lastChild)) {
 742          p.lastChild.remove();
 743        }
 744
 745        if (p.parentNode.tagName === "P") {
 746          this._setNodeTag(p.parentNode, "DIV");
 747        }
 748      }
 749    });
 750  },
 751
 752  _setNodeTag(node, tag) {
 753    this.log("_setNodeTag", node, tag);
 754    if (this._docJSDOMParser) {
 755      node.localName = tag.toLowerCase();
 756      node.tagName = tag.toUpperCase();
 757      return node;
 758    }
 759
 760    var replacement = node.ownerDocument.createElement(tag);
 761    while (node.firstChild) {
 762      replacement.appendChild(node.firstChild);
 763    }
 764    node.parentNode.replaceChild(replacement, node);
 765    if (node.readability) {
 766      replacement.readability = node.readability;
 767    }
 768
 769    for (var i = 0; i < node.attributes.length; i++) {
 770      replacement.setAttributeNode(node.attributes[i].cloneNode());
 771    }
 772    return replacement;
 773  },
 774
 775  /**
 776   * Prepare the article node for display. Clean out any inline styles,
 777   * iframes, forms, strip extraneous <p> tags, etc.
 778   *
 779   * @param Element
 780   * @return void
 781   **/
 782  _prepArticle(articleContent) {
 783    this._cleanStyles(articleContent);
 784
 785    // Check for data tables before we continue, to avoid removing items in
 786    // those tables, which will often be isolated even though they're
 787    // visually linked to other content-ful elements (text, images, etc.).
 788    this._markDataTables(articleContent);
 789
 790    this._fixLazyImages(articleContent);
 791
 792    // Clean out junk from the article content
 793    this._cleanConditionally(articleContent, "form");
 794    this._cleanConditionally(articleContent, "fieldset");
 795    this._clean(articleContent, "object");
 796    this._clean(articleContent, "embed");
 797    this._clean(articleContent, "footer");
 798    this._clean(articleContent, "link");
 799    this._clean(articleContent, "aside");
 800
 801    // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
 802    // which means we don't remove the top candidates even they have "share".
 803
 804    var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
 805
 806    this._forEachNode(articleContent.children, function (topCandidate) {
 807      this._cleanMatchedNodes(topCandidate, function (node, matchString) {
 808        return (
 809          this.REGEXPS.shareElements.test(matchString) &&
 810          node.textContent.length < shareElementThreshold
 811        );
 812      });
 813    });
 814
 815    this._clean(articleContent, "iframe");
 816    this._clean(articleContent, "input");
 817    this._clean(articleContent, "textarea");
 818    this._clean(articleContent, "select");
 819    this._clean(articleContent, "button");
 820    this._cleanHeaders(articleContent);
 821
 822    // Do these last as the previous stuff may have removed junk
 823    // that will affect these
 824    this._cleanConditionally(articleContent, "table");
 825    this._cleanConditionally(articleContent, "ul");
 826    this._cleanConditionally(articleContent, "div");
 827
 828    // replace H1 with H2 as H1 should be only title that is displayed separately
 829    this._replaceNodeTags(
 830      this._getAllNodesWithTag(articleContent, ["h1"]),
 831      "h2"
 832    );
 833
 834    // Remove extra paragraphs
 835    this._removeNodes(
 836      this._getAllNodesWithTag(articleContent, ["p"]),
 837      function (paragraph) {
 838        // At this point, nasty iframes have been removed; only embedded video
 839        // ones remain.
 840        var contentElementCount = this._getAllNodesWithTag(paragraph, [
 841          "img",
 842          "embed",
 843          "object",
 844          "iframe",
 845        ]).length;
 846        return (
 847          contentElementCount === 0 && !this._getInnerText(paragraph, false)
 848        );
 849      }
 850    );
 851
 852    this._forEachNode(
 853      this._getAllNodesWithTag(articleContent, ["br"]),
 854      function (br) {
 855        var next = this._nextNode(br.nextSibling);
 856        if (next && next.tagName == "P") {
 857          br.remove();
 858        }
 859      }
 860    );
 861
 862    // Remove single-cell tables
 863    this._forEachNode(
 864      this._getAllNodesWithTag(articleContent, ["table"]),
 865      function (table) {
 866        var tbody = this._hasSingleTagInsideElement(table, "TBODY")
 867          ? table.firstElementChild
 868          : table;
 869        if (this._hasSingleTagInsideElement(tbody, "TR")) {
 870          var row = tbody.firstElementChild;
 871          if (this._hasSingleTagInsideElement(row, "TD")) {
 872            var cell = row.firstElementChild;
 873            cell = this._setNodeTag(
 874              cell,
 875              this._everyNode(cell.childNodes, this._isPhrasingContent)
 876                ? "P"
 877                : "DIV"
 878            );
 879            table.parentNode.replaceChild(cell, table);
 880          }
 881        }
 882      }
 883    );
 884  },
 885
 886  /**
 887   * Initialize a node with the readability object. Also checks the
 888   * className/id for special names to add to its score.
 889   *
 890   * @param Element
 891   * @return void
 892   **/
 893  _initializeNode(node) {
 894    node.readability = { contentScore: 0 };
 895
 896    switch (node.tagName) {
 897      case "DIV":
 898        node.readability.contentScore += 5;
 899        break;
 900
 901      case "PRE":
 902      case "TD":
 903      case "BLOCKQUOTE":
 904        node.readability.contentScore += 3;
 905        break;
 906
 907      case "ADDRESS":
 908      case "OL":
 909      case "UL":
 910      case "DL":
 911      case "DD":
 912      case "DT":
 913      case "LI":
 914      case "FORM":
 915        node.readability.contentScore -= 3;
 916        break;
 917
 918      case "H1":
 919      case "H2":
 920      case "H3":
 921      case "H4":
 922      case "H5":
 923      case "H6":
 924      case "TH":
 925        node.readability.contentScore -= 5;
 926        break;
 927    }
 928
 929    node.readability.contentScore += this._getClassWeight(node);
 930  },
 931
 932  _removeAndGetNext(node) {
 933    var nextNode = this._getNextNode(node, true);
 934    node.remove();
 935    return nextNode;
 936  },
 937
 938  /**
 939   * Traverse the DOM from node to node, starting at the node passed in.
 940   * Pass true for the second parameter to indicate this node itself
 941   * (and its kids) are going away, and we want the next node over.
 942   *
 943   * Calling this in a loop will traverse the DOM depth-first.
 944   *
 945   * @param {Element} node
 946   * @param {boolean} ignoreSelfAndKids
 947   * @return {Element}
 948   */
 949  _getNextNode(node, ignoreSelfAndKids) {
 950    // First check for kids if those aren't being ignored
 951    if (!ignoreSelfAndKids && node.firstElementChild) {
 952      return node.firstElementChild;
 953    }
 954    // Then for siblings...
 955    if (node.nextElementSibling) {
 956      return node.nextElementSibling;
 957    }
 958    // And finally, move up the parent chain *and* find a sibling
 959    // (because this is depth-first traversal, we will have already
 960    // seen the parent nodes themselves).
 961    do {
 962      node = node.parentNode;
 963    } while (node && !node.nextElementSibling);
 964    return node && node.nextElementSibling;
 965  },
 966
 967  // compares second text to first one
 968  // 1 = same text, 0 = completely different text
 969  // works the way that it splits both texts into words and then finds words that are unique in second text
 970  // the result is given by the lower length of unique parts
 971  _textSimilarity(textA, textB) {
 972    var tokensA = textA
 973      .toLowerCase()
 974      .split(this.REGEXPS.tokenize)
 975      .filter(Boolean);
 976    var tokensB = textB
 977      .toLowerCase()
 978      .split(this.REGEXPS.tokenize)
 979      .filter(Boolean);
 980    if (!tokensA.length || !tokensB.length) {
 981      return 0;
 982    }
 983    var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
 984    var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
 985    return 1 - distanceB;
 986  },
 987
 988  /**
 989   * Checks whether an element node contains a valid byline
 990   *
 991   * @param node {Element}
 992   * @param matchString {string}
 993   * @return boolean
 994   */
 995  _isValidByline(node, matchString) {
 996    var rel = node.getAttribute("rel");
 997    var itemprop = node.getAttribute("itemprop");
 998    var bylineLength = node.textContent.trim().length;
 999
1000    return (
1001      (rel === "author" ||
1002        (itemprop && itemprop.includes("author")) ||
1003        this.REGEXPS.byline.test(matchString)) &&
1004      !!bylineLength &&
1005      bylineLength < 100
1006    );
1007  },
1008
1009  _getNodeAncestors(node, maxDepth) {
1010    maxDepth = maxDepth || 0;
1011    var i = 0,
1012      ancestors = [];
1013    while (node.parentNode) {
1014      ancestors.push(node.parentNode);
1015      if (maxDepth && ++i === maxDepth) {
1016        break;
1017      }
1018      node = node.parentNode;
1019    }
1020    return ancestors;
1021  },
1022
1023  /***
1024   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
1025   *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
1026   *
1027   * @param page a document to run upon. Needs to be a full document, complete with body.
1028   * @return Element
1029   **/
1030  /* eslint-disable-next-line complexity */
1031  _grabArticle(page) {
        // Overview: each pass of the `while (true)` loop below
        //   1. walks the document, dropping hidden/unlikely nodes and collecting
        //      the elements to score;
        //   2. scores the ancestors of every collected element;
        //   3. picks a top candidate (or wraps the whole page in a new DIV);
        //   4. moves the top candidate and its qualifying siblings into a new DIV;
        //   5. cleans that DIV with _prepArticle.
        // If the result has fewer than this._charThreshold characters, the page
        // HTML is restored from the cached copy, one flag is removed and the loop
        // runs again; once no flag is left, the longest attempt is returned
        // (or null when even that attempt has no text).
1032    this.log("**** grabArticle ****");
1033    var doc = this._doc;
        // Only an explicit `null` turns paging off; when `page` is omitted
        // (undefined) this evaluates to true.
1034    var isPaging = page !== null;
1035    page = page ? page : this._doc.body;
1036
1037    // We can't grab an article if we don't have a page!
1038    if (!page) {
1039      this.log("No body found in document. Abort.");
1040      return null;
1041    }
1042
        // Snapshot of the page markup, used to restore `page` before a retry.
1043    var pageCacheHtml = page.innerHTML;
1044
1045    while (true) {
1046      this.log("Starting grabArticle loop");
1047      var stripUnlikelyCandidates = this._flagIsActive(
1048        this.FLAG_STRIP_UNLIKELYS
1049      );
1050
1051      // First, node prepping. Trash nodes that look cruddy (like ones with the
1052      // class name "comment", etc), and turn divs into P tags where they have been
1053      // used inappropriately (as in, where they contain no other block level elements.)
1054      var elementsToScore = [];
1055      var node = this._doc.documentElement;
1056
          // At most one header duplicating the title is removed per pass.
1057      let shouldRemoveTitleHeader = true;
1058
1059      while (node) {
1060        if (node.tagName === "HTML") {
1061          this._articleLang = node.getAttribute("lang");
1062        }
1063
1064        var matchString = node.className + " " + node.id;
1065
1066        if (!this._isProbablyVisible(node)) {
1067          this.log("Removing hidden node - " + matchString);
1068          node = this._removeAndGetNext(node);
1069          continue;
1070        }
1071
1072        // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
1073        if (
1074          node.getAttribute("aria-modal") == "true" &&
1075          node.getAttribute("role") == "dialog"
1076        ) {
1077          node = this._removeAndGetNext(node);
1078          continue;
1079        }
1080
1081        // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
1082        if (
1083          !this._articleByline &&
1084          !this._metadata.byline &&
1085          this._isValidByline(node, matchString)
1086        ) {
1087          // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
              // The search is limited to this node's subtree: it stops at the
              // node that follows the subtree in document order.
1088          var endOfSearchMarkerNode = this._getNextNode(node, true);
1089          var next = this._getNextNode(node);
1090          var itemPropNameNode = null;
1091          while (next && next != endOfSearchMarkerNode) {
1092            var itemprop = next.getAttribute("itemprop");
1093            if (itemprop && itemprop.includes("name")) {
1094              itemPropNameNode = next;
1095              break;
1096            } else {
1097              next = this._getNextNode(next);
1098            }
1099          }
1100          this._articleByline = (itemPropNameNode ?? node).textContent.trim();
1101          node = this._removeAndGetNext(node);
1102          continue;
1103        }
1104
1105        if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
1106          this.log(
1107            "Removing header: ",
1108            node.textContent.trim(),
1109            this._articleTitle.trim()
1110          );
1111          shouldRemoveTitleHeader = false;
1112          node = this._removeAndGetNext(node);
1113          continue;
1114        }
1115
1116        // Remove unlikely candidates
1117        if (stripUnlikelyCandidates) {
1118          if (
1119            this.REGEXPS.unlikelyCandidates.test(matchString) &&
1120            !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
1121            !this._hasAncestorTag(node, "table") &&
1122            !this._hasAncestorTag(node, "code") &&
1123            node.tagName !== "BODY" &&
1124            node.tagName !== "A"
1125          ) {
1126            this.log("Removing unlikely candidate - " + matchString);
1127            node = this._removeAndGetNext(node);
1128            continue;
1129          }
1130
1131          if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
1132            this.log(
1133              "Removing content with role " +
1134                node.getAttribute("role") +
1135                " - " +
1136                matchString
1137            );
1138            node = this._removeAndGetNext(node);
1139            continue;
1140          }
1141        }
1142
1143        // Remove DIV, SECTION, HEADER, and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
1144        if (
1145          (node.tagName === "DIV" ||
1146            node.tagName === "SECTION" ||
1147            node.tagName === "HEADER" ||
1148            node.tagName === "H1" ||
1149            node.tagName === "H2" ||
1150            node.tagName === "H3" ||
1151            node.tagName === "H4" ||
1152            node.tagName === "H5" ||
1153            node.tagName === "H6") &&
1154          this._isElementWithoutContent(node)
1155        ) {
1156          node = this._removeAndGetNext(node);
1157          continue;
1158        }
1159
1160        if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
1161          elementsToScore.push(node);
1162        }
1163
1164        // Turn all divs that don't have children block level elements into p's
1165        if (node.tagName === "DIV") {
1166          // Put phrasing content into paragraphs.
              // `p` is the paragraph currently being filled; it is closed
              // (reset to null) at the first non-phrasing child.
1167          var p = null;
1168          var childNode = node.firstChild;
1169          while (childNode) {
                // Captured up front because childNode may be moved into `p` below.
1170            var nextSibling = childNode.nextSibling;
1171            if (this._isPhrasingContent(childNode)) {
1172              if (p !== null) {
1173                p.appendChild(childNode);
1174              } else if (!this._isWhitespace(childNode)) {
1175                p = doc.createElement("p");
1176                node.replaceChild(p, childNode);
1177                p.appendChild(childNode);
1178              }
1179            } else if (p !== null) {
1180              while (p.lastChild && this._isWhitespace(p.lastChild)) {
1181                p.lastChild.remove();
1182              }
1183              p = null;
1184            }
1185            childNode = nextSibling;
1186          }
1187
1188          // Sites like http://mobile.slate.com enclose each paragraph in a DIV
1189          // element. DIVs with only a P element inside and no text content can be
1190          // safely converted into plain P elements to avoid confusing the scoring
1191          // algorithm with DIVs which are, in practice, paragraphs.
1192          if (
1193            this._hasSingleTagInsideElement(node, "P") &&
1194            this._getLinkDensity(node) < 0.25
1195          ) {
1196            var newNode = node.children[0];
1197            node.parentNode.replaceChild(newNode, node);
1198            node = newNode;
1199            elementsToScore.push(node);
1200          } else if (!this._hasChildBlockElement(node)) {
1201            node = this._setNodeTag(node, "P");
1202            elementsToScore.push(node);
1203          }
1204        }
1205        node = this._getNextNode(node);
1206      }
1207
1208      /**
1209       * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1210       * Then add their score to their parent node.
1211       *
1212       * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1213       **/
1214      var candidates = [];
1215      this._forEachNode(elementsToScore, function (elementToScore) {
1216        if (
1217          !elementToScore.parentNode ||
1218          typeof elementToScore.parentNode.tagName === "undefined"
1219        ) {
1220          return;
1221        }
1222
1223        // If this paragraph is less than 25 characters, don't even count it.
1224        var innerText = this._getInnerText(elementToScore);
1225        if (innerText.length < 25) {
1226          return;
1227        }
1228
1229        // Exclude nodes with no ancestor.
1230        var ancestors = this._getNodeAncestors(elementToScore, 5);
1231        if (ancestors.length === 0) {
1232          return;
1233        }
1234
1235        var contentScore = 0;
1236
1237        // Add a point for the paragraph itself as a base.
1238        contentScore += 1;
1239
1240        // Add points for any commas within this paragraph.
            // split() always yields at least one piece, so this adds at least 1
            // even when the text contains no comma.
1241        contentScore += innerText.split(this.REGEXPS.commas).length;
1242
1243        // For every 100 characters in this paragraph, add another point. Up to 3 points.
1244        contentScore += Math.min(Math.floor(innerText.length / 100), 3);
1245
1246        // Initialize and score ancestors.
1247        this._forEachNode(ancestors, function (ancestor, level) {
1248          if (
1249            !ancestor.tagName ||
1250            !ancestor.parentNode ||
1251            typeof ancestor.parentNode.tagName === "undefined"
1252          ) {
1253            return;
1254          }
1255
1256          if (typeof ancestor.readability === "undefined") {
1257            this._initializeNode(ancestor);
1258            candidates.push(ancestor);
1259          }
1260
1261          // Node score divider:
1262          // - parent:             1 (no division)
1263          // - grandparent:        2
1264          // - great grandparent+: ancestor level * 3
1265          if (level === 0) {
1266            var scoreDivider = 1;
1267          } else if (level === 1) {
1268            scoreDivider = 2;
1269          } else {
1270            scoreDivider = level * 3;
1271          }
1272          ancestor.readability.contentScore += contentScore / scoreDivider;
1273        });
1274      });
1275
1276      // After we've calculated scores, loop through all of the possible
1277      // candidate nodes we found and find the one with the highest score.
          // topCandidates is kept sorted by descending score and capped at
          // this._nbTopCandidates entries.
1278      var topCandidates = [];
1279      for (var c = 0, cl = candidates.length; c < cl; c += 1) {
1280        var candidate = candidates[c];
1281
1282        // Scale the final candidates score based on link density. Good content
1283        // should have a relatively small link density (5% or less) and be mostly
1284        // unaffected by this operation.
1285        var candidateScore =
1286          candidate.readability.contentScore *
1287          (1 - this._getLinkDensity(candidate));
1288        candidate.readability.contentScore = candidateScore;
1289
1290        this.log("Candidate:", candidate, "with score " + candidateScore);
1291
1292        for (var t = 0; t < this._nbTopCandidates; t++) {
1293          var aTopCandidate = topCandidates[t];
1294
1295          if (
1296            !aTopCandidate ||
1297            candidateScore > aTopCandidate.readability.contentScore
1298          ) {
1299            topCandidates.splice(t, 0, candidate);
1300            if (topCandidates.length > this._nbTopCandidates) {
1301              topCandidates.pop();
1302            }
1303            break;
1304          }
1305        }
1306      }
1307
1308      var topCandidate = topCandidates[0] || null;
1309      var neededToCreateTopCandidate = false;
1310      var parentOfTopCandidate;
1311
1312      // If we still have no top candidate, just use the body as a last resort.
1313      // We also have to copy the body node so it is something we can modify.
1314      if (topCandidate === null || topCandidate.tagName === "BODY") {
1315        // Move all of the page's children into topCandidate
1316        topCandidate = doc.createElement("DIV");
1317        neededToCreateTopCandidate = true;
1318        // Move everything (not just elements, also text nodes etc.) into the container
1319        // so we even include text directly in the body:
1320        while (page.firstChild) {
1321          this.log("Moving child out:", page.firstChild);
1322          topCandidate.appendChild(page.firstChild);
1323        }
1324
1325        page.appendChild(topCandidate);
1326
1327        this._initializeNode(topCandidate);
1328      } else if (topCandidate) {
1329        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
1330        // and whose scores are quite close to the current `topCandidate` node's score.
1331        var alternativeCandidateAncestors = [];
1332        for (var i = 1; i < topCandidates.length; i++) {
1333          if (
1334            topCandidates[i].readability.contentScore /
1335              topCandidate.readability.contentScore >=
1336            0.75
1337          ) {
1338            alternativeCandidateAncestors.push(
1339              this._getNodeAncestors(topCandidates[i])
1340            );
1341          }
1342        }
1343        var MINIMUM_TOPCANDIDATES = 3;
1344        if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
1345          parentOfTopCandidate = topCandidate.parentNode;
1346          while (parentOfTopCandidate.tagName !== "BODY") {
                // Count how many of the alternative ancestor lists contain this
                // ancestor; counting stops as soon as the minimum is reached.
1347            var listsContainingThisAncestor = 0;
1348            for (
1349              var ancestorIndex = 0;
1350              ancestorIndex < alternativeCandidateAncestors.length &&
1351              listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
1352              ancestorIndex++
1353            ) {
1354              listsContainingThisAncestor += Number(
1355                alternativeCandidateAncestors[ancestorIndex].includes(
1356                  parentOfTopCandidate
1357                )
1358              );
1359            }
1360            if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
1361              topCandidate = parentOfTopCandidate;
1362              break;
1363            }
1364            parentOfTopCandidate = parentOfTopCandidate.parentNode;
1365          }
1366        }
1367        if (!topCandidate.readability) {
1368          this._initializeNode(topCandidate);
1369        }
1370
1371        // Because of our bonus system, parents of candidates might have scores
1372        // themselves. They get half of the node. There won't be nodes with higher
1373        // scores than our topCandidate, but if we see the score going *up* in the first
1374        // few steps up the tree, that's a decent sign that there might be more content
1375        // lurking in other places that we want to unify in. The sibling stuff
1376        // below does some of that - but only if we've looked high enough up the DOM
1377        // tree.
1378        parentOfTopCandidate = topCandidate.parentNode;
1379        var lastScore = topCandidate.readability.contentScore;
1380        // The scores shouldn't get too low.
1381        var scoreThreshold = lastScore / 3;
1382        while (parentOfTopCandidate.tagName !== "BODY") {
              // Ancestors that were never scored are skipped over.
1383          if (!parentOfTopCandidate.readability) {
1384            parentOfTopCandidate = parentOfTopCandidate.parentNode;
1385            continue;
1386          }
1387          var parentScore = parentOfTopCandidate.readability.contentScore;
1388          if (parentScore < scoreThreshold) {
1389            break;
1390          }
1391          if (parentScore > lastScore) {
1392            // Alright! We found a better parent to use.
1393            topCandidate = parentOfTopCandidate;
1394            break;
1395          }
1396          lastScore = parentOfTopCandidate.readability.contentScore;
1397          parentOfTopCandidate = parentOfTopCandidate.parentNode;
1398        }
1399
1400        // If the top candidate is the only child, use parent instead. This will help sibling
1401        // joining logic when adjacent content is actually located in parent's sibling node.
1402        parentOfTopCandidate = topCandidate.parentNode;
1403        while (
1404          parentOfTopCandidate.tagName != "BODY" &&
1405          parentOfTopCandidate.children.length == 1
1406        ) {
1407          topCandidate = parentOfTopCandidate;
1408          parentOfTopCandidate = topCandidate.parentNode;
1409        }
1410        if (!topCandidate.readability) {
1411          this._initializeNode(topCandidate);
1412        }
1413      }
1414
1415      // Now that we have the top candidate, look through its siblings for content
1416      // that might also be related. Things like preambles, content split by ads
1417      // that we removed, etc.
1418      var articleContent = doc.createElement("DIV");
1419      if (isPaging) {
1420        articleContent.id = "readability-content";
1421      }
1422
          // A sibling needs at least 20% of the top candidate's score, and never
          // less than 10, to be appended on score alone.
1423      var siblingScoreThreshold = Math.max(
1424        10,
1425        topCandidate.readability.contentScore * 0.2
1426      );
1427      // Keep potential top candidate's parent node to try to get text direction of it later.
1428      parentOfTopCandidate = topCandidate.parentNode;
1429      var siblings = parentOfTopCandidate.children;
1430
1431      for (var s = 0, sl = siblings.length; s < sl; s++) {
1432        var sibling = siblings[s];
1433        var append = false;
1434
1435        this.log(
1436          "Looking at sibling node:",
1437          sibling,
1438          sibling.readability
1439            ? "with score " + sibling.readability.contentScore
1440            : ""
1441        );
1442        this.log(
1443          "Sibling has score",
1444          sibling.readability ? sibling.readability.contentScore : "Unknown"
1445        );
1446
1447        if (sibling === topCandidate) {
1448          append = true;
1449        } else {
1450          var contentBonus = 0;
1451
1452          // Give a bonus if sibling nodes and top candidates have the exact same classname
1453          if (
1454            sibling.className === topCandidate.className &&
1455            topCandidate.className !== ""
1456          ) {
1457            contentBonus += topCandidate.readability.contentScore * 0.2;
1458          }
1459
1460          if (
1461            sibling.readability &&
1462            sibling.readability.contentScore + contentBonus >=
1463              siblingScoreThreshold
1464          ) {
1465            append = true;
1466          } else if (sibling.nodeName === "P") {
                // Paragraphs without a sufficient score can still be appended:
                // long ones with low link density, or short link-free ones that
                // contain a period followed by a space or the end of the text.
1467            var linkDensity = this._getLinkDensity(sibling);
1468            var nodeContent = this._getInnerText(sibling);
1469            var nodeLength = nodeContent.length;
1470
1471            if (nodeLength > 80 && linkDensity < 0.25) {
1472              append = true;
1473            } else if (
1474              nodeLength < 80 &&
1475              nodeLength > 0 &&
1476              linkDensity === 0 &&
1477              nodeContent.search(/\.( |$)/) !== -1
1478            ) {
1479              append = true;
1480            }
1481          }
1482        }
1483
1484        if (append) {
1485          this.log("Appending node:", sibling);
1486
1487          if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
1488            // We have a node that isn't a common block level element, like a form or td tag.
1489            // Turn it into a div so it doesn't get filtered out later by accident.
1490            this.log("Altering sibling:", sibling, "to div.");
1491
1492            sibling = this._setNodeTag(sibling, "DIV");
1493          }
1494
1495          articleContent.appendChild(sibling);
1496          // Fetch children again to make it compatible
1497          // with DOM parsers without live collection support.
1498          siblings = parentOfTopCandidate.children;
1499          // siblings is a reference to the children array, and
1500          // sibling is removed from the array when we call appendChild().
1501          // As a result, we must revisit this index since the nodes
1502          // have been shifted.
1503          s -= 1;
1504          sl -= 1;
1505        }
1506      }
1507
1508      if (this._debug) {
1509        this.log("Article content pre-prep: " + articleContent.innerHTML);
1510      }
1511      // So we have all of the content that we need. Now we clean it up for presentation.
1512      this._prepArticle(articleContent);
1513      if (this._debug) {
1514        this.log("Article content post-prep: " + articleContent.innerHTML);
1515      }
1516
1517      if (neededToCreateTopCandidate) {
1518        // We already created a fake div thing, and there wouldn't have been any siblings left
1519        // for the previous loop, so there's no point trying to create a new div, and then
1520        // move all the children over. Just assign IDs and class names here. No need to append
1521        // because that already happened anyway.
1522        topCandidate.id = "readability-page-1";
1523        topCandidate.className = "page";
1524      } else {
1525        var div = doc.createElement("DIV");
1526        div.id = "readability-page-1";
1527        div.className = "page";
1528        while (articleContent.firstChild) {
1529          div.appendChild(articleContent.firstChild);
1530        }
1531        articleContent.appendChild(div);
1532      }
1533
1534      if (this._debug) {
1535        this.log("Article content after paging: " + articleContent.innerHTML);
1536      }
1537
1538      var parseSuccessful = true;
1539
1540      // Now that we've gone through the full algorithm, check to see if
1541      // we got any meaningful content. If we didn't, we may need to re-run
1542      // grabArticle with different flags set. This gives us a higher likelihood of
1543      // finding the content, and the sieve approach gives us a higher likelihood of
1544      // finding the -right- content.
1545      var textLength = this._getInnerText(articleContent, true).length;
1546      if (textLength < this._charThreshold) {
1547        parseSuccessful = false;
            // Undo this pass's DOM changes by restoring the cached markup.
1548        // eslint-disable-next-line no-unsanitized/property
1549        page.innerHTML = pageCacheHtml;
1550
            // Remember this attempt so the longest one can be returned if every
            // retry also falls short.
1551        this._attempts.push({
1552          articleContent,
1553          textLength,
1554        });
1555
            // Retry with one fewer flag; flags are dropped in this fixed order.
1556        if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
1557          this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
1558        } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
1559          this._removeFlag(this.FLAG_WEIGHT_CLASSES);
1560        } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
1561          this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
1562        } else {
1563          // No luck after removing flags, just return the longest text we found during the different loops
1564          this._attempts.sort(function (a, b) {
1565            return b.textLength - a.textLength;
1566          });
1567
1568          // But first check if we actually have something
1569          if (!this._attempts[0].textLength) {
1570            return null;
1571          }
1572
1573          articleContent = this._attempts[0].articleContent;
1574          parseSuccessful = true;
1575        }
1576      }
1577
1578      if (parseSuccessful) {
1579        // Find out text direction from ancestors of final top candidate.
            // The first node in this list with a non-empty "dir" attribute wins.
1580        var ancestors = [parentOfTopCandidate, topCandidate].concat(
1581          this._getNodeAncestors(parentOfTopCandidate)
1582        );
1583        this._someNode(ancestors, function (ancestor) {
1584          if (!ancestor.tagName) {
1585            return false;
1586          }
1587          var articleDir = ancestor.getAttribute("dir");
1588          if (articleDir) {
1589            this._articleDir = articleDir;
1590            return true;
1591          }
1592          return false;
1593        });
1594        return articleContent;
1595      }
1596    }
1597  },
1598
1599  /**
1600   * Converts some of the common HTML entities in string to their corresponding characters.
1601   *
1602   * @param str {string} - a string to unescape.
1603   * @return string without HTML entity.
1604   */
1605  _unescapeHtmlEntities(str) {
1606    if (!str) {
1607      return str;
1608    }
1609
1610    var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1611    return str
1612      .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) {
1613        return htmlEscapeMap[tag];
1614      })
1615      .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) {
1616        var num = parseInt(hex || numStr, hex ? 16 : 10);
1617
1618        // these character references are replaced by a conforming HTML parser
1619        if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
1620          num = 0xfffd;
1621        }
1622
1623        return String.fromCodePoint(num);
1624      });
1625  },
1626
1627  /**
1628   * Try to extract metadata from JSON-LD object.
1629   * For now, only Schema.org objects of type Article or its subtypes are supported.
1630   * @return Object with any metadata that could be extracted (possibly none)
1631   */
1632  _getJSONLD(doc) {
1633    var scripts = this._getAllNodesWithTag(doc, ["script"]);
1634
1635    var metadata;
1636
1637    this._forEachNode(scripts, function (jsonLdElement) {
1638      if (
1639        !metadata &&
1640        jsonLdElement.getAttribute("type") === "application/ld+json"
1641      ) {
1642        try {
1643          // Strip CDATA markers if present
1644          var content = jsonLdElement.textContent.replace(
1645            /^\s*<!\[CDATA\[|\]\]>\s*$/g,
1646            ""
1647          );
1648          var parsed = JSON.parse(content);
1649
1650          if (Array.isArray(parsed)) {
1651            parsed = parsed.find(it => {
1652              return (
1653                it["@type"] &&
1654                it["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1655              );
1656            });
1657            if (!parsed) {
1658              return;
1659            }
1660          }
1661
1662          var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/;
1663          var matches =
1664            (typeof parsed["@context"] === "string" &&
1665              parsed["@context"].match(schemaDotOrgRegex)) ||
1666            (typeof parsed["@context"] === "object" &&
1667              typeof parsed["@context"]["@vocab"] == "string" &&
1668              parsed["@context"]["@vocab"].match(schemaDotOrgRegex));
1669
1670          if (!matches) {
1671            return;
1672          }
1673
1674          if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
1675            parsed = parsed["@graph"].find(it => {
1676              return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes);
1677            });
1678          }
1679
1680          if (
1681            !parsed ||
1682            !parsed["@type"] ||
1683            !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1684          ) {
1685            return;
1686          }
1687
1688          metadata = {};
1689
1690          if (
1691            typeof parsed.name === "string" &&
1692            typeof parsed.headline === "string" &&
1693            parsed.name !== parsed.headline
1694          ) {
1695            // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1696            // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1697            // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1698
1699            var title = this._getArticleTitle();
1700            var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1701            var headlineMatches =
1702              this._textSimilarity(parsed.headline, title) > 0.75;
1703
1704            if (headlineMatches && !nameMatches) {
1705              metadata.title = parsed.headline;
1706            } else {
1707              metadata.title = parsed.name;
1708            }
1709          } else if (typeof parsed.name === "string") {
1710            metadata.title = parsed.name.trim();
1711          } else if (typeof parsed.headline === "string") {
1712            metadata.title = parsed.headline.trim();
1713          }
1714          if (parsed.author) {
1715            if (typeof parsed.author.name === "string") {
1716              metadata.byline = parsed.author.name.trim();
1717            } else if (
1718              Array.isArray(parsed.author) &&
1719              parsed.author[0] &&
1720              typeof parsed.author[0].name === "string"
1721            ) {
1722              metadata.byline = parsed.author
1723                .filter(function (author) {
1724                  return author && typeof author.name === "string";
1725                })
1726                .map(function (author) {
1727                  return author.name.trim();
1728                })
1729                .join(", ");
1730            }
1731          }
1732          if (typeof parsed.description === "string") {
1733            metadata.excerpt = parsed.description.trim();
1734          }
1735          if (parsed.publisher && typeof parsed.publisher.name === "string") {
1736            metadata.siteName = parsed.publisher.name.trim();
1737          }
1738          if (typeof parsed.datePublished === "string") {
1739            metadata.datePublished = parsed.datePublished.trim();
1740          }
1741        } catch (err) {
1742          this.log(err.message);
1743        }
1744      }
1745    });
1746    return metadata ? metadata : {};
1747  },
1748
  /**
   * Collects article metadata (title, byline, excerpt, site name and
   * published time) from the document's <meta> tags.
   *
   * @param {Object} jsonld — object containing any metadata that
   * could be extracted from JSON-LD object. Its values take precedence
   * over values found in <meta> tags.
   *
   * @return Object with "title", "byline", "excerpt", "siteName" and
   * "publishedTime" properties
   */
1757  _getArticleMetadata(jsonld) {
1758    var metadata = {};
1759    var values = {};
1760    var metaElements = this._doc.getElementsByTagName("meta");
1761
1762    // property is a space-separated list of values
1763    var propertyPattern =
1764      /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
1765
1766    // name is a single value
1767    var namePattern =
1768      /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
1769
1770    // Find description tags.
1771    this._forEachNode(metaElements, function (element) {
1772      var elementName = element.getAttribute("name");
1773      var elementProperty = element.getAttribute("property");
1774      var content = element.getAttribute("content");
1775      if (!content) {
1776        return;
1777      }
1778      var matches = null;
1779      var name = null;
1780
1781      if (elementProperty) {
1782        matches = elementProperty.match(propertyPattern);
1783        if (matches) {
1784          // Convert to lowercase, and remove any whitespace
1785          // so we can match below.
1786          name = matches[0].toLowerCase().replace(/\s/g, "");
1787          // multiple authors
1788          values[name] = content.trim();
1789        }
1790      }
1791      if (!matches && elementName && namePattern.test(elementName)) {
1792        name = elementName;
1793        if (content) {
1794          // Convert to lowercase, remove any whitespace, and convert dots
1795          // to colons so we can match below.
1796          name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
1797          values[name] = content.trim();
1798        }
1799      }
1800    });
1801
1802    // get title
1803    metadata.title =
1804      jsonld.title ||
1805      values["dc:title"] ||
1806      values["dcterm:title"] ||
1807      values["og:title"] ||
1808      values["weibo:article:title"] ||
1809      values["weibo:webpage:title"] ||
1810      values.title ||
1811      values["twitter:title"] ||
1812      values["parsely-title"];
1813
1814    if (!metadata.title) {
1815      metadata.title = this._getArticleTitle();
1816    }
1817
1818    const articleAuthor =
1819      typeof values["article:author"] === "string" &&
1820      !this._isUrl(values["article:author"])
1821        ? values["article:author"]
1822        : undefined;
1823
1824    // get author
1825    metadata.byline =
1826      jsonld.byline ||
1827      values["dc:creator"] ||
1828      values["dcterm:creator"] ||
1829      values.author ||
1830      values["parsely-author"] ||
1831      articleAuthor;
1832
1833    // get description
1834    metadata.excerpt =
1835      jsonld.excerpt ||
1836      values["dc:description"] ||
1837      values["dcterm:description"] ||
1838      values["og:description"] ||
1839      values["weibo:article:description"] ||
1840      values["weibo:webpage:description"] ||
1841      values.description ||
1842      values["twitter:description"];
1843
1844    // get site name
1845    metadata.siteName = jsonld.siteName || values["og:site_name"];
1846
1847    // get article published time
1848    metadata.publishedTime =
1849      jsonld.datePublished ||
1850      values["article:published_time"] ||
1851      values["parsely-pub-date"] ||
1852      null;
1853
1854    // in many sites the meta value is escaped with HTML entities,
1855    // so here we need to unescape it
1856    metadata.title = this._unescapeHtmlEntities(metadata.title);
1857    metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1858    metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1859    metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1860    metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1861
1862    return metadata;
1863  },
1864
  /**
   * Check whether the node is an image, or contains exactly one image
   * (as a direct child or as a deeper descendant) and no text.
   *
   * @param Element
   **/
1871  _isSingleImage(node) {
1872    while (node) {
1873      if (node.tagName === "IMG") {
1874        return true;
1875      }
1876      if (node.children.length !== 1 || node.textContent.trim() !== "") {
1877        return false;
1878      }
1879      node = node.children[0];
1880    }
1881    return false;
1882  },
1883
  /**
   * Find all <noscript> elements that directly follow an image (or an element
   * wrapping a single image) and which themselves contain only one <img>.
   * Replace the preceding image with the image from inside the <noscript> tag.
   * This improves the quality of the images we use on some sites (e.g. Medium).
   * The <noscript> element itself is not removed by this method.
   *
   * @param Element
   **/
1892  _unwrapNoscriptImages(doc) {
1893    // Find img without source or attributes that might contains image, and remove it.
1894    // This is done to prevent a placeholder img is replaced by img from noscript in next step.
1895    var imgs = Array.from(doc.getElementsByTagName("img"));
1896    this._forEachNode(imgs, function (img) {
1897      for (var i = 0; i < img.attributes.length; i++) {
1898        var attr = img.attributes[i];
1899        switch (attr.name) {
1900          case "src":
1901          case "srcset":
1902          case "data-src":
1903          case "data-srcset":
1904            return;
1905        }
1906
1907        if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1908          return;
1909        }
1910      }
1911
1912      img.remove();
1913    });
1914
1915    // Next find noscript and try to extract its image
1916    var noscripts = Array.from(doc.getElementsByTagName("noscript"));
1917    this._forEachNode(noscripts, function (noscript) {
1918      // Parse content of noscript and make sure it only contains image
1919      if (!this._isSingleImage(noscript)) {
1920        return;
1921      }
1922      var tmp = doc.createElement("div");
1923      // We're running in the document context, and using unmodified
1924      // document contents, so doing this should be safe.
1925      // (Also we heavily discourage people from allowing script to
1926      // run at all in this document...)
1927      // eslint-disable-next-line no-unsanitized/property
1928      tmp.innerHTML = noscript.innerHTML;
1929
1930      // If noscript has previous sibling and it only contains image,
1931      // replace it with noscript content. However we also keep old
1932      // attributes that might contains image.
1933      var prevElement = noscript.previousElementSibling;
1934      if (prevElement && this._isSingleImage(prevElement)) {
1935        var prevImg = prevElement;
1936        if (prevImg.tagName !== "IMG") {
1937          prevImg = prevElement.getElementsByTagName("img")[0];
1938        }
1939
1940        var newImg = tmp.getElementsByTagName("img")[0];
1941        for (var i = 0; i < prevImg.attributes.length; i++) {
1942          var attr = prevImg.attributes[i];
1943          if (attr.value === "") {
1944            continue;
1945          }
1946
1947          if (
1948            attr.name === "src" ||
1949            attr.name === "srcset" ||
1950            /\.(jpg|jpeg|png|webp)/i.test(attr.value)
1951          ) {
1952            if (newImg.getAttribute(attr.name) === attr.value) {
1953              continue;
1954            }
1955
1956            var attrName = attr.name;
1957            if (newImg.hasAttribute(attrName)) {
1958              attrName = "data-old-" + attrName;
1959            }
1960
1961            newImg.setAttribute(attrName, attr.value);
1962          }
1963        }
1964
1965        noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1966      }
1967    });
1968  },
1969
1970  /**
1971   * Removes script tags from the document.
1972   *
1973   * @param Element
1974   **/
1975  _removeScripts(doc) {
1976    this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
1977  },
1978
1979  /**
1980   * Check if this node has only whitespace and a single element with given tag
1981   * Returns false if the DIV node contains non-empty text nodes
1982   * or if it contains no element with given tag or more than 1 element.
1983   *
1984   * @param Element
1985   * @param string tag of child element
1986   **/
1987  _hasSingleTagInsideElement(element, tag) {
1988    // There should be exactly 1 element child with given tag
1989    if (element.children.length != 1 || element.children[0].tagName !== tag) {
1990      return false;
1991    }
1992
1993    // And there should be no text nodes with real content
1994    return !this._someNode(element.childNodes, function (node) {
1995      return (
1996        node.nodeType === this.TEXT_NODE &&
1997        this.REGEXPS.hasContent.test(node.textContent)
1998      );
1999    });
2000  },
2001
2002  _isElementWithoutContent(node) {
2003    return (
2004      node.nodeType === this.ELEMENT_NODE &&
2005      !node.textContent.trim().length &&
2006      (!node.children.length ||
2007        node.children.length ==
2008          node.getElementsByTagName("br").length +
2009            node.getElementsByTagName("hr").length)
2010    );
2011  },
2012
2013  /**
2014   * Determine whether element has any children block level elements.
2015   *
2016   * @param Element
2017   */
2018  _hasChildBlockElement(element) {
2019    return this._someNode(element.childNodes, function (node) {
2020      return (
2021        this.DIV_TO_P_ELEMS.has(node.tagName) ||
2022        this._hasChildBlockElement(node)
2023      );
2024    });
2025  },
2026
2027  /***
2028   * Determine if a node qualifies as phrasing content.
2029   * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
2030   **/
2031  _isPhrasingContent(node) {
2032    return (
2033      node.nodeType === this.TEXT_NODE ||
2034      this.PHRASING_ELEMS.includes(node.tagName) ||
2035      ((node.tagName === "A" ||
2036        node.tagName === "DEL" ||
2037        node.tagName === "INS") &&
2038        this._everyNode(node.childNodes, this._isPhrasingContent))
2039    );
2040  },
2041
2042  _isWhitespace(node) {
2043    return (
2044      (node.nodeType === this.TEXT_NODE &&
2045        node.textContent.trim().length === 0) ||
2046      (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR")
2047    );
2048  },
2049
2050  /**
2051   * Get the inner text of a node - cross browser compatibly.
2052   * This also strips out any excess whitespace to be found.
2053   *
2054   * @param Element
2055   * @param Boolean normalizeSpaces (default: true)
2056   * @return string
2057   **/
2058  _getInnerText(e, normalizeSpaces) {
2059    normalizeSpaces =
2060      typeof normalizeSpaces === "undefined" ? true : normalizeSpaces;
2061    var textContent = e.textContent.trim();
2062
2063    if (normalizeSpaces) {
2064      return textContent.replace(this.REGEXPS.normalize, " ");
2065    }
2066    return textContent;
2067  },
2068
2069  /**
2070   * Get the number of times a string s appears in the node e.
2071   *
2072   * @param Element
2073   * @param string - what to split on. Default is ","
2074   * @return number (integer)
2075   **/
2076  _getCharCount(e, s) {
2077    s = s || ",";
2078    return this._getInnerText(e).split(s).length - 1;
2079  },
2080
2081  /**
2082   * Remove the style attribute on every e and under.
2083   * TODO: Test if getElementsByTagName(*) is faster.
2084   *
2085   * @param Element
2086   * @return void
2087   **/
2088  _cleanStyles(e) {
2089    if (!e || e.tagName.toLowerCase() === "svg") {
2090      return;
2091    }
2092
2093    // Remove `style` and deprecated presentational attributes
2094    for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
2095      e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
2096    }
2097
2098    if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) {
2099      e.removeAttribute("width");
2100      e.removeAttribute("height");
2101    }
2102
2103    var cur = e.firstElementChild;
2104    while (cur !== null) {
2105      this._cleanStyles(cur);
2106      cur = cur.nextElementSibling;
2107    }
2108  },
2109
2110  /**
2111   * Get the density of links as a percentage of the content
2112   * This is the amount of text that is inside a link divided by the total text in the node.
2113   *
2114   * @param Element
2115   * @return number (float)
2116   **/
2117  _getLinkDensity(element) {
2118    var textLength = this._getInnerText(element).length;
2119    if (textLength === 0) {
2120      return 0;
2121    }
2122
2123    var linkLength = 0;
2124
2125    // XXX implement _reduceNodeList?
2126    this._forEachNode(element.getElementsByTagName("a"), function (linkNode) {
2127      var href = linkNode.getAttribute("href");
2128      var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
2129      linkLength += this._getInnerText(linkNode).length * coefficient;
2130    });
2131
2132    return linkLength / textLength;
2133  },
2134
2135  /**
2136   * Get an elements class/id weight. Uses regular expressions to tell if this
2137   * element looks good or bad.
2138   *
2139   * @param Element
2140   * @return number (Integer)
2141   **/
2142  _getClassWeight(e) {
2143    if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
2144      return 0;
2145    }
2146
2147    var weight = 0;
2148
2149    // Look for a special classname
2150    if (typeof e.className === "string" && e.className !== "") {
2151      if (this.REGEXPS.negative.test(e.className)) {
2152        weight -= 25;
2153      }
2154
2155      if (this.REGEXPS.positive.test(e.className)) {
2156        weight += 25;
2157      }
2158    }
2159
2160    // Look for a special ID
2161    if (typeof e.id === "string" && e.id !== "") {
2162      if (this.REGEXPS.negative.test(e.id)) {
2163        weight -= 25;
2164      }
2165
2166      if (this.REGEXPS.positive.test(e.id)) {
2167        weight += 25;
2168      }
2169    }
2170
2171    return weight;
2172  },
2173
2174  /**
2175   * Clean a node of all elements of type "tag".
2176   * (Unless it's a youtube/vimeo video. People love movies.)
2177   *
2178   * @param Element
2179   * @param string tag to clean
2180   * @return void
2181   **/
2182  _clean(e, tag) {
2183    var isEmbed = ["object", "embed", "iframe"].includes(tag);
2184
2185    this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) {
2186      // Allow youtube and vimeo videos through as people usually want to see those.
2187      if (isEmbed) {
2188        // First, check the elements attributes to see if any of them contain youtube or vimeo
2189        for (var i = 0; i < element.attributes.length; i++) {
2190          if (this._allowedVideoRegex.test(element.attributes[i].value)) {
2191            return false;
2192          }
2193        }
2194
2195        // For embed with <object> tag, check inner HTML as well.
2196        if (
2197          element.tagName === "object" &&
2198          this._allowedVideoRegex.test(element.innerHTML)
2199        ) {
2200          return false;
2201        }
2202      }
2203
2204      return true;
2205    });
2206  },
2207
2208  /**
2209   * Check if a given node has one of its ancestor tag name matching the
2210   * provided one.
2211   * @param  HTMLElement node
2212   * @param  String      tagName
2213   * @param  Number      maxDepth
2214   * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
2215   * @return Boolean
2216   */
2217  _hasAncestorTag(node, tagName, maxDepth, filterFn) {
2218    maxDepth = maxDepth || 3;
2219    tagName = tagName.toUpperCase();
2220    var depth = 0;
2221    while (node.parentNode) {
2222      if (maxDepth > 0 && depth > maxDepth) {
2223        return false;
2224      }
2225      if (
2226        node.parentNode.tagName === tagName &&
2227        (!filterFn || filterFn(node.parentNode))
2228      ) {
2229        return true;
2230      }
2231      node = node.parentNode;
2232      depth++;
2233    }
2234    return false;
2235  },
2236
2237  /**
2238   * Return an object indicating how many rows and columns this table has.
2239   */
2240  _getRowAndColumnCount(table) {
2241    var rows = 0;
2242    var columns = 0;
2243    var trs = table.getElementsByTagName("tr");
2244    for (var i = 0; i < trs.length; i++) {
2245      var rowspan = trs[i].getAttribute("rowspan") || 0;
2246      if (rowspan) {
2247        rowspan = parseInt(rowspan, 10);
2248      }
2249      rows += rowspan || 1;
2250
2251      // Now look for column-related info
2252      var columnsInThisRow = 0;
2253      var cells = trs[i].getElementsByTagName("td");
2254      for (var j = 0; j < cells.length; j++) {
2255        var colspan = cells[j].getAttribute("colspan") || 0;
2256        if (colspan) {
2257          colspan = parseInt(colspan, 10);
2258        }
2259        columnsInThisRow += colspan || 1;
2260      }
2261      columns = Math.max(columns, columnsInThisRow);
2262    }
2263    return { rows, columns };
2264  },
2265
2266  /**
2267   * Look for 'data' (as opposed to 'layout') tables, for which we use
2268   * similar checks as
2269   * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
2270   */
2271  _markDataTables(root) {
2272    var tables = root.getElementsByTagName("table");
2273    for (var i = 0; i < tables.length; i++) {
2274      var table = tables[i];
2275      var role = table.getAttribute("role");
2276      if (role == "presentation") {
2277        table._readabilityDataTable = false;
2278        continue;
2279      }
2280      var datatable = table.getAttribute("datatable");
2281      if (datatable == "0") {
2282        table._readabilityDataTable = false;
2283        continue;
2284      }
2285      var summary = table.getAttribute("summary");
2286      if (summary) {
2287        table._readabilityDataTable = true;
2288        continue;
2289      }
2290
2291      var caption = table.getElementsByTagName("caption")[0];
2292      if (caption && caption.childNodes.length) {
2293        table._readabilityDataTable = true;
2294        continue;
2295      }
2296
2297      // If the table has a descendant with any of these tags, consider a data table:
2298      var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
2299      var descendantExists = function (tag) {
2300        return !!table.getElementsByTagName(tag)[0];
2301      };
2302      if (dataTableDescendants.some(descendantExists)) {
2303        this.log("Data table because found data-y descendant");
2304        table._readabilityDataTable = true;
2305        continue;
2306      }
2307
2308      // Nested tables indicate a layout table:
2309      if (table.getElementsByTagName("table")[0]) {
2310        table._readabilityDataTable = false;
2311        continue;
2312      }
2313
2314      var sizeInfo = this._getRowAndColumnCount(table);
2315
2316      if (sizeInfo.columns == 1 || sizeInfo.rows == 1) {
2317        // single colum/row tables are commonly used for page layout purposes.
2318        table._readabilityDataTable = false;
2319        continue;
2320      }
2321
2322      if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
2323        table._readabilityDataTable = true;
2324        continue;
2325      }
2326      // Now just go by size entirely:
2327      table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
2328    }
2329  },
2330
2331  /* convert images and figures that have properties like data-src into images that can be loaded without JS */
2332  _fixLazyImages(root) {
2333    this._forEachNode(
2334      this._getAllNodesWithTag(root, ["img", "picture", "figure"]),
2335      function (elem) {
2336        // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
2337        // So, here we check if the data uri is too short, just might as well remove it.
2338        if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
2339          // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
2340          var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
2341          if (parts[1] === "image/svg+xml") {
2342            return;
2343          }
2344
2345          // Make sure this element has other attributes which contains image.
2346          // If it doesn't, then this src is important and shouldn't be removed.
2347          var srcCouldBeRemoved = false;
2348          for (var i = 0; i < elem.attributes.length; i++) {
2349            var attr = elem.attributes[i];
2350            if (attr.name === "src") {
2351              continue;
2352            }
2353
2354            if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
2355              srcCouldBeRemoved = true;
2356              break;
2357            }
2358          }
2359
2360          // Here we assume if image is less than 100 bytes (or 133 after encoded to base64)
2361          // it will be too small, therefore it might be placeholder image.
2362          if (srcCouldBeRemoved) {
2363            var b64starts = parts[0].length;
2364            var b64length = elem.src.length - b64starts;
2365            if (b64length < 133) {
2366              elem.removeAttribute("src");
2367            }
2368          }
2369        }
2370
2371        // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
2372        if (
2373          (elem.src || (elem.srcset && elem.srcset != "null")) &&
2374          !elem.className.toLowerCase().includes("lazy")
2375        ) {
2376          return;
2377        }
2378
2379        for (var j = 0; j < elem.attributes.length; j++) {
2380          attr = elem.attributes[j];
2381          if (
2382            attr.name === "src" ||
2383            attr.name === "srcset" ||
2384            attr.name === "alt"
2385          ) {
2386            continue;
2387          }
2388          var copyTo = null;
2389          if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
2390            copyTo = "srcset";
2391          } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
2392            copyTo = "src";
2393          }
2394          if (copyTo) {
2395            //if this is an img or picture, set the attribute directly
2396            if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
2397              elem.setAttribute(copyTo, attr.value);
2398            } else if (
2399              elem.tagName === "FIGURE" &&
2400              !this._getAllNodesWithTag(elem, ["img", "picture"]).length
2401            ) {
2402              //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
2403              //see the nytimes-3 testcase for an example
2404              var img = this._doc.createElement("img");
2405              img.setAttribute(copyTo, attr.value);
2406              elem.appendChild(img);
2407            }
2408          }
2409        }
2410      }
2411    );
2412  },
2413
2414  _getTextDensity(e, tags) {
2415    var textLength = this._getInnerText(e, true).length;
2416    if (textLength === 0) {
2417      return 0;
2418    }
2419    var childrenLength = 0;
2420    var children = this._getAllNodesWithTag(e, tags);
2421    this._forEachNode(
2422      children,
2423      child => (childrenLength += this._getInnerText(child, true).length)
2424    );
2425    return childrenLength / textLength;
2426  },
2427
2428  /**
2429   * Clean an element of all tags of type "tag" if they look fishy.
2430   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2431   *
2432   * @return void
2433   **/
2434  _cleanConditionally(e, tag) {
2435    if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
2436      return;
2437    }
2438
2439    // Gather counts for other typical elements embedded within.
2440    // Traverse backwards so we can remove nodes at the same time
2441    // without effecting the traversal.
2442    //
2443    // TODO: Consider taking into account original contentScore here.
2444    this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
2445      // First check if this node IS data table, in which case don't remove it.
2446      var isDataTable = function (t) {
2447        return t._readabilityDataTable;
2448      };
2449
2450      var isList = tag === "ul" || tag === "ol";
2451      if (!isList) {
2452        var listLength = 0;
2453        var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
2454        this._forEachNode(
2455          listNodes,
2456          list => (listLength += this._getInnerText(list).length)
2457        );
2458        isList = listLength / this._getInnerText(node).length > 0.9;
2459      }
2460
2461      if (tag === "table" && isDataTable(node)) {
2462        return false;
2463      }
2464
2465      // Next check if we're inside a data table, in which case don't remove it as well.
2466      if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
2467        return false;
2468      }
2469
2470      if (this._hasAncestorTag(node, "code")) {
2471        return false;
2472      }
2473
2474      // keep element if it has a data tables
2475      if (
2476        [...node.getElementsByTagName("table")].some(
2477          tbl => tbl._readabilityDataTable
2478        )
2479      ) {
2480        return false;
2481      }
2482
2483      var weight = this._getClassWeight(node);
2484
2485      this.log("Cleaning Conditionally", node);
2486
2487      var contentScore = 0;
2488
2489      if (weight + contentScore < 0) {
2490        return true;
2491      }
2492
2493      if (this._getCharCount(node, ",") < 10) {
2494        // If there are not very many commas, and the number of
2495        // non-paragraph elements is more than paragraphs or other
2496        // ominous signs, remove the element.
2497        var p = node.getElementsByTagName("p").length;
2498        var img = node.getElementsByTagName("img").length;
2499        var li = node.getElementsByTagName("li").length - 100;
2500        var input = node.getElementsByTagName("input").length;
2501        var headingDensity = this._getTextDensity(node, [
2502          "h1",
2503          "h2",
2504          "h3",
2505          "h4",
2506          "h5",
2507          "h6",
2508        ]);
2509
2510        var embedCount = 0;
2511        var embeds = this._getAllNodesWithTag(node, [
2512          "object",
2513          "embed",
2514          "iframe",
2515        ]);
2516
2517        for (var i = 0; i < embeds.length; i++) {
2518          // If this embed has attribute that matches video regex, don't delete it.
2519          for (var j = 0; j < embeds[i].attributes.length; j++) {
2520            if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2521              return false;
2522            }
2523          }
2524
2525          // For embed with <object> tag, check inner HTML as well.
2526          if (
2527            embeds[i].tagName === "object" &&
2528            this._allowedVideoRegex.test(embeds[i].innerHTML)
2529          ) {
2530            return false;
2531          }
2532
2533          embedCount++;
2534        }
2535
2536        var innerText = this._getInnerText(node);
2537
2538        // toss any node whose inner text contains nothing but suspicious words
2539        if (
2540          this.REGEXPS.adWords.test(innerText) ||
2541          this.REGEXPS.loadingWords.test(innerText)
2542        ) {
2543          return true;
2544        }
2545
2546        var contentLength = innerText.length;
2547        var linkDensity = this._getLinkDensity(node);
2548        var textishTags = ["SPAN", "LI", "TD"].concat(
2549          Array.from(this.DIV_TO_P_ELEMS)
2550        );
2551        var textDensity = this._getTextDensity(node, textishTags);
2552        var isFigureChild = this._hasAncestorTag(node, "figure");
2553
2554        // apply shadiness checks, then check for exceptions
2555        const shouldRemoveNode = () => {
2556          const errs = [];
2557          if (!isFigureChild && img > 1 && p / img < 0.5) {
2558            errs.push(`Bad p to img ratio (img=${img}, p=${p})`);
2559          }
2560          if (!isList && li > p) {
2561            errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`);
2562          }
2563          if (input > Math.floor(p / 3)) {
2564            errs.push(`Too many inputs per p. (input=${input}, p=${p})`);
2565          }
2566          if (
2567            !isList &&
2568            !isFigureChild &&
2569            headingDensity < 0.9 &&
2570            contentLength < 25 &&
2571            (img === 0 || img > 2) &&
2572            linkDensity > 0
2573          ) {
2574            errs.push(
2575              `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})`
2576            );
2577          }
2578          if (
2579            !isList &&
2580            weight < 25 &&
2581            linkDensity > 0.2 + this._linkDensityModifier
2582          ) {
2583            errs.push(
2584              `Low weight and a little linky. (linkDensity=${linkDensity})`
2585            );
2586          }
2587          if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) {
2588            errs.push(
2589              `High weight and mostly links. (linkDensity=${linkDensity})`
2590            );
2591          }
2592          if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
2593            errs.push(
2594              `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`
2595            );
2596          }
2597          if (img === 0 && textDensity === 0) {
2598            errs.push(
2599              `No useful content. (img=${img}, textDensity=${textDensity})`
2600            );
2601          }
2602
2603          if (errs.length) {
2604            this.log("Checks failed", errs);
2605            return true;
2606          }
2607
2608          return false;
2609        };
2610
2611        var haveToRemove = shouldRemoveNode();
2612
2613        // Allow simple lists of images to remain in pages
2614        if (isList && haveToRemove) {
2615          for (var x = 0; x < node.children.length; x++) {
2616            let child = node.children[x];
2617            // Don't filter in lists with li's that contain more than one child
2618            if (child.children.length > 1) {
2619              return haveToRemove;
2620            }
2621          }
2622          let li_count = node.getElementsByTagName("li").length;
2623          // Only allow the list to remain if every li contains an image
2624          if (img == li_count) {
2625            return false;
2626          }
2627        }
2628        return haveToRemove;
2629      }
2630      return false;
2631    });
2632  },
2633
2634  /**
2635   * Clean out elements that match the specified conditions
2636   *
2637   * @param Element
2638   * @param Function determines whether a node should be removed
2639   * @return void
2640   **/
2641  _cleanMatchedNodes(e, filter) {
2642    var endOfSearchMarkerNode = this._getNextNode(e, true);
2643    var next = this._getNextNode(e);
2644    while (next && next != endOfSearchMarkerNode) {
2645      if (filter.call(this, next, next.className + " " + next.id)) {
2646        next = this._removeAndGetNext(next);
2647      } else {
2648        next = this._getNextNode(next);
2649      }
2650    }
2651  },
2652
2653  /**
2654   * Clean out spurious headers from an Element.
2655   *
2656   * @param Element
2657   * @return void
2658   **/
2659  _cleanHeaders(e) {
2660    let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
2661    this._removeNodes(headingNodes, function (node) {
2662      let shouldRemove = this._getClassWeight(node) < 0;
2663      if (shouldRemove) {
2664        this.log("Removing header with low class weight:", node);
2665      }
2666      return shouldRemove;
2667    });
2668  },
2669
2670  /**
2671   * Check if this node is an H1 or H2 element whose content is mostly
2672   * the same as the article title.
2673   *
2674   * @param Element  the node to check.
2675   * @return boolean indicating whether this is a title-like header.
2676   */
2677  _headerDuplicatesTitle(node) {
2678    if (node.tagName != "H1" && node.tagName != "H2") {
2679      return false;
2680    }
2681    var heading = this._getInnerText(node, false);
2682    this.log("Evaluating similarity of header:", heading, this._articleTitle);
2683    return this._textSimilarity(this._articleTitle, heading) > 0.75;
2684  },
2685
2686  _flagIsActive(flag) {
2687    return (this._flags & flag) > 0;
2688  },
2689
2690  _removeFlag(flag) {
2691    this._flags = this._flags & ~flag;
2692  },
2693
2694  _isProbablyVisible(node) {
2695    // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes.
2696    return (
2697      (!node.style || node.style.display != "none") &&
2698      (!node.style || node.style.visibility != "hidden") &&
2699      !node.hasAttribute("hidden") &&
2700      //check for "fallback-image" so that wikimedia math images are displayed
2701      (!node.hasAttribute("aria-hidden") ||
2702        node.getAttribute("aria-hidden") != "true" ||
2703        (node.className &&
2704          node.className.includes &&
2705          node.className.includes("fallback-image")))
2706    );
2707  },
2708
2709  /**
2710   * Runs readability.
2711   *
2712   * Workflow:
2713   *  1. Prep the document by removing script tags, css, etc.
2714   *  2. Build readability's DOM tree.
2715   *  3. Grab the article content from the current dom tree.
2716   *  4. Replace the current DOM tree with the new one.
2717   *  5. Read peacefully.
2718   *
2719   * @return void
2720   **/
2721  parse() {
2722    // Avoid parsing too large documents, as per configuration option
2723    if (this._maxElemsToParse > 0) {
2724      var numTags = this._doc.getElementsByTagName("*").length;
2725      if (numTags > this._maxElemsToParse) {
2726        throw new Error(
2727          "Aborting parsing document; " + numTags + " elements found"
2728        );
2729      }
2730    }
2731
2732    // Unwrap image from noscript
2733    this._unwrapNoscriptImages(this._doc);
2734
2735    // Extract JSON-LD metadata before removing scripts
2736    var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
2737
2738    // Remove script tags from the document.
2739    this._removeScripts(this._doc);
2740
2741    this._prepDocument();
2742
2743    var metadata = this._getArticleMetadata(jsonLd);
2744    this._metadata = metadata;
2745    this._articleTitle = metadata.title;
2746
2747    var articleContent = this._grabArticle();
2748    if (!articleContent) {
2749      return null;
2750    }
2751
2752    this.log("Grabbed: " + articleContent.innerHTML);
2753
2754    this._postProcessContent(articleContent);
2755
2756    // If we haven't found an excerpt in the article's metadata, use the article's
2757    // first paragraph as the excerpt. This is used for displaying a preview of
2758    // the article's content.
2759    if (!metadata.excerpt) {
2760      var paragraphs = articleContent.getElementsByTagName("p");
2761      if (paragraphs.length) {
2762        metadata.excerpt = paragraphs[0].textContent.trim();
2763      }
2764    }
2765
2766    var textContent = articleContent.textContent;
2767    return {
2768      title: this._articleTitle,
2769      byline: metadata.byline || this._articleByline,
2770      dir: this._articleDir,
2771      lang: this._articleLang,
2772      content: this._serializer(articleContent),
2773      textContent,
2774      length: textContent.length,
2775      excerpt: metadata.excerpt,
2776      siteName: metadata.siteName || this._articleSiteName,
2777      publishedTime: metadata.publishedTime,
2778    };
2779  },
2780};
2781
// Export the constructor through CommonJS when a `module` object is in scope.
// The typeof guard means that environments with no `module` binding skip the
// assignment instead of throwing a ReferenceError.
if (typeof module === "object") {
  /* eslint-disable-next-line no-redeclare */
  /* global module */
  module.exports = Readability;
}