A browser extension that lets you summarize any webpage and ask questions using AI.
1/*
2 * Copyright (c) 2010 Arc90 Inc
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * This code is heavily based on Arc90's readability.js (1.7.1) script
19 * available at: http://code.google.com/p/arc90labs-readability
20 */
21
22/**
23 * Public constructor.
24 * @param {HTMLDocument} doc The document to parse.
25 * @param {Object} options The options object.
26 */
function Readability(doc, options) {
  // Legacy call shape put something else first and the document second.
  // When the second argument is the document, shift the arguments left.
  if (options && options.documentElement) {
    doc = options;
    options = arguments[2];
  } else if (!doc || !doc.documentElement) {
    throw new Error(
      "First argument to Readability constructor should be a document object."
    );
  }
  if (!options) {
    options = {};
  }

  // Per-parse state.
  this._doc = doc;
  this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  this._articleTitle = null;
  this._articleByline = null;
  this._articleDir = null;
  this._articleSiteName = null;
  this._attempts = [];
  this._metadata = {};

  // Caller-tunable settings; any falsy option falls back to its default.
  this._debug = Boolean(options.debug);
  this._maxElemsToParse = options.maxElemsToParse
    ? options.maxElemsToParse
    : this.DEFAULT_MAX_ELEMS_TO_PARSE;
  this._nbTopCandidates = options.nbTopCandidates
    ? options.nbTopCandidates
    : this.DEFAULT_N_TOP_CANDIDATES;
  this._charThreshold = options.charThreshold
    ? options.charThreshold
    : this.DEFAULT_CHAR_THRESHOLD;
  this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(
    options.classesToPreserve || []
  );
  this._keepClasses = Boolean(options.keepClasses);
  this._serializer = options.serializer
    ? options.serializer
    : function (el) {
        return el.innerHTML;
      };
  this._disableJSONLD = Boolean(options.disableJSONLD);
  this._allowedVideoRegex = options.allowedVideoRegex
    ? options.allowedVideoRegex
    : this.REGEXPS.videos;
  this._linkDensityModifier = options.linkDensityModifier
    ? options.linkDensityModifier
    : 0;

  // Every flag starts switched on.
  this._flags =
    this.FLAG_STRIP_UNLIKELYS |
    this.FLAG_WEIGHT_CLASSES |
    this.FLAG_CLEAN_CONDITIONALLY;

  // Without debug, logging is a no-op.
  if (!this._debug) {
    this.log = function () {};
    return;
  }

  // Renders a node as short text for log output.
  const describeNode = function (node) {
    if (node.nodeType == node.TEXT_NODE) {
      return `${node.nodeName} ("${node.textContent}")`;
    }
    const attrPairs = Array.from(
      node.attributes || [],
      attr => `${attr.name}="${attr.value}"`
    ).join(" ");
    return `<${node.localName} ${attrPairs}>`;
  };

  // Prefer console; fall back to a global dump() when console is missing.
  this.log = function (...messages) {
    if (typeof console !== "undefined") {
      const formatted = messages.map(item =>
        item && item.nodeType == this.ELEMENT_NODE ? describeNode(item) : item
      );
      // eslint-disable-next-line no-console
      console.log("Reader: (Readability)", ...formatted);
    } else if (typeof dump !== "undefined") {
      /* global dump */
      const msg = messages
        .map(x => (x && x.nodeName ? describeNode(x) : x))
        .join(" ");
      dump("Reader: (Readability) " + msg + "\n");
    }
  };
}
110
111Readability.prototype = {
112 FLAG_STRIP_UNLIKELYS: 0x1,
113 FLAG_WEIGHT_CLASSES: 0x2,
114 FLAG_CLEAN_CONDITIONALLY: 0x4,
115
116 // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
117 ELEMENT_NODE: 1,
118 TEXT_NODE: 3,
119
120 // Max number of nodes supported by this parser. Default: 0 (no limit)
121 DEFAULT_MAX_ELEMS_TO_PARSE: 0,
122
123 // The number of top candidates to consider when analysing how
124 // tight the competition is among candidates.
125 DEFAULT_N_TOP_CANDIDATES: 5,
126
127 // Element tags to score by default.
128 DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre"
129 .toUpperCase()
130 .split(","),
131
132 // The default number of chars an article must have in order to return a result
133 DEFAULT_CHAR_THRESHOLD: 500,
134
135 // All of the regular expressions in use within readability.
136 // Defined up here so we don't instantiate them repeatedly in loops.
137 REGEXPS: {
138 // NOTE: These two regular expressions are duplicated in
139 // Readability-readerable.js. Please keep both copies in sync.
140 unlikelyCandidates:
141 /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
142 okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
143
144 positive:
145 /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
146 negative:
147 /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
148 extraneous:
149 /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
150 byline: /byline|author|dateline|writtenby|p-author/i,
151 replaceFonts: /<(\/?)font[^>]*>/gi,
152 normalize: /\s{2,}/g,
153 videos:
154 /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
155 shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
156 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
157 prevLink: /(prev|earl|old|new|<|«)/i,
158 tokenize: /\W+/g,
159 whitespace: /^\s*$/,
160 hasContent: /\S$/,
161 hashUrl: /^#.+/,
162 srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
163 b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
164 // Commas as used in Latin, Sindhi, Chinese and various other scripts.
165 // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
166 commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
167 // See: https://schema.org/Article
168 jsonLdArticleTypes:
169 /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
170 // used to see if a node's content matches words commonly used for ad blocks or loading indicators
171 adWords:
172 /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
173 loadingWords:
174 /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
175 },
176
177 UNLIKELY_ROLES: [
178 "menu",
179 "menubar",
180 "complementary",
181 "navigation",
182 "alert",
183 "alertdialog",
184 "dialog",
185 ],
186
187 DIV_TO_P_ELEMS: new Set([
188 "BLOCKQUOTE",
189 "DL",
190 "DIV",
191 "IMG",
192 "OL",
193 "P",
194 "PRE",
195 "TABLE",
196 "UL",
197 ]),
198
199 ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"],
200
201 PRESENTATIONAL_ATTRIBUTES: [
202 "align",
203 "background",
204 "bgcolor",
205 "border",
206 "cellpadding",
207 "cellspacing",
208 "frame",
209 "hspace",
210 "rules",
211 "style",
212 "valign",
213 "vspace",
214 ],
215
216 DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"],
217
218 // The commented out elements qualify as phrasing content but tend to be
219 // removed by readability when put into paragraphs, so we ignore them here.
220 PHRASING_ELEMS: [
221 // "CANVAS", "IFRAME", "SVG", "VIDEO",
222 "ABBR",
223 "AUDIO",
224 "B",
225 "BDO",
226 "BR",
227 "BUTTON",
228 "CITE",
229 "CODE",
230 "DATA",
231 "DATALIST",
232 "DFN",
233 "EM",
234 "EMBED",
235 "I",
236 "IMG",
237 "INPUT",
238 "KBD",
239 "LABEL",
240 "MARK",
241 "MATH",
242 "METER",
243 "NOSCRIPT",
244 "OBJECT",
245 "OUTPUT",
246 "PROGRESS",
247 "Q",
248 "RUBY",
249 "SAMP",
250 "SCRIPT",
251 "SELECT",
252 "SMALL",
253 "SPAN",
254 "STRONG",
255 "SUB",
256 "SUP",
257 "TEXTAREA",
258 "TIME",
259 "VAR",
260 "WBR",
261 ],
262
263 // These are the classes that readability sets itself.
264 CLASSES_TO_PRESERVE: ["page"],
265
266 // These are the list of HTML entities that need to be escaped.
267 HTML_ESCAPE_MAP: {
268 lt: "<",
269 gt: ">",
270 amp: "&",
271 quot: '"',
272 apos: "'",
273 },
274
275 /**
276 * Run any post-process modifications to article content as necessary.
277 *
278 * @param Element
279 * @return void
280 **/
281 _postProcessContent(articleContent) {
282 // Readability cannot open relative uris so we convert them to absolute uris.
283 this._fixRelativeUris(articleContent);
284
285 this._simplifyNestedElements(articleContent);
286
287 if (!this._keepClasses) {
288 // Remove classes.
289 this._cleanClasses(articleContent);
290 }
291 },
292
293 /**
294 * Iterates over a NodeList, calls `filterFn` for each node and removes node
295 * if function returned `true`.
296 *
297 * If function is not passed, removes all the nodes in node list.
298 *
299 * @param NodeList nodeList The nodes to operate on
300 * @param Function filterFn the function to use as a filter
301 * @return void
302 */
303 _removeNodes(nodeList, filterFn) {
304 // Avoid ever operating on live node lists.
305 if (this._docJSDOMParser && nodeList._isLiveNodeList) {
306 throw new Error("Do not pass live node lists to _removeNodes");
307 }
308 for (var i = nodeList.length - 1; i >= 0; i--) {
309 var node = nodeList[i];
310 var parentNode = node.parentNode;
311 if (parentNode) {
312 if (!filterFn || filterFn.call(this, node, i, nodeList)) {
313 parentNode.removeChild(node);
314 }
315 }
316 }
317 },
318
319 /**
320 * Iterates over a NodeList, and calls _setNodeTag for each node.
321 *
322 * @param NodeList nodeList The nodes to operate on
323 * @param String newTagName the new tag name to use
324 * @return void
325 */
326 _replaceNodeTags(nodeList, newTagName) {
327 // Avoid ever operating on live node lists.
328 if (this._docJSDOMParser && nodeList._isLiveNodeList) {
329 throw new Error("Do not pass live node lists to _replaceNodeTags");
330 }
331 for (const node of nodeList) {
332 this._setNodeTag(node, newTagName);
333 }
334 },
335
336 /**
337 * Iterate over a NodeList, which doesn't natively fully implement the Array
338 * interface.
339 *
340 * For convenience, the current object context is applied to the provided
341 * iterate function.
342 *
343 * @param NodeList nodeList The NodeList.
344 * @param Function fn The iterate function.
345 * @return void
346 */
347 _forEachNode(nodeList, fn) {
348 Array.prototype.forEach.call(nodeList, fn, this);
349 },
350
351 /**
352 * Iterate over a NodeList, and return the first node that passes
353 * the supplied test function
354 *
355 * For convenience, the current object context is applied to the provided
356 * test function.
357 *
358 * @param NodeList nodeList The NodeList.
359 * @param Function fn The test function.
360 * @return Node|undefined The first node for which the test function returns true, or undefined if none does.
361 */
362 _findNode(nodeList, fn) {
363 return Array.prototype.find.call(nodeList, fn, this);
364 },
365
366 /**
367 * Iterate over a NodeList, return true if any of the provided iterate
368 * function calls returns true, false otherwise.
369 *
370 * For convenience, the current object context is applied to the
371 * provided iterate function.
372 *
373 * @param NodeList nodeList The NodeList.
374 * @param Function fn The iterate function.
375 * @return Boolean
376 */
377 _someNode(nodeList, fn) {
378 return Array.prototype.some.call(nodeList, fn, this);
379 },
380
381 /**
382 * Iterate over a NodeList, return true if all of the provided iterate
383 * function calls return true, false otherwise.
384 *
385 * For convenience, the current object context is applied to the
386 * provided iterate function.
387 *
388 * @param NodeList nodeList The NodeList.
389 * @param Function fn The iterate function.
390 * @return Boolean
391 */
392 _everyNode(nodeList, fn) {
393 return Array.prototype.every.call(nodeList, fn, this);
394 },
395
396 _getAllNodesWithTag(node, tagNames) {
397 if (node.querySelectorAll) {
398 return node.querySelectorAll(tagNames.join(","));
399 }
400 return [].concat.apply(
401 [],
402 tagNames.map(function (tag) {
403 var collection = node.getElementsByTagName(tag);
404 return Array.isArray(collection) ? collection : Array.from(collection);
405 })
406 );
407 },
408
409 /**
410 * Removes the class="" attribute from every element in the given
411 * subtree, except those that match CLASSES_TO_PRESERVE and
412 * the classesToPreserve array from the options object.
413 *
414 * @param Element
415 * @return void
416 */
417 _cleanClasses(node) {
418 var classesToPreserve = this._classesToPreserve;
419 var className = (node.getAttribute("class") || "")
420 .split(/\s+/)
421 .filter(cls => classesToPreserve.includes(cls))
422 .join(" ");
423
424 if (className) {
425 node.setAttribute("class", className);
426 } else {
427 node.removeAttribute("class");
428 }
429
430 for (node = node.firstElementChild; node; node = node.nextElementSibling) {
431 this._cleanClasses(node);
432 }
433 },
434
435 /**
436 * Tests whether a string is a URL or not.
437 *
438 * @param {string} str The string to test
439 * @return {boolean} true if str is a URL, false if not
440 */
441 _isUrl(str) {
442 try {
443 new URL(str);
444 return true;
445 } catch {
446 return false;
447 }
448 },
449 /**
450 * Converts each <a> and <img> uri in the given element to an absolute URI,
451 * ignoring #ref URIs.
452 *
453 * @param Element
454 * @return void
455 */
456 _fixRelativeUris(articleContent) {
457 var baseURI = this._doc.baseURI;
458 var documentURI = this._doc.documentURI;
459 function toAbsoluteURI(uri) {
460 // Leave hash links alone if the base URI matches the document URI:
461 if (baseURI == documentURI && uri.charAt(0) == "#") {
462 return uri;
463 }
464
465 // Otherwise, resolve against base URI:
466 try {
467 return new URL(uri, baseURI).href;
468 } catch (ex) {
469 // Something went wrong, just return the original:
470 }
471 return uri;
472 }
473
474 var links = this._getAllNodesWithTag(articleContent, ["a"]);
475 this._forEachNode(links, function (link) {
476 var href = link.getAttribute("href");
477 if (href) {
478 // Remove links with javascript: URIs, since
479 // they won't work after scripts have been removed from the page.
480 if (href.indexOf("javascript:") === 0) {
481 // if the link only contains simple text content, it can be converted to a text node
482 if (
483 link.childNodes.length === 1 &&
484 link.childNodes[0].nodeType === this.TEXT_NODE
485 ) {
486 var text = this._doc.createTextNode(link.textContent);
487 link.parentNode.replaceChild(text, link);
488 } else {
489 // if the link has multiple children, they should all be preserved
490 var container = this._doc.createElement("span");
491 while (link.firstChild) {
492 container.appendChild(link.firstChild);
493 }
494 link.parentNode.replaceChild(container, link);
495 }
496 } else {
497 link.setAttribute("href", toAbsoluteURI(href));
498 }
499 }
500 });
501
502 var medias = this._getAllNodesWithTag(articleContent, [
503 "img",
504 "picture",
505 "figure",
506 "video",
507 "audio",
508 "source",
509 ]);
510
511 this._forEachNode(medias, function (media) {
512 var src = media.getAttribute("src");
513 var poster = media.getAttribute("poster");
514 var srcset = media.getAttribute("srcset");
515
516 if (src) {
517 media.setAttribute("src", toAbsoluteURI(src));
518 }
519
520 if (poster) {
521 media.setAttribute("poster", toAbsoluteURI(poster));
522 }
523
524 if (srcset) {
525 var newSrcset = srcset.replace(
526 this.REGEXPS.srcsetUrl,
527 function (_, p1, p2, p3) {
528 return toAbsoluteURI(p1) + (p2 || "") + p3;
529 }
530 );
531
532 media.setAttribute("srcset", newSrcset);
533 }
534 });
535 },
536
537 _simplifyNestedElements(articleContent) {
538 var node = articleContent;
539
540 while (node) {
541 if (
542 node.parentNode &&
543 ["DIV", "SECTION"].includes(node.tagName) &&
544 !(node.id && node.id.startsWith("readability"))
545 ) {
546 if (this._isElementWithoutContent(node)) {
547 node = this._removeAndGetNext(node);
548 continue;
549 } else if (
550 this._hasSingleTagInsideElement(node, "DIV") ||
551 this._hasSingleTagInsideElement(node, "SECTION")
552 ) {
553 var child = node.children[0];
554 for (var i = 0; i < node.attributes.length; i++) {
555 child.setAttributeNode(node.attributes[i].cloneNode());
556 }
557 node.parentNode.replaceChild(child, node);
558 node = child;
559 continue;
560 }
561 }
562
563 node = this._getNextNode(node);
564 }
565 },
566
567 /**
568 * Get the article title as an H1.
569 *
570 * @return string
571 **/
572 _getArticleTitle() {
573 var doc = this._doc;
574 var curTitle = "";
575 var origTitle = "";
576
577 try {
578 curTitle = origTitle = doc.title.trim();
579
580 // If they had an element with id "title" in their HTML
581 if (typeof curTitle !== "string") {
582 curTitle = origTitle = this._getInnerText(
583 doc.getElementsByTagName("title")[0]
584 );
585 }
586 } catch (e) {
587 /* ignore exceptions setting the title. */
588 }
589
590 var titleHadHierarchicalSeparators = false;
591 function wordCount(str) {
592 return str.split(/\s+/).length;
593 }
594
595 // If there's a separator in the title, first remove the final part
596 if (/ [\|\-\\\/>»] /.test(curTitle)) {
597 titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
598 let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi));
599 curTitle = origTitle.substring(0, allSeparators.pop().index);
600
601 // If the resulting title is too short, remove the first part instead:
602 if (wordCount(curTitle) < 3) {
603 curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, "");
604 }
605 } else if (curTitle.includes(": ")) {
606 // Check if we have an heading containing this exact string, so we
607 // could assume it's the full title.
608 var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]);
609 var trimmedTitle = curTitle.trim();
610 var match = this._someNode(headings, function (heading) {
611 return heading.textContent.trim() === trimmedTitle;
612 });
613
614 // If we don't, let's extract the title out of the original title string.
615 if (!match) {
616 curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
617
618 // If the title is now too short, try the first colon instead:
619 if (wordCount(curTitle) < 3) {
620 curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
621 // But if we have too many words before the colon there's something weird
622 // with the titles and the H tags so let's just use the original title instead
623 } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
624 curTitle = origTitle;
625 }
626 }
627 } else if (curTitle.length > 150 || curTitle.length < 15) {
628 var hOnes = doc.getElementsByTagName("h1");
629
630 if (hOnes.length === 1) {
631 curTitle = this._getInnerText(hOnes[0]);
632 }
633 }
634
635 curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
636 // If we now have 4 words or fewer as our title, and either no
637 // 'hierarchical' separators (\, /, > or ») were found in the original
638 // title or we decreased the number of words by more than 1 word, use
639 // the original title.
640 var curTitleWordCount = wordCount(curTitle);
641 if (
642 curTitleWordCount <= 4 &&
643 (!titleHadHierarchicalSeparators ||
644 curTitleWordCount !=
645 wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)
646 ) {
647 curTitle = origTitle;
648 }
649
650 return curTitle;
651 },
652
653 /**
654 * Prepare the HTML document for readability to scrape it.
655 * This includes things like stripping javascript, CSS, and handling terrible markup.
656 *
657 * @return void
658 **/
659 _prepDocument() {
660 var doc = this._doc;
661
662 // Remove all style tags in head
663 this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
664
665 if (doc.body) {
666 this._replaceBrs(doc.body);
667 }
668
669 this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
670 },
671
672 /**
673 * Finds the next node, starting from the given node, and ignoring
674 * whitespace in between. If the given node is an element, the same node is
675 * returned.
676 */
677 _nextNode(node) {
678 var next = node;
679 while (
680 next &&
681 next.nodeType != this.ELEMENT_NODE &&
682 this.REGEXPS.whitespace.test(next.textContent)
683 ) {
684 next = next.nextSibling;
685 }
686 return next;
687 },
688
689 /**
690 * Replaces 2 or more successive <br> elements with a single <p>.
691 * Whitespace between <br> elements are ignored. For example:
692 * <div>foo<br>bar<br> <br><br>abc</div>
693 * will become:
694 * <div>foo<br>bar<p>abc</p></div>
695 */
696 _replaceBrs(elem) {
697 this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
698 var next = br.nextSibling;
699
700 // Whether 2 or more <br> elements have been found and replaced with a
701 // <p> block.
702 var replaced = false;
703
704 // If we find a <br> chain, remove the <br>s until we hit another node
705 // or non-whitespace. This leaves behind the first <br> in the chain
706 // (which will be replaced with a <p> later).
707 while ((next = this._nextNode(next)) && next.tagName == "BR") {
708 replaced = true;
709 var brSibling = next.nextSibling;
710 next.remove();
711 next = brSibling;
712 }
713
714 // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
715 // all sibling nodes as children of the <p> until we hit another <br>
716 // chain.
717 if (replaced) {
718 var p = this._doc.createElement("p");
719 br.parentNode.replaceChild(p, br);
720
721 next = p.nextSibling;
722 while (next) {
723 // If we've hit another <br><br>, we're done adding children to this <p>.
724 if (next.tagName == "BR") {
725 var nextElem = this._nextNode(next.nextSibling);
726 if (nextElem && nextElem.tagName == "BR") {
727 break;
728 }
729 }
730
731 if (!this._isPhrasingContent(next)) {
732 break;
733 }
734
735 // Otherwise, make this node a child of the new <p>.
736 var sibling = next.nextSibling;
737 p.appendChild(next);
738 next = sibling;
739 }
740
741 while (p.lastChild && this._isWhitespace(p.lastChild)) {
742 p.lastChild.remove();
743 }
744
745 if (p.parentNode.tagName === "P") {
746 this._setNodeTag(p.parentNode, "DIV");
747 }
748 }
749 });
750 },
751
752 _setNodeTag(node, tag) {
753 this.log("_setNodeTag", node, tag);
754 if (this._docJSDOMParser) {
755 node.localName = tag.toLowerCase();
756 node.tagName = tag.toUpperCase();
757 return node;
758 }
759
760 var replacement = node.ownerDocument.createElement(tag);
761 while (node.firstChild) {
762 replacement.appendChild(node.firstChild);
763 }
764 node.parentNode.replaceChild(replacement, node);
765 if (node.readability) {
766 replacement.readability = node.readability;
767 }
768
769 for (var i = 0; i < node.attributes.length; i++) {
770 replacement.setAttributeNode(node.attributes[i].cloneNode());
771 }
772 return replacement;
773 },
774
775 /**
776 * Prepare the article node for display. Clean out any inline styles,
777 * iframes, forms, strip extraneous <p> tags, etc.
778 *
779 * @param Element
780 * @return void
781 **/
782 _prepArticle(articleContent) {
783 this._cleanStyles(articleContent);
784
785 // Check for data tables before we continue, to avoid removing items in
786 // those tables, which will often be isolated even though they're
787 // visually linked to other content-ful elements (text, images, etc.).
788 this._markDataTables(articleContent);
789
790 this._fixLazyImages(articleContent);
791
792 // Clean out junk from the article content
793 this._cleanConditionally(articleContent, "form");
794 this._cleanConditionally(articleContent, "fieldset");
795 this._clean(articleContent, "object");
796 this._clean(articleContent, "embed");
797 this._clean(articleContent, "footer");
798 this._clean(articleContent, "link");
799 this._clean(articleContent, "aside");
800
801 // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
802 // which means we don't remove the top candidates even they have "share".
803
804 var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
805
806 this._forEachNode(articleContent.children, function (topCandidate) {
807 this._cleanMatchedNodes(topCandidate, function (node, matchString) {
808 return (
809 this.REGEXPS.shareElements.test(matchString) &&
810 node.textContent.length < shareElementThreshold
811 );
812 });
813 });
814
815 this._clean(articleContent, "iframe");
816 this._clean(articleContent, "input");
817 this._clean(articleContent, "textarea");
818 this._clean(articleContent, "select");
819 this._clean(articleContent, "button");
820 this._cleanHeaders(articleContent);
821
822 // Do these last as the previous stuff may have removed junk
823 // that will affect these
824 this._cleanConditionally(articleContent, "table");
825 this._cleanConditionally(articleContent, "ul");
826 this._cleanConditionally(articleContent, "div");
827
828 // replace H1 with H2 as H1 should be only title that is displayed separately
829 this._replaceNodeTags(
830 this._getAllNodesWithTag(articleContent, ["h1"]),
831 "h2"
832 );
833
834 // Remove extra paragraphs
835 this._removeNodes(
836 this._getAllNodesWithTag(articleContent, ["p"]),
837 function (paragraph) {
838 // At this point, nasty iframes have been removed; only embedded video
839 // ones remain.
840 var contentElementCount = this._getAllNodesWithTag(paragraph, [
841 "img",
842 "embed",
843 "object",
844 "iframe",
845 ]).length;
846 return (
847 contentElementCount === 0 && !this._getInnerText(paragraph, false)
848 );
849 }
850 );
851
852 this._forEachNode(
853 this._getAllNodesWithTag(articleContent, ["br"]),
854 function (br) {
855 var next = this._nextNode(br.nextSibling);
856 if (next && next.tagName == "P") {
857 br.remove();
858 }
859 }
860 );
861
862 // Remove single-cell tables
863 this._forEachNode(
864 this._getAllNodesWithTag(articleContent, ["table"]),
865 function (table) {
866 var tbody = this._hasSingleTagInsideElement(table, "TBODY")
867 ? table.firstElementChild
868 : table;
869 if (this._hasSingleTagInsideElement(tbody, "TR")) {
870 var row = tbody.firstElementChild;
871 if (this._hasSingleTagInsideElement(row, "TD")) {
872 var cell = row.firstElementChild;
873 cell = this._setNodeTag(
874 cell,
875 this._everyNode(cell.childNodes, this._isPhrasingContent)
876 ? "P"
877 : "DIV"
878 );
879 table.parentNode.replaceChild(cell, table);
880 }
881 }
882 }
883 );
884 },
885
886 /**
887 * Initialize a node with the readability object. Also checks the
888 * className/id for special names to add to its score.
889 *
890 * @param Element
891 * @return void
892 **/
893 _initializeNode(node) {
894 node.readability = { contentScore: 0 };
895
896 switch (node.tagName) {
897 case "DIV":
898 node.readability.contentScore += 5;
899 break;
900
901 case "PRE":
902 case "TD":
903 case "BLOCKQUOTE":
904 node.readability.contentScore += 3;
905 break;
906
907 case "ADDRESS":
908 case "OL":
909 case "UL":
910 case "DL":
911 case "DD":
912 case "DT":
913 case "LI":
914 case "FORM":
915 node.readability.contentScore -= 3;
916 break;
917
918 case "H1":
919 case "H2":
920 case "H3":
921 case "H4":
922 case "H5":
923 case "H6":
924 case "TH":
925 node.readability.contentScore -= 5;
926 break;
927 }
928
929 node.readability.contentScore += this._getClassWeight(node);
930 },
931
932 _removeAndGetNext(node) {
933 var nextNode = this._getNextNode(node, true);
934 node.remove();
935 return nextNode;
936 },
937
938 /**
939 * Traverse the DOM from node to node, starting at the node passed in.
940 * Pass true for the second parameter to indicate this node itself
941 * (and its kids) are going away, and we want the next node over.
942 *
943 * Calling this in a loop will traverse the DOM depth-first.
944 *
945 * @param {Element} node
946 * @param {boolean} ignoreSelfAndKids
947 * @return {Element}
948 */
949 _getNextNode(node, ignoreSelfAndKids) {
950 // First check for kids if those aren't being ignored
951 if (!ignoreSelfAndKids && node.firstElementChild) {
952 return node.firstElementChild;
953 }
954 // Then for siblings...
955 if (node.nextElementSibling) {
956 return node.nextElementSibling;
957 }
958 // And finally, move up the parent chain *and* find a sibling
959 // (because this is depth-first traversal, we will have already
960 // seen the parent nodes themselves).
961 do {
962 node = node.parentNode;
963 } while (node && !node.nextElementSibling);
964 return node && node.nextElementSibling;
965 },
966
967 // compares second text to first one
968 // 1 = same text, 0 = completely different text
969 // works the way that it splits both texts into words and then finds words that are unique in second text
970 // the result is given by the lower length of unique parts
971 _textSimilarity(textA, textB) {
972 var tokensA = textA
973 .toLowerCase()
974 .split(this.REGEXPS.tokenize)
975 .filter(Boolean);
976 var tokensB = textB
977 .toLowerCase()
978 .split(this.REGEXPS.tokenize)
979 .filter(Boolean);
980 if (!tokensA.length || !tokensB.length) {
981 return 0;
982 }
983 var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
984 var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
985 return 1 - distanceB;
986 },
987
988 /**
989 * Checks whether an element node contains a valid byline
990 *
991 * @param node {Element}
992 * @param matchString {string}
993 * @return boolean
994 */
995 _isValidByline(node, matchString) {
996 var rel = node.getAttribute("rel");
997 var itemprop = node.getAttribute("itemprop");
998 var bylineLength = node.textContent.trim().length;
999
1000 return (
1001 (rel === "author" ||
1002 (itemprop && itemprop.includes("author")) ||
1003 this.REGEXPS.byline.test(matchString)) &&
1004 !!bylineLength &&
1005 bylineLength < 100
1006 );
1007 },
1008
1009 _getNodeAncestors(node, maxDepth) {
1010 maxDepth = maxDepth || 0;
1011 var i = 0,
1012 ancestors = [];
1013 while (node.parentNode) {
1014 ancestors.push(node.parentNode);
1015 if (maxDepth && ++i === maxDepth) {
1016 break;
1017 }
1018 node = node.parentNode;
1019 }
1020 return ancestors;
1021 },
1022
1023 /***
1024 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
1025 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
1026 *
 * The whole algorithm runs inside a retry loop: when the extracted text is shorter than
 * this._charThreshold, the page HTML is restored from a cached copy, one of the flags
 * FLAG_STRIP_UNLIKELYS, FLAG_WEIGHT_CLASSES, FLAG_CLEAN_CONDITIONALLY (in that order) is
 * removed and the extraction is attempted again. Once no flag is left, the attempt with
 * the longest text is returned instead.
 *
 * Side effects visible in this method: nodes of the document are removed/retagged, and
 * this._articleLang, this._articleByline and this._articleDir may be assigned.
 *
1027 * @param page a document to run upon. Needs to be a full document, complete with body.
 *             When falsy, this._doc.body is used instead.
1028 * @return Element the DIV holding the article content, or null when there is no page/body
 *         or when even the best attempt contains no text.
1029 **/
1030 /* eslint-disable-next-line complexity */
1031 _grabArticle(page) {
1032 this.log("**** grabArticle ****");
1033 var doc = this._doc;
     // NOTE(review): `page !== null` is also true when page is undefined (i.e. when the
     // method is called without an argument) - confirm that is the intended meaning of isPaging.
1034 var isPaging = page !== null;
1035 page = page ? page : this._doc.body;
1036
1037 // We can't grab an article if we don't have a page!
1038 if (!page) {
1039 this.log("No body found in document. Abort.");
1040 return null;
1041 }
1042
     // Snapshot of the markup, used to restore the page before a retry with fewer flags.
1043 var pageCacheHtml = page.innerHTML;
1044
1045 while (true) {
1046 this.log("Starting grabArticle loop");
1047 var stripUnlikelyCandidates = this._flagIsActive(
1048 this.FLAG_STRIP_UNLIKELYS
1049 );
1050
1051 // First, node prepping. Trash nodes that look cruddy (like ones with the
1052 // class name "comment", etc), and turn divs into P tags where they have been
1053 // used inappropriately (as in, where they contain no other block level elements.)
1054 var elementsToScore = [];
1055 var node = this._doc.documentElement;
1056
     // Only the first header that duplicates the title is removed (flag is cleared after one removal).
1057 let shouldRemoveTitleHeader = true;
1058
1059 while (node) {
1060 if (node.tagName === "HTML") {
1061 this._articleLang = node.getAttribute("lang");
1062 }
1063
1064 var matchString = node.className + " " + node.id;
1065
1066 if (!this._isProbablyVisible(node)) {
1067 this.log("Removing hidden node - " + matchString);
1068 node = this._removeAndGetNext(node);
1069 continue;
1070 }
1071
1072 // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
1073 if (
1074 node.getAttribute("aria-modal") == "true" &&
1075 node.getAttribute("role") == "dialog"
1076 ) {
1077 node = this._removeAndGetNext(node);
1078 continue;
1079 }
1080
1081 // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
1082 if (
1083 !this._articleByline &&
1084 !this._metadata.byline &&
1085 this._isValidByline(node, matchString)
1086 ) {
1087 // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
1088 var endOfSearchMarkerNode = this._getNextNode(node, true);
1089 var next = this._getNextNode(node);
1090 var itemPropNameNode = null;
1091 while (next && next != endOfSearchMarkerNode) {
1092 var itemprop = next.getAttribute("itemprop");
1093 if (itemprop && itemprop.includes("name")) {
1094 itemPropNameNode = next;
1095 break;
1096 } else {
1097 next = this._getNextNode(next);
1098 }
1099 }
1100 this._articleByline = (itemPropNameNode ?? node).textContent.trim();
1101 node = this._removeAndGetNext(node);
1102 continue;
1103 }
1104
1105 if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
1106 this.log(
1107 "Removing header: ",
1108 node.textContent.trim(),
1109 this._articleTitle.trim()
1110 );
1111 shouldRemoveTitleHeader = false;
1112 node = this._removeAndGetNext(node);
1113 continue;
1114 }
1115
1116 // Remove unlikely candidates
1117 if (stripUnlikelyCandidates) {
1118 if (
1119 this.REGEXPS.unlikelyCandidates.test(matchString) &&
1120 !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
1121 !this._hasAncestorTag(node, "table") &&
1122 !this._hasAncestorTag(node, "code") &&
1123 node.tagName !== "BODY" &&
1124 node.tagName !== "A"
1125 ) {
1126 this.log("Removing unlikely candidate - " + matchString);
1127 node = this._removeAndGetNext(node);
1128 continue;
1129 }
1130
1131 if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
1132 this.log(
1133 "Removing content with role " +
1134 node.getAttribute("role") +
1135 " - " +
1136 matchString
1137 );
1138 node = this._removeAndGetNext(node);
1139 continue;
1140 }
1141 }
1142
1143 // Remove DIV, SECTION, HEADER and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
1144 if (
1145 (node.tagName === "DIV" ||
1146 node.tagName === "SECTION" ||
1147 node.tagName === "HEADER" ||
1148 node.tagName === "H1" ||
1149 node.tagName === "H2" ||
1150 node.tagName === "H3" ||
1151 node.tagName === "H4" ||
1152 node.tagName === "H5" ||
1153 node.tagName === "H6") &&
1154 this._isElementWithoutContent(node)
1155 ) {
1156 node = this._removeAndGetNext(node);
1157 continue;
1158 }
1159
1160 if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
1161 elementsToScore.push(node);
1162 }
1163
1164 // Turn all divs that don't have children block level elements into p's
1165 if (node.tagName === "DIV") {
1166 // Put phrasing content into paragraphs.
1167 var p = null;
1168 var childNode = node.firstChild;
1169 while (childNode) {
     // Remember the sibling first: childNode may be moved into a new <p> below.
1170 var nextSibling = childNode.nextSibling;
1171 if (this._isPhrasingContent(childNode)) {
1172 if (p !== null) {
1173 p.appendChild(childNode);
1174 } else if (!this._isWhitespace(childNode)) {
1175 p = doc.createElement("p");
1176 node.replaceChild(p, childNode);
1177 p.appendChild(childNode);
1178 }
1179 } else if (p !== null) {
     // A non-phrasing child closes the current paragraph; drop its trailing whitespace.
1180 while (p.lastChild && this._isWhitespace(p.lastChild)) {
1181 p.lastChild.remove();
1182 }
1183 p = null;
1184 }
1185 childNode = nextSibling;
1186 }
1187
1188 // Sites like http://mobile.slate.com encloses each paragraph with a DIV
1189 // element. DIVs with only a P element inside and no text content can be
1190 // safely converted into plain P elements to avoid confusing the scoring
1191 // algorithm with DIVs which are, in practice, paragraphs.
1192 if (
1193 this._hasSingleTagInsideElement(node, "P") &&
1194 this._getLinkDensity(node) < 0.25
1195 ) {
1196 var newNode = node.children[0];
1197 node.parentNode.replaceChild(newNode, node);
1198 node = newNode;
1199 elementsToScore.push(node);
1200 } else if (!this._hasChildBlockElement(node)) {
1201 node = this._setNodeTag(node, "P");
1202 elementsToScore.push(node);
1203 }
1204 }
1205 node = this._getNextNode(node);
1206 }
1207
1208 /**
1209 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1210 * Then add their score to their parent node.
1211 *
1212 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1213 **/
1214 var candidates = [];
1215 this._forEachNode(elementsToScore, function (elementToScore) {
1216 if (
1217 !elementToScore.parentNode ||
1218 typeof elementToScore.parentNode.tagName === "undefined"
1219 ) {
1220 return;
1221 }
1222
1223 // If this paragraph is less than 25 characters, don't even count it.
1224 var innerText = this._getInnerText(elementToScore);
1225 if (innerText.length < 25) {
1226 return;
1227 }
1228
1229 // Exclude nodes with no ancestor.
1230 var ancestors = this._getNodeAncestors(elementToScore, 5);
1231 if (ancestors.length === 0) {
1232 return;
1233 }
1234
1235 var contentScore = 0;
1236
1237 // Add a point for the paragraph itself as a base.
1238 contentScore += 1;
1239
1240 // Add points for any commas within this paragraph.
     // NOTE(review): split(...).length is one more than the number of separators matched,
     // so this adds an extra point even without any comma - confirm that is intended.
1241 contentScore += innerText.split(this.REGEXPS.commas).length;
1242
1243 // For every 100 characters in this paragraph, add another point. Up to 3 points.
1244 contentScore += Math.min(Math.floor(innerText.length / 100), 3);
1245
1246 // Initialize and score ancestors.
1247 this._forEachNode(ancestors, function (ancestor, level) {
1248 if (
1249 !ancestor.tagName ||
1250 !ancestor.parentNode ||
1251 typeof ancestor.parentNode.tagName === "undefined"
1252 ) {
1253 return;
1254 }
1255
1256 if (typeof ancestor.readability === "undefined") {
1257 this._initializeNode(ancestor);
1258 candidates.push(ancestor);
1259 }
1260
1261 // Node score divider:
1262 // - parent: 1 (no division)
1263 // - grandparent: 2
1264 // - great grandparent+: ancestor level * 3
1265 if (level === 0) {
1266 var scoreDivider = 1;
1267 } else if (level === 1) {
1268 scoreDivider = 2;
1269 } else {
1270 scoreDivider = level * 3;
1271 }
1272 ancestor.readability.contentScore += contentScore / scoreDivider;
1273 });
1274 });
1275
1276 // After we've calculated scores, loop through all of the possible
1277 // candidate nodes we found and find the one with the highest score.
1278 var topCandidates = [];
1279 for (var c = 0, cl = candidates.length; c < cl; c += 1) {
1280 var candidate = candidates[c];
1281
1282 // Scale the final candidates score based on link density. Good content
1283 // should have a relatively small link density (5% or less) and be mostly
1284 // unaffected by this operation.
1285 var candidateScore =
1286 candidate.readability.contentScore *
1287 (1 - this._getLinkDensity(candidate));
1288 candidate.readability.contentScore = candidateScore;
1289
1290 this.log("Candidate:", candidate, "with score " + candidateScore);
1291
     // Insert into topCandidates, which is kept sorted by descending score and capped at this._nbTopCandidates entries.
1292 for (var t = 0; t < this._nbTopCandidates; t++) {
1293 var aTopCandidate = topCandidates[t];
1294
1295 if (
1296 !aTopCandidate ||
1297 candidateScore > aTopCandidate.readability.contentScore
1298 ) {
1299 topCandidates.splice(t, 0, candidate);
1300 if (topCandidates.length > this._nbTopCandidates) {
1301 topCandidates.pop();
1302 }
1303 break;
1304 }
1305 }
1306 }
1307
1308 var topCandidate = topCandidates[0] || null;
1309 var neededToCreateTopCandidate = false;
1310 var parentOfTopCandidate;
1311
1312 // If we still have no top candidate, just use the body as a last resort.
1313 // We also have to copy the body node so it is something we can modify.
1314 if (topCandidate === null || topCandidate.tagName === "BODY") {
1315 // Move all of the page's children into topCandidate
1316 topCandidate = doc.createElement("DIV");
1317 neededToCreateTopCandidate = true;
1318 // Move everything (not just elements, also text nodes etc.) into the container
1319 // so we even include text directly in the body:
1320 while (page.firstChild) {
1321 this.log("Moving child out:", page.firstChild);
1322 topCandidate.appendChild(page.firstChild);
1323 }
1324
1325 page.appendChild(topCandidate);
1326
1327 this._initializeNode(topCandidate);
1328 } else if (topCandidate) {
1329 // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
1330 // and whose scores are quite close to the current `topCandidate` node's score.
1331 var alternativeCandidateAncestors = [];
1332 for (var i = 1; i < topCandidates.length; i++) {
1333 if (
1334 topCandidates[i].readability.contentScore /
1335 topCandidate.readability.contentScore >=
1336 0.75
1337 ) {
1338 alternativeCandidateAncestors.push(
1339 this._getNodeAncestors(topCandidates[i])
1340 );
1341 }
1342 }
1343 var MINIMUM_TOPCANDIDATES = 3;
1344 if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
1345 parentOfTopCandidate = topCandidate.parentNode;
1346 while (parentOfTopCandidate.tagName !== "BODY") {
1347 var listsContainingThisAncestor = 0;
1348 for (
1349 var ancestorIndex = 0;
1350 ancestorIndex < alternativeCandidateAncestors.length &&
1351 listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
1352 ancestorIndex++
1353 ) {
1354 listsContainingThisAncestor += Number(
1355 alternativeCandidateAncestors[ancestorIndex].includes(
1356 parentOfTopCandidate
1357 )
1358 );
1359 }
1360 if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
1361 topCandidate = parentOfTopCandidate;
1362 break;
1363 }
1364 parentOfTopCandidate = parentOfTopCandidate.parentNode;
1365 }
1366 }
1367 if (!topCandidate.readability) {
1368 this._initializeNode(topCandidate);
1369 }
1370
1371 // Because of our bonus system, parents of candidates might have scores
1372 // themselves. They get half of the node. There won't be nodes with higher
1373 // scores than our topCandidate, but if we see the score going *up* in the first
1374 // few steps up the tree, that's a decent sign that there might be more content
1375 // lurking in other places that we want to unify in. The sibling stuff
1376 // below does some of that - but only if we've looked high enough up the DOM
1377 // tree.
1378 parentOfTopCandidate = topCandidate.parentNode;
1379 var lastScore = topCandidate.readability.contentScore;
1380 // The scores shouldn't get too low.
1381 var scoreThreshold = lastScore / 3;
1382 while (parentOfTopCandidate.tagName !== "BODY") {
1383 if (!parentOfTopCandidate.readability) {
1384 parentOfTopCandidate = parentOfTopCandidate.parentNode;
1385 continue;
1386 }
1387 var parentScore = parentOfTopCandidate.readability.contentScore;
1388 if (parentScore < scoreThreshold) {
1389 break;
1390 }
1391 if (parentScore > lastScore) {
1392 // Alright! We found a better parent to use.
1393 topCandidate = parentOfTopCandidate;
1394 break;
1395 }
1396 lastScore = parentOfTopCandidate.readability.contentScore;
1397 parentOfTopCandidate = parentOfTopCandidate.parentNode;
1398 }
1399
1400 // If the top candidate is the only child, use parent instead. This will help sibling
1401 // joining logic when adjacent content is actually located in parent's sibling node.
1402 parentOfTopCandidate = topCandidate.parentNode;
1403 while (
1404 parentOfTopCandidate.tagName != "BODY" &&
1405 parentOfTopCandidate.children.length == 1
1406 ) {
1407 topCandidate = parentOfTopCandidate;
1408 parentOfTopCandidate = topCandidate.parentNode;
1409 }
1410 if (!topCandidate.readability) {
1411 this._initializeNode(topCandidate);
1412 }
1413 }
1414
1415 // Now that we have the top candidate, look through its siblings for content
1416 // that might also be related. Things like preambles, content split by ads
1417 // that we removed, etc.
1418 var articleContent = doc.createElement("DIV");
1419 if (isPaging) {
1420 articleContent.id = "readability-content";
1421 }
1422
     // A sibling needs at least 20% of the top candidate's score, and never less than 10.
1423 var siblingScoreThreshold = Math.max(
1424 10,
1425 topCandidate.readability.contentScore * 0.2
1426 );
1427 // Keep potential top candidate's parent node to try to get text direction of it later.
1428 parentOfTopCandidate = topCandidate.parentNode;
1429 var siblings = parentOfTopCandidate.children;
1430
1431 for (var s = 0, sl = siblings.length; s < sl; s++) {
1432 var sibling = siblings[s];
1433 var append = false;
1434
1435 this.log(
1436 "Looking at sibling node:",
1437 sibling,
1438 sibling.readability
1439 ? "with score " + sibling.readability.contentScore
1440 : ""
1441 );
1442 this.log(
1443 "Sibling has score",
1444 sibling.readability ? sibling.readability.contentScore : "Unknown"
1445 );
1446
1447 if (sibling === topCandidate) {
1448 append = true;
1449 } else {
1450 var contentBonus = 0;
1451
1452 // Give a bonus if sibling nodes and top candidates have the exact same classname
1453 if (
1454 sibling.className === topCandidate.className &&
1455 topCandidate.className !== ""
1456 ) {
1457 contentBonus += topCandidate.readability.contentScore * 0.2;
1458 }
1459
1460 if (
1461 sibling.readability &&
1462 sibling.readability.contentScore + contentBonus >=
1463 siblingScoreThreshold
1464 ) {
1465 append = true;
1466 } else if (sibling.nodeName === "P") {
     // Unscored paragraphs are kept when they are long with few links, or short,
     // link-free and contain a period followed by a space or the end of the text.
1467 var linkDensity = this._getLinkDensity(sibling);
1468 var nodeContent = this._getInnerText(sibling);
1469 var nodeLength = nodeContent.length;
1470
1471 if (nodeLength > 80 && linkDensity < 0.25) {
1472 append = true;
1473 } else if (
1474 nodeLength < 80 &&
1475 nodeLength > 0 &&
1476 linkDensity === 0 &&
1477 nodeContent.search(/\.( |$)/) !== -1
1478 ) {
1479 append = true;
1480 }
1481 }
1482 }
1483
1484 if (append) {
1485 this.log("Appending node:", sibling);
1486
1487 if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
1488 // We have a node that isn't a common block level element, like a form or td tag.
1489 // Turn it into a div so it doesn't get filtered out later by accident.
1490 this.log("Altering sibling:", sibling, "to div.");
1491
1492 sibling = this._setNodeTag(sibling, "DIV");
1493 }
1494
1495 articleContent.appendChild(sibling);
1496 // Fetch children again to make it compatible
1497 // with DOM parsers without live collection support.
1498 siblings = parentOfTopCandidate.children;
1499 // siblings is a reference to the children array, and
1500 // sibling is removed from the array when we call appendChild().
1501 // As a result, we must revisit this index since the nodes
1502 // have been shifted.
1503 s -= 1;
1504 sl -= 1;
1505 }
1506 }
1507
1508 if (this._debug) {
1509 this.log("Article content pre-prep: " + articleContent.innerHTML);
1510 }
1511 // So we have all of the content that we need. Now we clean it up for presentation.
1512 this._prepArticle(articleContent);
1513 if (this._debug) {
1514 this.log("Article content post-prep: " + articleContent.innerHTML);
1515 }
1516
1517 if (neededToCreateTopCandidate) {
1518 // We already created a fake div thing, and there wouldn't have been any siblings left
1519 // for the previous loop, so there's no point trying to create a new div, and then
1520 // move all the children over. Just assign IDs and class names here. No need to append
1521 // because that already happened anyway.
1522 topCandidate.id = "readability-page-1";
1523 topCandidate.className = "page";
1524 } else {
1525 var div = doc.createElement("DIV");
1526 div.id = "readability-page-1";
1527 div.className = "page";
1528 while (articleContent.firstChild) {
1529 div.appendChild(articleContent.firstChild);
1530 }
1531 articleContent.appendChild(div);
1532 }
1533
1534 if (this._debug) {
1535 this.log("Article content after paging: " + articleContent.innerHTML);
1536 }
1537
1538 var parseSuccessful = true;
1539
1540 // Now that we've gone through the full algorithm, check to see if
1541 // we got any meaningful content. If we didn't, we may need to re-run
1542 // grabArticle with different flags set. This gives us a higher likelihood of
1543 // finding the content, and the sieve approach gives us a higher likelihood of
1544 // finding the -right- content.
1545 var textLength = this._getInnerText(articleContent, true).length;
1546 if (textLength < this._charThreshold) {
1547 parseSuccessful = false;
     // Undo this attempt's DOM changes before trying again.
1548 // eslint-disable-next-line no-unsanitized/property
1549 page.innerHTML = pageCacheHtml;
1550
1551 this._attempts.push({
1552 articleContent,
1553 textLength,
1554 });
1555
     // Relax exactly one flag per retry, in this fixed order.
1556 if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
1557 this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
1558 } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
1559 this._removeFlag(this.FLAG_WEIGHT_CLASSES);
1560 } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
1561 this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
1562 } else {
1563 // No luck after removing flags, just return the longest text we found during the different loops
1564 this._attempts.sort(function (a, b) {
1565 return b.textLength - a.textLength;
1566 });
1567
1568 // But first check if we actually have something
1569 if (!this._attempts[0].textLength) {
1570 return null;
1571 }
1572
1573 articleContent = this._attempts[0].articleContent;
1574 parseSuccessful = true;
1575 }
1576 }
1577
1578 if (parseSuccessful) {
1579 // Find out text direction from ancestors of final top candidate.
1580 var ancestors = [parentOfTopCandidate, topCandidate].concat(
1581 this._getNodeAncestors(parentOfTopCandidate)
1582 );
     // The first node in that list carrying a `dir` attribute decides this._articleDir.
1583 this._someNode(ancestors, function (ancestor) {
1584 if (!ancestor.tagName) {
1585 return false;
1586 }
1587 var articleDir = ancestor.getAttribute("dir");
1588 if (articleDir) {
1589 this._articleDir = articleDir;
1590 return true;
1591 }
1592 return false;
1593 });
1594 return articleContent;
1595 }
1596 }
1597 },
1598
1599 /**
1600 * Converts some of the common HTML entities in string to their corresponding characters.
1601 *
1602 * @param str {string} - a string to unescape.
1603 * @return string without HTML entity.
1604 */
1605 _unescapeHtmlEntities(str) {
1606 if (!str) {
1607 return str;
1608 }
1609
1610 var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1611 return str
1612 .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) {
1613 return htmlEscapeMap[tag];
1614 })
1615 .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) {
1616 var num = parseInt(hex || numStr, hex ? 16 : 10);
1617
1618 // these character references are replaced by a conforming HTML parser
1619 if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
1620 num = 0xfffd;
1621 }
1622
1623 return String.fromCodePoint(num);
1624 });
1625 },
1626
1627 /**
1628 * Try to extract metadata from JSON-LD object.
1629 * For now, only Schema.org objects of type Article or its subtypes are supported.
1630 * @return Object with any metadata that could be extracted (possibly none)
1631 */
1632 _getJSONLD(doc) {
1633 var scripts = this._getAllNodesWithTag(doc, ["script"]);
1634
1635 var metadata;
1636
1637 this._forEachNode(scripts, function (jsonLdElement) {
1638 if (
1639 !metadata &&
1640 jsonLdElement.getAttribute("type") === "application/ld+json"
1641 ) {
1642 try {
1643 // Strip CDATA markers if present
1644 var content = jsonLdElement.textContent.replace(
1645 /^\s*<!\[CDATA\[|\]\]>\s*$/g,
1646 ""
1647 );
1648 var parsed = JSON.parse(content);
1649
1650 if (Array.isArray(parsed)) {
1651 parsed = parsed.find(it => {
1652 return (
1653 it["@type"] &&
1654 it["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1655 );
1656 });
1657 if (!parsed) {
1658 return;
1659 }
1660 }
1661
1662 var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/;
1663 var matches =
1664 (typeof parsed["@context"] === "string" &&
1665 parsed["@context"].match(schemaDotOrgRegex)) ||
1666 (typeof parsed["@context"] === "object" &&
1667 typeof parsed["@context"]["@vocab"] == "string" &&
1668 parsed["@context"]["@vocab"].match(schemaDotOrgRegex));
1669
1670 if (!matches) {
1671 return;
1672 }
1673
1674 if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
1675 parsed = parsed["@graph"].find(it => {
1676 return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes);
1677 });
1678 }
1679
1680 if (
1681 !parsed ||
1682 !parsed["@type"] ||
1683 !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1684 ) {
1685 return;
1686 }
1687
1688 metadata = {};
1689
1690 if (
1691 typeof parsed.name === "string" &&
1692 typeof parsed.headline === "string" &&
1693 parsed.name !== parsed.headline
1694 ) {
1695 // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1696 // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1697 // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1698
1699 var title = this._getArticleTitle();
1700 var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1701 var headlineMatches =
1702 this._textSimilarity(parsed.headline, title) > 0.75;
1703
1704 if (headlineMatches && !nameMatches) {
1705 metadata.title = parsed.headline;
1706 } else {
1707 metadata.title = parsed.name;
1708 }
1709 } else if (typeof parsed.name === "string") {
1710 metadata.title = parsed.name.trim();
1711 } else if (typeof parsed.headline === "string") {
1712 metadata.title = parsed.headline.trim();
1713 }
1714 if (parsed.author) {
1715 if (typeof parsed.author.name === "string") {
1716 metadata.byline = parsed.author.name.trim();
1717 } else if (
1718 Array.isArray(parsed.author) &&
1719 parsed.author[0] &&
1720 typeof parsed.author[0].name === "string"
1721 ) {
1722 metadata.byline = parsed.author
1723 .filter(function (author) {
1724 return author && typeof author.name === "string";
1725 })
1726 .map(function (author) {
1727 return author.name.trim();
1728 })
1729 .join(", ");
1730 }
1731 }
1732 if (typeof parsed.description === "string") {
1733 metadata.excerpt = parsed.description.trim();
1734 }
1735 if (parsed.publisher && typeof parsed.publisher.name === "string") {
1736 metadata.siteName = parsed.publisher.name.trim();
1737 }
1738 if (typeof parsed.datePublished === "string") {
1739 metadata.datePublished = parsed.datePublished.trim();
1740 }
1741 } catch (err) {
1742 this.log(err.message);
1743 }
1744 }
1745 });
1746 return metadata ? metadata : {};
1747 },
1748
1749 /**
1750 * Attempts to get excerpt and byline metadata for the article.
1751 *
1752 * @param {Object} jsonld — object containing any metadata that
1753 * could be extracted from JSON-LD object.
1754 *
1755 * @return Object with optional "excerpt" and "byline" properties
1756 */
1757 _getArticleMetadata(jsonld) {
1758 var metadata = {};
1759 var values = {};
1760 var metaElements = this._doc.getElementsByTagName("meta");
1761
1762 // property is a space-separated list of values
1763 var propertyPattern =
1764 /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
1765
1766 // name is a single value
1767 var namePattern =
1768 /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
1769
1770 // Find description tags.
1771 this._forEachNode(metaElements, function (element) {
1772 var elementName = element.getAttribute("name");
1773 var elementProperty = element.getAttribute("property");
1774 var content = element.getAttribute("content");
1775 if (!content) {
1776 return;
1777 }
1778 var matches = null;
1779 var name = null;
1780
1781 if (elementProperty) {
1782 matches = elementProperty.match(propertyPattern);
1783 if (matches) {
1784 // Convert to lowercase, and remove any whitespace
1785 // so we can match below.
1786 name = matches[0].toLowerCase().replace(/\s/g, "");
1787 // multiple authors
1788 values[name] = content.trim();
1789 }
1790 }
1791 if (!matches && elementName && namePattern.test(elementName)) {
1792 name = elementName;
1793 if (content) {
1794 // Convert to lowercase, remove any whitespace, and convert dots
1795 // to colons so we can match below.
1796 name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
1797 values[name] = content.trim();
1798 }
1799 }
1800 });
1801
1802 // get title
1803 metadata.title =
1804 jsonld.title ||
1805 values["dc:title"] ||
1806 values["dcterm:title"] ||
1807 values["og:title"] ||
1808 values["weibo:article:title"] ||
1809 values["weibo:webpage:title"] ||
1810 values.title ||
1811 values["twitter:title"] ||
1812 values["parsely-title"];
1813
1814 if (!metadata.title) {
1815 metadata.title = this._getArticleTitle();
1816 }
1817
1818 const articleAuthor =
1819 typeof values["article:author"] === "string" &&
1820 !this._isUrl(values["article:author"])
1821 ? values["article:author"]
1822 : undefined;
1823
1824 // get author
1825 metadata.byline =
1826 jsonld.byline ||
1827 values["dc:creator"] ||
1828 values["dcterm:creator"] ||
1829 values.author ||
1830 values["parsely-author"] ||
1831 articleAuthor;
1832
1833 // get description
1834 metadata.excerpt =
1835 jsonld.excerpt ||
1836 values["dc:description"] ||
1837 values["dcterm:description"] ||
1838 values["og:description"] ||
1839 values["weibo:article:description"] ||
1840 values["weibo:webpage:description"] ||
1841 values.description ||
1842 values["twitter:description"];
1843
1844 // get site name
1845 metadata.siteName = jsonld.siteName || values["og:site_name"];
1846
1847 // get article published time
1848 metadata.publishedTime =
1849 jsonld.datePublished ||
1850 values["article:published_time"] ||
1851 values["parsely-pub-date"] ||
1852 null;
1853
1854 // in many sites the meta value is escaped with HTML entities,
1855 // so here we need to unescape it
1856 metadata.title = this._unescapeHtmlEntities(metadata.title);
1857 metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1858 metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1859 metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1860 metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1861
1862 return metadata;
1863 },
1864
1865 /**
1866 * Check if node is image, or if node contains exactly only one image
1867 * whether as a direct child or as its descendants.
1868 *
1869 * @param Element
1870 **/
1871 _isSingleImage(node) {
1872 while (node) {
1873 if (node.tagName === "IMG") {
1874 return true;
1875 }
1876 if (node.children.length !== 1 || node.textContent.trim() !== "") {
1877 return false;
1878 }
1879 node = node.children[0];
1880 }
1881 return false;
1882 },
1883
1884 /**
1885 * Find all <noscript> that are located after <img> nodes, and which contain only one
1886 * <img> element. Replace the first image with the image from inside the <noscript> tag,
1887 * and remove the <noscript> tag. This improves the quality of the images we use on
1888 * some sites (e.g. Medium).
1889 *
1890 * @param Element
1891 **/
1892 _unwrapNoscriptImages(doc) {
1893 // Find img without source or attributes that might contains image, and remove it.
1894 // This is done to prevent a placeholder img is replaced by img from noscript in next step.
1895 var imgs = Array.from(doc.getElementsByTagName("img"));
1896 this._forEachNode(imgs, function (img) {
1897 for (var i = 0; i < img.attributes.length; i++) {
1898 var attr = img.attributes[i];
1899 switch (attr.name) {
1900 case "src":
1901 case "srcset":
1902 case "data-src":
1903 case "data-srcset":
1904 return;
1905 }
1906
1907 if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1908 return;
1909 }
1910 }
1911
1912 img.remove();
1913 });
1914
1915 // Next find noscript and try to extract its image
1916 var noscripts = Array.from(doc.getElementsByTagName("noscript"));
1917 this._forEachNode(noscripts, function (noscript) {
1918 // Parse content of noscript and make sure it only contains image
1919 if (!this._isSingleImage(noscript)) {
1920 return;
1921 }
1922 var tmp = doc.createElement("div");
1923 // We're running in the document context, and using unmodified
1924 // document contents, so doing this should be safe.
1925 // (Also we heavily discourage people from allowing script to
1926 // run at all in this document...)
1927 // eslint-disable-next-line no-unsanitized/property
1928 tmp.innerHTML = noscript.innerHTML;
1929
1930 // If noscript has previous sibling and it only contains image,
1931 // replace it with noscript content. However we also keep old
1932 // attributes that might contains image.
1933 var prevElement = noscript.previousElementSibling;
1934 if (prevElement && this._isSingleImage(prevElement)) {
1935 var prevImg = prevElement;
1936 if (prevImg.tagName !== "IMG") {
1937 prevImg = prevElement.getElementsByTagName("img")[0];
1938 }
1939
1940 var newImg = tmp.getElementsByTagName("img")[0];
1941 for (var i = 0; i < prevImg.attributes.length; i++) {
1942 var attr = prevImg.attributes[i];
1943 if (attr.value === "") {
1944 continue;
1945 }
1946
1947 if (
1948 attr.name === "src" ||
1949 attr.name === "srcset" ||
1950 /\.(jpg|jpeg|png|webp)/i.test(attr.value)
1951 ) {
1952 if (newImg.getAttribute(attr.name) === attr.value) {
1953 continue;
1954 }
1955
1956 var attrName = attr.name;
1957 if (newImg.hasAttribute(attrName)) {
1958 attrName = "data-old-" + attrName;
1959 }
1960
1961 newImg.setAttribute(attrName, attr.value);
1962 }
1963 }
1964
1965 noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1966 }
1967 });
1968 },
1969
1970 /**
1971 * Removes script tags from the document.
1972 *
1973 * @param Element
1974 **/
1975 _removeScripts(doc) {
1976 this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
1977 },
1978
1979 /**
1980 * Check if this node has only whitespace and a single element with given tag
1981 * Returns false if the DIV node contains non-empty text nodes
1982 * or if it contains no element with given tag or more than 1 element.
1983 *
1984 * @param Element
1985 * @param string tag of child element
1986 **/
1987 _hasSingleTagInsideElement(element, tag) {
1988 // There should be exactly 1 element child with given tag
1989 if (element.children.length != 1 || element.children[0].tagName !== tag) {
1990 return false;
1991 }
1992
1993 // And there should be no text nodes with real content
1994 return !this._someNode(element.childNodes, function (node) {
1995 return (
1996 node.nodeType === this.TEXT_NODE &&
1997 this.REGEXPS.hasContent.test(node.textContent)
1998 );
1999 });
2000 },
2001
2002 _isElementWithoutContent(node) {
2003 return (
2004 node.nodeType === this.ELEMENT_NODE &&
2005 !node.textContent.trim().length &&
2006 (!node.children.length ||
2007 node.children.length ==
2008 node.getElementsByTagName("br").length +
2009 node.getElementsByTagName("hr").length)
2010 );
2011 },
2012
2013 /**
2014 * Determine whether element has any children block level elements.
2015 *
2016 * @param Element
2017 */
2018 _hasChildBlockElement(element) {
2019 return this._someNode(element.childNodes, function (node) {
2020 return (
2021 this.DIV_TO_P_ELEMS.has(node.tagName) ||
2022 this._hasChildBlockElement(node)
2023 );
2024 });
2025 },
2026
2027 /***
2028 * Determine if a node qualifies as phrasing content.
2029 * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
2030 **/
2031 _isPhrasingContent(node) {
2032 return (
2033 node.nodeType === this.TEXT_NODE ||
2034 this.PHRASING_ELEMS.includes(node.tagName) ||
2035 ((node.tagName === "A" ||
2036 node.tagName === "DEL" ||
2037 node.tagName === "INS") &&
2038 this._everyNode(node.childNodes, this._isPhrasingContent))
2039 );
2040 },
2041
2042 _isWhitespace(node) {
2043 return (
2044 (node.nodeType === this.TEXT_NODE &&
2045 node.textContent.trim().length === 0) ||
2046 (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR")
2047 );
2048 },
2049
/**
 * Get the inner text of a node in a cross-browser compatible way.
 * This also strips out any excess whitespace found in it.
 *
 * @param Element
 * @param Boolean normalizeSpaces (default: true)
 * @return string
 **/
2058 _getInnerText(e, normalizeSpaces) {
2059 normalizeSpaces =
2060 typeof normalizeSpaces === "undefined" ? true : normalizeSpaces;
2061 var textContent = e.textContent.trim();
2062
2063 if (normalizeSpaces) {
2064 return textContent.replace(this.REGEXPS.normalize, " ");
2065 }
2066 return textContent;
2067 },
2068
2069 /**
2070 * Get the number of times a string s appears in the node e.
2071 *
2072 * @param Element
2073 * @param string - what to split on. Default is ","
2074 * @return number (integer)
2075 **/
2076 _getCharCount(e, s) {
2077 s = s || ",";
2078 return this._getInnerText(e).split(s).length - 1;
2079 },
2080
2081 /**
2082 * Remove the style attribute on every e and under.
2083 * TODO: Test if getElementsByTagName(*) is faster.
2084 *
2085 * @param Element
2086 * @return void
2087 **/
2088 _cleanStyles(e) {
2089 if (!e || e.tagName.toLowerCase() === "svg") {
2090 return;
2091 }
2092
2093 // Remove `style` and deprecated presentational attributes
2094 for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
2095 e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
2096 }
2097
2098 if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) {
2099 e.removeAttribute("width");
2100 e.removeAttribute("height");
2101 }
2102
2103 var cur = e.firstElementChild;
2104 while (cur !== null) {
2105 this._cleanStyles(cur);
2106 cur = cur.nextElementSibling;
2107 }
2108 },
2109
2110 /**
2111 * Get the density of links as a percentage of the content
2112 * This is the amount of text that is inside a link divided by the total text in the node.
2113 *
2114 * @param Element
2115 * @return number (float)
2116 **/
2117 _getLinkDensity(element) {
2118 var textLength = this._getInnerText(element).length;
2119 if (textLength === 0) {
2120 return 0;
2121 }
2122
2123 var linkLength = 0;
2124
2125 // XXX implement _reduceNodeList?
2126 this._forEachNode(element.getElementsByTagName("a"), function (linkNode) {
2127 var href = linkNode.getAttribute("href");
2128 var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
2129 linkLength += this._getInnerText(linkNode).length * coefficient;
2130 });
2131
2132 return linkLength / textLength;
2133 },
2134
/**
 * Get an element's class/id weight. Uses regular expressions to tell if this
 * element looks good or bad.
 *
 * @param Element
 * @return number (Integer)
 **/
2142 _getClassWeight(e) {
2143 if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
2144 return 0;
2145 }
2146
2147 var weight = 0;
2148
2149 // Look for a special classname
2150 if (typeof e.className === "string" && e.className !== "") {
2151 if (this.REGEXPS.negative.test(e.className)) {
2152 weight -= 25;
2153 }
2154
2155 if (this.REGEXPS.positive.test(e.className)) {
2156 weight += 25;
2157 }
2158 }
2159
2160 // Look for a special ID
2161 if (typeof e.id === "string" && e.id !== "") {
2162 if (this.REGEXPS.negative.test(e.id)) {
2163 weight -= 25;
2164 }
2165
2166 if (this.REGEXPS.positive.test(e.id)) {
2167 weight += 25;
2168 }
2169 }
2170
2171 return weight;
2172 },
2173
2174 /**
2175 * Clean a node of all elements of type "tag".
2176 * (Unless it's a youtube/vimeo video. People love movies.)
2177 *
2178 * @param Element
2179 * @param string tag to clean
2180 * @return void
2181 **/
2182 _clean(e, tag) {
2183 var isEmbed = ["object", "embed", "iframe"].includes(tag);
2184
2185 this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) {
2186 // Allow youtube and vimeo videos through as people usually want to see those.
2187 if (isEmbed) {
2188 // First, check the elements attributes to see if any of them contain youtube or vimeo
2189 for (var i = 0; i < element.attributes.length; i++) {
2190 if (this._allowedVideoRegex.test(element.attributes[i].value)) {
2191 return false;
2192 }
2193 }
2194
2195 // For embed with <object> tag, check inner HTML as well.
2196 if (
2197 element.tagName === "object" &&
2198 this._allowedVideoRegex.test(element.innerHTML)
2199 ) {
2200 return false;
2201 }
2202 }
2203
2204 return true;
2205 });
2206 },
2207
2208 /**
2209 * Check if a given node has one of its ancestor tag name matching the
2210 * provided one.
2211 * @param HTMLElement node
2212 * @param String tagName
2213 * @param Number maxDepth
2214 * @param Function filterFn a filter to invoke to determine whether this node 'counts'
2215 * @return Boolean
2216 */
2217 _hasAncestorTag(node, tagName, maxDepth, filterFn) {
2218 maxDepth = maxDepth || 3;
2219 tagName = tagName.toUpperCase();
2220 var depth = 0;
2221 while (node.parentNode) {
2222 if (maxDepth > 0 && depth > maxDepth) {
2223 return false;
2224 }
2225 if (
2226 node.parentNode.tagName === tagName &&
2227 (!filterFn || filterFn(node.parentNode))
2228 ) {
2229 return true;
2230 }
2231 node = node.parentNode;
2232 depth++;
2233 }
2234 return false;
2235 },
2236
2237 /**
2238 * Return an object indicating how many rows and columns this table has.
2239 */
2240 _getRowAndColumnCount(table) {
2241 var rows = 0;
2242 var columns = 0;
2243 var trs = table.getElementsByTagName("tr");
2244 for (var i = 0; i < trs.length; i++) {
2245 var rowspan = trs[i].getAttribute("rowspan") || 0;
2246 if (rowspan) {
2247 rowspan = parseInt(rowspan, 10);
2248 }
2249 rows += rowspan || 1;
2250
2251 // Now look for column-related info
2252 var columnsInThisRow = 0;
2253 var cells = trs[i].getElementsByTagName("td");
2254 for (var j = 0; j < cells.length; j++) {
2255 var colspan = cells[j].getAttribute("colspan") || 0;
2256 if (colspan) {
2257 colspan = parseInt(colspan, 10);
2258 }
2259 columnsInThisRow += colspan || 1;
2260 }
2261 columns = Math.max(columns, columnsInThisRow);
2262 }
2263 return { rows, columns };
2264 },
2265
2266 /**
2267 * Look for 'data' (as opposed to 'layout') tables, for which we use
2268 * similar checks as
2269 * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
2270 */
2271 _markDataTables(root) {
2272 var tables = root.getElementsByTagName("table");
2273 for (var i = 0; i < tables.length; i++) {
2274 var table = tables[i];
2275 var role = table.getAttribute("role");
2276 if (role == "presentation") {
2277 table._readabilityDataTable = false;
2278 continue;
2279 }
2280 var datatable = table.getAttribute("datatable");
2281 if (datatable == "0") {
2282 table._readabilityDataTable = false;
2283 continue;
2284 }
2285 var summary = table.getAttribute("summary");
2286 if (summary) {
2287 table._readabilityDataTable = true;
2288 continue;
2289 }
2290
2291 var caption = table.getElementsByTagName("caption")[0];
2292 if (caption && caption.childNodes.length) {
2293 table._readabilityDataTable = true;
2294 continue;
2295 }
2296
2297 // If the table has a descendant with any of these tags, consider a data table:
2298 var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
2299 var descendantExists = function (tag) {
2300 return !!table.getElementsByTagName(tag)[0];
2301 };
2302 if (dataTableDescendants.some(descendantExists)) {
2303 this.log("Data table because found data-y descendant");
2304 table._readabilityDataTable = true;
2305 continue;
2306 }
2307
2308 // Nested tables indicate a layout table:
2309 if (table.getElementsByTagName("table")[0]) {
2310 table._readabilityDataTable = false;
2311 continue;
2312 }
2313
2314 var sizeInfo = this._getRowAndColumnCount(table);
2315
2316 if (sizeInfo.columns == 1 || sizeInfo.rows == 1) {
2317 // single colum/row tables are commonly used for page layout purposes.
2318 table._readabilityDataTable = false;
2319 continue;
2320 }
2321
2322 if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
2323 table._readabilityDataTable = true;
2324 continue;
2325 }
2326 // Now just go by size entirely:
2327 table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
2328 }
2329 },
2330
2331 /* convert images and figures that have properties like data-src into images that can be loaded without JS */
2332 _fixLazyImages(root) {
2333 this._forEachNode(
2334 this._getAllNodesWithTag(root, ["img", "picture", "figure"]),
2335 function (elem) {
2336 // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
2337 // So, here we check if the data uri is too short, just might as well remove it.
2338 if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
2339 // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
2340 var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
2341 if (parts[1] === "image/svg+xml") {
2342 return;
2343 }
2344
2345 // Make sure this element has other attributes which contains image.
2346 // If it doesn't, then this src is important and shouldn't be removed.
2347 var srcCouldBeRemoved = false;
2348 for (var i = 0; i < elem.attributes.length; i++) {
2349 var attr = elem.attributes[i];
2350 if (attr.name === "src") {
2351 continue;
2352 }
2353
2354 if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
2355 srcCouldBeRemoved = true;
2356 break;
2357 }
2358 }
2359
2360 // Here we assume if image is less than 100 bytes (or 133 after encoded to base64)
2361 // it will be too small, therefore it might be placeholder image.
2362 if (srcCouldBeRemoved) {
2363 var b64starts = parts[0].length;
2364 var b64length = elem.src.length - b64starts;
2365 if (b64length < 133) {
2366 elem.removeAttribute("src");
2367 }
2368 }
2369 }
2370
2371 // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
2372 if (
2373 (elem.src || (elem.srcset && elem.srcset != "null")) &&
2374 !elem.className.toLowerCase().includes("lazy")
2375 ) {
2376 return;
2377 }
2378
2379 for (var j = 0; j < elem.attributes.length; j++) {
2380 attr = elem.attributes[j];
2381 if (
2382 attr.name === "src" ||
2383 attr.name === "srcset" ||
2384 attr.name === "alt"
2385 ) {
2386 continue;
2387 }
2388 var copyTo = null;
2389 if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
2390 copyTo = "srcset";
2391 } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
2392 copyTo = "src";
2393 }
2394 if (copyTo) {
2395 //if this is an img or picture, set the attribute directly
2396 if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
2397 elem.setAttribute(copyTo, attr.value);
2398 } else if (
2399 elem.tagName === "FIGURE" &&
2400 !this._getAllNodesWithTag(elem, ["img", "picture"]).length
2401 ) {
2402 //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
2403 //see the nytimes-3 testcase for an example
2404 var img = this._doc.createElement("img");
2405 img.setAttribute(copyTo, attr.value);
2406 elem.appendChild(img);
2407 }
2408 }
2409 }
2410 }
2411 );
2412 },
2413
2414 _getTextDensity(e, tags) {
2415 var textLength = this._getInnerText(e, true).length;
2416 if (textLength === 0) {
2417 return 0;
2418 }
2419 var childrenLength = 0;
2420 var children = this._getAllNodesWithTag(e, tags);
2421 this._forEachNode(
2422 children,
2423 child => (childrenLength += this._getInnerText(child, true).length)
2424 );
2425 return childrenLength / textLength;
2426 },
2427
2428 /**
2429 * Clean an element of all tags of type "tag" if they look fishy.
2430 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2431 *
2432 * @return void
2433 **/
2434 _cleanConditionally(e, tag) {
2435 if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
2436 return;
2437 }
2438
2439 // Gather counts for other typical elements embedded within.
2440 // Traverse backwards so we can remove nodes at the same time
2441 // without effecting the traversal.
2442 //
2443 // TODO: Consider taking into account original contentScore here.
2444 this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
2445 // First check if this node IS data table, in which case don't remove it.
2446 var isDataTable = function (t) {
2447 return t._readabilityDataTable;
2448 };
2449
2450 var isList = tag === "ul" || tag === "ol";
2451 if (!isList) {
2452 var listLength = 0;
2453 var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
2454 this._forEachNode(
2455 listNodes,
2456 list => (listLength += this._getInnerText(list).length)
2457 );
2458 isList = listLength / this._getInnerText(node).length > 0.9;
2459 }
2460
2461 if (tag === "table" && isDataTable(node)) {
2462 return false;
2463 }
2464
2465 // Next check if we're inside a data table, in which case don't remove it as well.
2466 if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
2467 return false;
2468 }
2469
2470 if (this._hasAncestorTag(node, "code")) {
2471 return false;
2472 }
2473
2474 // keep element if it has a data tables
2475 if (
2476 [...node.getElementsByTagName("table")].some(
2477 tbl => tbl._readabilityDataTable
2478 )
2479 ) {
2480 return false;
2481 }
2482
2483 var weight = this._getClassWeight(node);
2484
2485 this.log("Cleaning Conditionally", node);
2486
2487 var contentScore = 0;
2488
2489 if (weight + contentScore < 0) {
2490 return true;
2491 }
2492
2493 if (this._getCharCount(node, ",") < 10) {
2494 // If there are not very many commas, and the number of
2495 // non-paragraph elements is more than paragraphs or other
2496 // ominous signs, remove the element.
2497 var p = node.getElementsByTagName("p").length;
2498 var img = node.getElementsByTagName("img").length;
2499 var li = node.getElementsByTagName("li").length - 100;
2500 var input = node.getElementsByTagName("input").length;
2501 var headingDensity = this._getTextDensity(node, [
2502 "h1",
2503 "h2",
2504 "h3",
2505 "h4",
2506 "h5",
2507 "h6",
2508 ]);
2509
2510 var embedCount = 0;
2511 var embeds = this._getAllNodesWithTag(node, [
2512 "object",
2513 "embed",
2514 "iframe",
2515 ]);
2516
2517 for (var i = 0; i < embeds.length; i++) {
2518 // If this embed has attribute that matches video regex, don't delete it.
2519 for (var j = 0; j < embeds[i].attributes.length; j++) {
2520 if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2521 return false;
2522 }
2523 }
2524
2525 // For embed with <object> tag, check inner HTML as well.
2526 if (
2527 embeds[i].tagName === "object" &&
2528 this._allowedVideoRegex.test(embeds[i].innerHTML)
2529 ) {
2530 return false;
2531 }
2532
2533 embedCount++;
2534 }
2535
2536 var innerText = this._getInnerText(node);
2537
2538 // toss any node whose inner text contains nothing but suspicious words
2539 if (
2540 this.REGEXPS.adWords.test(innerText) ||
2541 this.REGEXPS.loadingWords.test(innerText)
2542 ) {
2543 return true;
2544 }
2545
2546 var contentLength = innerText.length;
2547 var linkDensity = this._getLinkDensity(node);
2548 var textishTags = ["SPAN", "LI", "TD"].concat(
2549 Array.from(this.DIV_TO_P_ELEMS)
2550 );
2551 var textDensity = this._getTextDensity(node, textishTags);
2552 var isFigureChild = this._hasAncestorTag(node, "figure");
2553
2554 // apply shadiness checks, then check for exceptions
2555 const shouldRemoveNode = () => {
2556 const errs = [];
2557 if (!isFigureChild && img > 1 && p / img < 0.5) {
2558 errs.push(`Bad p to img ratio (img=${img}, p=${p})`);
2559 }
2560 if (!isList && li > p) {
2561 errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`);
2562 }
2563 if (input > Math.floor(p / 3)) {
2564 errs.push(`Too many inputs per p. (input=${input}, p=${p})`);
2565 }
2566 if (
2567 !isList &&
2568 !isFigureChild &&
2569 headingDensity < 0.9 &&
2570 contentLength < 25 &&
2571 (img === 0 || img > 2) &&
2572 linkDensity > 0
2573 ) {
2574 errs.push(
2575 `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})`
2576 );
2577 }
2578 if (
2579 !isList &&
2580 weight < 25 &&
2581 linkDensity > 0.2 + this._linkDensityModifier
2582 ) {
2583 errs.push(
2584 `Low weight and a little linky. (linkDensity=${linkDensity})`
2585 );
2586 }
2587 if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) {
2588 errs.push(
2589 `High weight and mostly links. (linkDensity=${linkDensity})`
2590 );
2591 }
2592 if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
2593 errs.push(
2594 `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`
2595 );
2596 }
2597 if (img === 0 && textDensity === 0) {
2598 errs.push(
2599 `No useful content. (img=${img}, textDensity=${textDensity})`
2600 );
2601 }
2602
2603 if (errs.length) {
2604 this.log("Checks failed", errs);
2605 return true;
2606 }
2607
2608 return false;
2609 };
2610
2611 var haveToRemove = shouldRemoveNode();
2612
2613 // Allow simple lists of images to remain in pages
2614 if (isList && haveToRemove) {
2615 for (var x = 0; x < node.children.length; x++) {
2616 let child = node.children[x];
2617 // Don't filter in lists with li's that contain more than one child
2618 if (child.children.length > 1) {
2619 return haveToRemove;
2620 }
2621 }
2622 let li_count = node.getElementsByTagName("li").length;
2623 // Only allow the list to remain if every li contains an image
2624 if (img == li_count) {
2625 return false;
2626 }
2627 }
2628 return haveToRemove;
2629 }
2630 return false;
2631 });
2632 },
2633
2634 /**
2635 * Clean out elements that match the specified conditions
2636 *
2637 * @param Element
2638 * @param Function determines whether a node should be removed
2639 * @return void
2640 **/
2641 _cleanMatchedNodes(e, filter) {
2642 var endOfSearchMarkerNode = this._getNextNode(e, true);
2643 var next = this._getNextNode(e);
2644 while (next && next != endOfSearchMarkerNode) {
2645 if (filter.call(this, next, next.className + " " + next.id)) {
2646 next = this._removeAndGetNext(next);
2647 } else {
2648 next = this._getNextNode(next);
2649 }
2650 }
2651 },
2652
2653 /**
2654 * Clean out spurious headers from an Element.
2655 *
2656 * @param Element
2657 * @return void
2658 **/
2659 _cleanHeaders(e) {
2660 let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
2661 this._removeNodes(headingNodes, function (node) {
2662 let shouldRemove = this._getClassWeight(node) < 0;
2663 if (shouldRemove) {
2664 this.log("Removing header with low class weight:", node);
2665 }
2666 return shouldRemove;
2667 });
2668 },
2669
2670 /**
2671 * Check if this node is an H1 or H2 element whose content is mostly
2672 * the same as the article title.
2673 *
2674 * @param Element the node to check.
2675 * @return boolean indicating whether this is a title-like header.
2676 */
2677 _headerDuplicatesTitle(node) {
2678 if (node.tagName != "H1" && node.tagName != "H2") {
2679 return false;
2680 }
2681 var heading = this._getInnerText(node, false);
2682 this.log("Evaluating similarity of header:", heading, this._articleTitle);
2683 return this._textSimilarity(this._articleTitle, heading) > 0.75;
2684 },
2685
2686 _flagIsActive(flag) {
2687 return (this._flags & flag) > 0;
2688 },
2689
2690 _removeFlag(flag) {
2691 this._flags = this._flags & ~flag;
2692 },
2693
2694 _isProbablyVisible(node) {
2695 // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes.
2696 return (
2697 (!node.style || node.style.display != "none") &&
2698 (!node.style || node.style.visibility != "hidden") &&
2699 !node.hasAttribute("hidden") &&
2700 //check for "fallback-image" so that wikimedia math images are displayed
2701 (!node.hasAttribute("aria-hidden") ||
2702 node.getAttribute("aria-hidden") != "true" ||
2703 (node.className &&
2704 node.className.includes &&
2705 node.className.includes("fallback-image")))
2706 );
2707 },
2708
2709 /**
2710 * Runs readability.
2711 *
2712 * Workflow:
2713 * 1. Prep the document by removing script tags, css, etc.
2714 * 2. Build readability's DOM tree.
2715 * 3. Grab the article content from the current dom tree.
2716 * 4. Replace the current DOM tree with the new one.
2717 * 5. Read peacefully.
2718 *
2719 * @return void
2720 **/
2721 parse() {
2722 // Avoid parsing too large documents, as per configuration option
2723 if (this._maxElemsToParse > 0) {
2724 var numTags = this._doc.getElementsByTagName("*").length;
2725 if (numTags > this._maxElemsToParse) {
2726 throw new Error(
2727 "Aborting parsing document; " + numTags + " elements found"
2728 );
2729 }
2730 }
2731
2732 // Unwrap image from noscript
2733 this._unwrapNoscriptImages(this._doc);
2734
2735 // Extract JSON-LD metadata before removing scripts
2736 var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
2737
2738 // Remove script tags from the document.
2739 this._removeScripts(this._doc);
2740
2741 this._prepDocument();
2742
2743 var metadata = this._getArticleMetadata(jsonLd);
2744 this._metadata = metadata;
2745 this._articleTitle = metadata.title;
2746
2747 var articleContent = this._grabArticle();
2748 if (!articleContent) {
2749 return null;
2750 }
2751
2752 this.log("Grabbed: " + articleContent.innerHTML);
2753
2754 this._postProcessContent(articleContent);
2755
2756 // If we haven't found an excerpt in the article's metadata, use the article's
2757 // first paragraph as the excerpt. This is used for displaying a preview of
2758 // the article's content.
2759 if (!metadata.excerpt) {
2760 var paragraphs = articleContent.getElementsByTagName("p");
2761 if (paragraphs.length) {
2762 metadata.excerpt = paragraphs[0].textContent.trim();
2763 }
2764 }
2765
2766 var textContent = articleContent.textContent;
2767 return {
2768 title: this._articleTitle,
2769 byline: metadata.byline || this._articleByline,
2770 dir: this._articleDir,
2771 lang: this._articleLang,
2772 content: this._serializer(articleContent),
2773 textContent,
2774 length: textContent.length,
2775 excerpt: metadata.excerpt,
2776 siteName: metadata.siteName || this._articleSiteName,
2777 publishedTime: metadata.publishedTime,
2778 };
2779 },
2780};
2781
// Expose the constructor through CommonJS when a `module` object exists;
// otherwise nothing is exported here.
if (typeof module === "object") {
  /* eslint-disable-next-line no-redeclare */
  /* global module */
  module.exports = Readability;
}