A browser extension that lets you summarize any webpage and ask questions using AI.
at main 659 lines 19 kB view raw
// Content script: Mozilla Readability for articles, legacy DOM walk for feeds / non-articles

(function () {
  "use strict";

  // Prevent multiple injections
  if (window.__webaiExtractorInstalled) {
    return;
  }
  window.__webaiExtractorInstalled = true;

  /** Hard cap (in UTF-16 code units) on any extracted text returned to the caller. */
  const MAX_LENGTH =
    typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH
      ? CONFIG.EXTRACTION.MAX_LENGTH
      : 50000;

  /**
   * Readability `textContent` length (article body only). Above this, treat as a real article
   * extract. Do NOT use total formatted string length — title/metadata can exceed 500 chars alone.
   * Kept moderately high so homepages don't classify sponsor blocks as "articles".
   */
  const READABILITY_STRONG_BODY_CHARS = 560;

  /**
   * When body is shorter, still trust Readability if the body is a large share of our formatted
   * output (not mostly "Title:/Description:" boilerplate from a feed shell).
   */
  const READABILITY_MIN_BODY_CHARS = 120;

  /** Minimum body / formatted-text ratio for the "substantial body" heuristic. */
  const READABILITY_MIN_CONTENT_RATIO = 0.34;

  /**
   * If Readability output is shorter than this, compare with legacy: full-page text often
   * dwarfs a homepage "article" grab (e.g. link aggregators).
   */
  const READABILITY_SHORT_EXTRACT_MAX = 2200;

  /**
   * When Readability extract is "short", prefer legacy if the DOM walk yields this many times
   * more text (typical feed / homepage vs a thin Readability pick).
   */
  const READABILITY_LEGACY_DOMINANCE_RATIO = 3.25;

  /** Prefer legacy when the grab looks like promos / feed chrome, not prose. */
  const READABILITY_BIAS_LEGACY_REGEXES = [
    /\bSponsor(?:ed)?\s+Posts?\b/i,
    /\bPromoted\s+(?:Stories?|Posts?|Content)\b/i,
    /\bAdvertiser\s+Content\b/i,
    /\bPaid\s+Partnership\b/i,
  ];

  /**
   * Reports whether the formatted Readability output contains promo / sponsor phrasing.
   *
   * @param {string} formattedText - Full formatted Readability output (metadata + body).
   * @returns {boolean} True when any bias regex matches.
   */
  function readabilityTextBiasLegacy(formattedText) {
    return READABILITY_BIAS_LEGACY_REGEXES.some((re) => re.test(formattedText));
  }

  /**
   * Short Readability extract + huge legacy output ⇒ likely listing/homepage, not an article.
   * Skip when body is clearly long-form anyway (avoid flipping concise real articles on noisy DOMs).
   *
   * @param {{text: string, bodyTextLen: number}} readability - Result of extractWithReadability().
   * @returns {{text: string, wasTruncated: boolean}|null} The legacy result when it dominates,
   *   otherwise null (keep Readability).
   */
  function tryLegacyWhenShortReadOverwhelmed(readability) {
    const readLen = readability.text.length;
    if (readLen > READABILITY_SHORT_EXTRACT_MAX) {
      return null;
    }
    if (readability.bodyTextLen >= READABILITY_STRONG_BODY_CHARS + 400) {
      return null;
    }
    const legacy = extractLegacy();
    if (legacy.text.length > readLen * READABILITY_LEGACY_DOMINANCE_RATIO) {
      return legacy;
    }
    return null;
  }

  /** Tags whose elements the legacy DOM walk never descends into. */
  const EXCLUDE_TAGS = [
    "script",
    "style",
    "noscript",
    "iframe",
    "embed",
    "object",
    "frame",
    "nav",
    "aside",
    "form",
    "button",
    "input",
  ];

  /**
   * Subtrees to omit from plain-text collection. `textContent` includes script/style/template
   * bodies, which pulls in Astro/React/Vite inline bundles when walking `main div` etc.
   * Also skip embed-like tags (iframe/object/embed) so news players do not dump URLs or attrs.
   */
  const TEXT_SUBTREE_EXCLUDE_TAGS = new Set([...EXCLUDE_TAGS, "template"]);

  /** Tags whose text is padded with spaces so adjacent blocks do not fuse into one word. */
  const SPACE_PADDED_TAGS = new Set(["br", "p", "div", "li"]);

  /**
   * Class / id tokens that mark page chrome. Each pattern is anchored and is meant to be
   * tested against ONE class or id token, never against the whole attribute string.
   */
  const STRICT_NOISE_PATTERNS = [
    /^nav$/,
    /-nav$/,
    /^nav-/,
    /^navigation$/,
    /^footer$/,
    /-footer$/,
    /^footer-/,
    /^header$/,
    /^site-header$/,
    /^page-header$/,
    /^sidebar$/,
    /^advertisement$/,
    /^ad-container$/,
  ];

  /** Substrings of a class or id that mark an element as primary content. */
  const CONTENT_PATTERNS = [
    "content",
    "main-content",
    "article-content",
    "post-content",
    "entry-content",
    "page-content",
    "story-content",
    "body-content",
    "article",
    "post",
    "entry",
    "story",
    "main",
  ];

  /**
   * @param {string} tagName - Element tag name in any case.
   * @returns {boolean} True when the element's subtree must not contribute text.
   */
  function shouldExcludeTextSubtree(tagName) {
    return TEXT_SUBTREE_EXCLUDE_TAGS.has(tagName.toLowerCase());
  }

  /**
   * Strip embed markup that appears as literal text in article bodies (CMS/oEmbed fallbacks).
   * Complements subtree skipping — some sites still surface tags as visible copy.
   *
   * @param {*} text - Candidate text; falsy or non-string values are returned unchanged.
   * @returns {*} The text with literal iframe/embed/object markup replaced by newlines.
   */
  function sanitizeLiteralEmbedMarkup(text) {
    if (!text || typeof text !== "string") {
      return text;
    }
    // Quantifiers are bounded so a stray unclosed tag cannot trigger a runaway match.
    let t = text;
    t = t.replace(/<iframe\b[\s\S]{0,20000}?<\/iframe>/gi, "\n");
    t = t.replace(/<iframe\b[^>]{0,8000}\/?>/gi, "\n");
    t = t.replace(/<\/iframe>/gi, "");
    t = t.replace(/<embed\b[^>]{0,8000}\/?>/gi, "\n");
    t = t.replace(/<object\b[\s\S]{0,20000}?<\/object>/gi, "\n");
    return t;
  }

  /**
   * Runs Readability on a clone of the document and formats the result as
   * metadata lines, a `---` separator, then the normalized article body.
   *
   * @returns {{text: string, wasTruncated: boolean, articleNull: boolean, bodyTextLen: number}}
   *   `articleNull` is true when Readability.parse() returned nothing; `bodyTextLen` is the
   *   length of the normalized body alone (before truncation, excluding metadata).
   */
  function extractWithReadability() {
    // Parse a clone: the live page must not be mutated by the parser.
    const documentClone = document.cloneNode(true);
    const reader = new Readability(documentClone);
    const article = reader.parse();

    if (!article) {
      return {
        text: "",
        wasTruncated: false,
        articleNull: true,
        bodyTextLen: 0,
      };
    }

    let extractedText = "";

    if (article.title) {
      extractedText += `Title: ${article.title}\n\n`;
    }

    if (article.byline) {
      extractedText += `Author: ${article.byline}\n\n`;
    }

    if (article.excerpt && article.excerpt !== article.title) {
      extractedText += `Description: ${article.excerpt}\n\n`;
    }

    if (article.publishedTime) {
      extractedText += `Published: ${article.publishedTime}\n\n`;
    }

    if (article.siteName) {
      extractedText += `Source: ${article.siteName}\n\n`;
    }

    if (extractedText) {
      extractedText += "---\n\n";
    }

    // Same sanitize + whitespace normalization as the legacy path; truncation is applied
    // below on the full formatted string instead.
    const content = cleanExtractedText(article.textContent || "", false);
    extractedText += content;

    const bodyTextLen = content.length;

    let wasTruncated = false;
    if (extractedText.length > MAX_LENGTH) {
      wasTruncated = true;
      extractedText = extractedText.substring(0, MAX_LENGTH);
    }

    return {
      text: extractedText,
      wasTruncated,
      articleNull: false,
      bodyTextLen,
    };
  }

  // --- Legacy extraction (pre-Readability): structured body walk + selector fallback ---

  /**
   * Collects text from common article selectors, then from every `<p>` if that yields
   * under 500 characters. Entries are de-duplicated on their first 100 characters.
   *
   * @returns {{text: string, wasTruncated: boolean}} Text capped at MAX_LENGTH.
   */
  function extractLegacySelectorsFallback() {
    const selectors = [
      "article p",
      "article div",
      ".content p",
      ".content div",
      ".post-content p",
      ".entry-content p",
      ".article-body p",
      "main p",
      "main div",
      '[role="main"] p',
      ".story p",
      ".story-body p",
      "#story p",
    ];

    let text = "";
    let wasTruncated = false;
    const seen = new Set();

    // Appends the element's text when it is long enough, not yet seen and visible.
    const appendIfUseful = (el, minLength) => {
      const content = getTextContent(el).trim();
      const key = content.substring(0, 100);
      if (content.length < minLength || seen.has(key)) {
        return;
      }

      const style = window.getComputedStyle(el);
      if (style.display === "none" || style.visibility === "hidden") {
        return;
      }

      seen.add(key);
      text += `${content}\n\n`;

      if (text.length > MAX_LENGTH) {
        wasTruncated = true;
      }
    };

    for (const selector of selectors) {
      try {
        for (const el of document.querySelectorAll(selector)) {
          appendIfUseful(el, 20);
          if (wasTruncated) break;
        }
      } catch (e) {
        // Best-effort: a selector that fails is skipped so the remaining ones still run.
      }
      if (wasTruncated) break;
    }

    if (text.length < 500 && !wasTruncated) {
      for (const p of document.querySelectorAll("p")) {
        // Bare paragraphs need more than 30 characters to count.
        appendIfUseful(p, 31);
        if (wasTruncated) break;
      }
    }

    return { text: text.substring(0, MAX_LENGTH), wasTruncated };
  }

  /**
   * Reads an element's class and id as lowercase strings. `className` is an object with
   * `baseVal` rather than a string on some elements (e.g. SVG), so both shapes are handled.
   *
   * @param {Element} el
   * @returns {{className: string, id: string}}
   */
  function readLowerClassAndId(el) {
    const asText = (value) => {
      if (!value) return "";
      if (typeof value === "string") return value;
      return String(value.baseVal || "");
    };
    return {
      className: asText(el.className).toLowerCase(),
      id: asText(el.id).toLowerCase(),
    };
  }

  /**
   * Decides whether the legacy DOM walk should ignore an element and its subtree.
   * Order matters: excluded tags and hidden elements are always skipped; after that an
   * element recognized as main content is always kept, even if it carries a noisy role/class.
   *
   * @param {Element} el
   * @returns {boolean} True when the element should be skipped.
   */
  function shouldSkipElement(el) {
    const tag = el.tagName.toLowerCase();
    if (EXCLUDE_TAGS.includes(tag)) {
      return true;
    }

    try {
      const style = window.getComputedStyle(el);
      if (
        style.display === "none" ||
        style.visibility === "hidden" ||
        style.opacity === "0"
      ) {
        return true;
      }
    } catch (e) {
      // Style lookup is best-effort; treat the element as visible when it fails.
    }

    if (isMainContent(el)) return false;

    const role = el.getAttribute("role");
    if (role === "navigation" || role === "banner" || role === "complementary") {
      return true;
    }

    // Match each class/id token on its own: the patterns are anchored, so testing the joined
    // "class id" string would miss e.g. class="sidebar widget" or class="x" id="footer".
    const { className, id } = readLowerClassAndId(el);
    const tokens = `${className} ${id}`.split(/\s+/).filter(Boolean);
    return tokens.some((token) =>
      STRICT_NOISE_PATTERNS.some((pattern) => pattern.test(token)),
    );
  }

  /**
   * Heuristic for "this element holds primary content": main/article tag or role, or a
   * class/id containing one of CONTENT_PATTERNS as a substring.
   *
   * @param {Element} element
   * @returns {boolean}
   */
  function isMainContent(element) {
    const role = element.getAttribute("role");
    const tagName = element.tagName.toLowerCase();
    const { className, id } = readLowerClassAndId(element);

    const isContentClass = CONTENT_PATTERNS.some(
      (p) => className.includes(p) || id.includes(p),
    );

    return (
      role === "main" ||
      role === "article" ||
      tagName === "main" ||
      tagName === "article" ||
      isContentClass
    );
  }

  /**
   * Recursively walks an element's children and renders headings, paragraphs, list items
   * and code blocks as lightweight Markdown-like text. Other elements are recursed into.
   *
   * @param {Element} element - Root of the subtree to walk.
   * @param {number} [depth=0] - Recursion depth; used for list indentation and to suppress
   *   direct text of the root element itself.
   * @returns {string} Unnormalized text (callers run cleanExtractedText on it).
   */
  function extractTextFromElement(element, depth = 0) {
    let text = "";
    const indent = " ".repeat(depth);

    // Loose text sitting directly inside a container (not wrapped in p/li/etc.).
    const directText = getDirectTextContent(element).trim();
    if (directText.length > 20 && depth > 0) {
      text += `${directText}\n\n`;
    }

    for (const child of element.children) {
      const childTag = child.tagName.toLowerCase();

      if (shouldSkipElement(child)) continue;

      if (/^h[1-6]$/.test(childTag)) {
        const headingText = getTextContent(child).trim();
        if (headingText) {
          const prefix = "#".repeat(parseInt(childTag[1], 10));
          text += `\n${prefix} ${headingText}\n\n`;
        }
      } else if (childTag === "p") {
        const pText = getTextContent(child).trim();
        if (pText.length > 5) {
          text += `${pText}\n\n`;
        }
      } else if (childTag === "li") {
        const liText = getTextContent(child).trim();
        if (liText) {
          text += `${indent}- ${liText}\n`;
        }
      } else if (childTag === "pre" || childTag === "code") {
        const codeText = getTextContent(child).trim();
        if (codeText) {
          text += `\n\`\`\`\n${codeText}\n\`\`\`\n\n`;
        }
      } else {
        const childText = extractTextFromElement(child, depth + 1);
        if (childText.trim()) {
          text += childText;
        }
      }
    }

    return text;
  }

  /**
   * @param {Element} element
   * @returns {string} Trimmed concatenation of the element's own text nodes (no descendants).
   */
  function getDirectTextContent(element) {
    let text = "";
    for (const node of element.childNodes) {
      if (node.nodeType === Node.TEXT_NODE) {
        text += node.textContent;
      }
    }
    return text.trim();
  }

  /**
   * Like `textContent`, but omits subtrees listed in TEXT_SUBTREE_EXCLUDE_TAGS and pads
   * block-ish elements (br/p/div/li) with spaces.
   *
   * @param {Element} element
   * @returns {string} Untrimmed text of the element.
   */
  function getTextContent(element) {
    let text = "";

    for (const node of element.childNodes) {
      if (node.nodeType === Node.TEXT_NODE) {
        text += node.textContent;
      } else if (node.nodeType === Node.ELEMENT_NODE) {
        const tagName = node.tagName.toLowerCase();

        if (shouldExcludeTextSubtree(tagName)) {
          continue;
        }

        if (SPACE_PADDED_TAGS.has(tagName)) {
          text += ` ${getTextContent(node)} `;
        } else {
          text += getTextContent(node);
        }
      }
    }

    return text;
  }

  /**
   * Strips literal embed markup, collapses runs of non-newline whitespace to one space,
   * limits blank-line runs to a single blank line and trims the ends.
   *
   * @param {string} text - Raw extracted text.
   * @param {boolean} [shouldTruncate=true] - When true, cap the result at MAX_LENGTH.
   * @returns {string} Normalized text.
   */
  function cleanExtractedText(text, shouldTruncate = true) {
    let cleaned = sanitizeLiteralEmbedMarkup(text);
    cleaned = cleaned
      .replace(/[^\S\n]+/g, " ")
      .replace(/\n{3,}/g, "\n\n")
      .trim();

    if (shouldTruncate && cleaned.length > MAX_LENGTH) {
      cleaned = cleaned.substring(0, MAX_LENGTH);
    }

    return cleaned;
  }

  /**
   * Full-document extraction used before Readability. Works well for feeds, homepages,
   * and app-like pages where Readability returns nothing or very little.
   *
   * @returns {{text: string, wasTruncated: boolean}} Text is never longer than MAX_LENGTH.
   */
  function extractLegacy() {
    if (!document.body) {
      return { text: document.title || "", wasTruncated: false };
    }

    let extractedText = "";
    let wasTruncated = false;

    const title = document.title || "";
    if (title) {
      extractedText += `Title: ${title}\n\n`;
    }

    const metaDesc = document.querySelector('meta[name="description"]');
    if (metaDesc) {
      const desc = metaDesc.getAttribute("content");
      if (desc) {
        extractedText += `Description: ${desc}\n\n`;
      }
    }

    extractedText += extractTextFromElement(document.body);

    // Truncate here rather than inside cleanExtractedText so wasTruncated can be recorded.
    extractedText = cleanExtractedText(extractedText, false);

    if (extractedText.length > MAX_LENGTH) {
      wasTruncated = true;
      extractedText = extractedText.substring(0, MAX_LENGTH);
    }

    if (extractedText.length < 1000) {
      const fallbackResult = extractLegacySelectorsFallback();
      if (fallbackResult.text.length > extractedText.length) {
        // The fallback text is already capped at MAX_LENGTH, so adding the title prefix can
        // push it over the cap: re-cap and report it. Omit the prefix when there is no title.
        const prefix = title ? `Title: ${title}\n\n` : "";
        const combined = `${prefix}${fallbackResult.text}`;
        wasTruncated =
          fallbackResult.wasTruncated || combined.length > MAX_LENGTH;
        extractedText = combined.substring(0, MAX_LENGTH);
      }
    }

    return { text: extractedText, wasTruncated };
  }

  /** Legacy debug — uncomment body to log to the tab's DevTools (page context). */
  function logExtractionDebug(source, text) {
    // console.log("[Lede DEBUG] extraction:", source);
    // console.log(
    //   "[Lede DEBUG] raw extracted text (" + text.length + " chars):",
    //   text,
    // );
  }

  /**
   * Delegates to `window.__webaiTryRedditNew` when some other script has installed it.
   * NOTE(review): the hook's return shape is defined outside this file; extractContent()
   * reads `text`, `wasTruncated`, `extractionSource` and `unsupportedReason` from it.
   *
   * @returns {object|null} The hook's result, or null when absent or when it throws.
   */
  function tryRedditShredditExtract() {
    if (typeof window.__webaiTryRedditNew !== "function") {
      return null;
    }
    try {
      return window.__webaiTryRedditNew(MAX_LENGTH);
    } catch (err) {
      console.error("[Lede] Reddit extract error:", err);
      return null;
    }
  }

  /**
   * Logs and shapes one extraction outcome.
   *
   * @param {{text: string, wasTruncated: boolean}} picked - Chosen extraction.
   * @param {string} source - "legacy" or "readability".
   * @returns {{text: string, wasTruncated: boolean, extractionSource: string}}
   */
  function buildResult(picked, source) {
    logExtractionDebug(source, picked.text);
    return {
      text: picked.text,
      wasTruncated: picked.wasTruncated,
      extractionSource: source,
    };
  }

  /**
   * Picks the best extraction for the current page: the Reddit hook when it yields a result,
   * otherwise Readability vs. the legacy DOM walk according to the heuristics above.
   * Any exception on the Readability path falls back to legacy.
   *
   * @returns {{text: string, wasTruncated: boolean, extractionSource: string,
   *   unsupportedReason?: *}}
   */
  function extractContent() {
    const redditResult = tryRedditShredditExtract();
    if (redditResult) {
      logExtractionDebug(redditResult.extractionSource, redditResult.text);
      return {
        text: redditResult.text,
        wasTruncated: Boolean(redditResult.wasTruncated),
        extractionSource: redditResult.extractionSource,
        unsupportedReason: redditResult.unsupportedReason,
      };
    }

    try {
      const readability = extractWithReadability();

      if (readability.articleNull || readability.text.trim().length === 0) {
        return buildResult(extractLegacy(), "legacy");
      }

      const bodyLen = readability.bodyTextLen;
      const contentRatio = bodyLen / Math.max(readability.text.length, 1);
      const biasedToLegacy = readabilityTextBiasLegacy(readability.text);

      // Long article body — Readability clearly won (unless sponsor/promo bias sent us to legacy).
      const strongArticleBody =
        bodyLen >= READABILITY_STRONG_BODY_CHARS && !biasedToLegacy;
      // Short pages / stubs: body is still most of what we output (not metadata padding).
      const ratioLooksLikeArticle =
        bodyLen >= READABILITY_MIN_BODY_CHARS &&
        contentRatio >= READABILITY_MIN_CONTENT_RATIO &&
        !biasedToLegacy;

      if (strongArticleBody || ratioLooksLikeArticle) {
        const flip = tryLegacyWhenShortReadOverwhelmed(readability);
        return flip
          ? buildResult(flip, "legacy")
          : buildResult(readability, "readability");
      }

      // Marginal Readability (feed hero, cookie copy, tiny grab): compare with legacy and keep
      // whichever produced more text. The former tiny-body, low-ratio (legacy > 1.4x) and
      // default branches all reduce to this single comparison, since the Readability text is
      // non-empty here and 1.4x longer implies longer.
      const legacy = extractLegacy();
      const useLegacy = legacy.text.length > readability.text.length;
      return useLegacy
        ? buildResult(legacy, "legacy")
        : buildResult(readability, "readability");
    } catch (error) {
      console.error("[Lede] Readability error:", error);
      return buildResult(extractLegacy(), "legacy");
    }
  }

  chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    if (request?.action !== "extract") {
      // Not ours: do not claim the response channel, so other listeners can answer and the
      // sender is not left waiting on a reply that never comes.
      return false;
    }

    const result = extractContent();
    sendResponse({
      content: result.text,
      wasTruncated: result.wasTruncated,
      extractionSource: result.extractionSource,
      unsupportedReason: result.unsupportedReason,
    });
    // The response was already sent synchronously; `true` is kept only to match the
    // previous return value for handled messages.
    return true;
  });
})();