// A browser extension that lets you summarize any webpage and ask questions using AI.
// Content script: Mozilla Readability for articles, legacy DOM walk for feeds / non-articles

(function () {
  "use strict";

  // Prevent multiple injections of this content script into the same page.
  if (window.__webaiExtractorInstalled) {
    return;
  }
  window.__webaiExtractorInstalled = true;

  // Hard cap on extracted text; 50k chars unless overridden by CONFIG.EXTRACTION.MAX_LENGTH
  // (CONFIG is presumably a global provided by another content script — confirm load order).
  const MAX_LENGTH =
    typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH
      ? CONFIG.EXTRACTION.MAX_LENGTH
      : 50000;
16
  /**
   * Readability `textContent` length (article body only). Above this, treat as a real article
   * extract. Do NOT use total formatted string length — title/metadata can exceed 500 chars alone.
   * Kept moderately high so homepages don't classify sponsor blocks as "articles".
   */
  const READABILITY_STRONG_BODY_CHARS = 560;

  /**
   * When body is shorter, still trust Readability if the body is a large share of our formatted
   * output (not mostly "Title:/Description:" boilerplate from a feed shell).
   */
  const READABILITY_MIN_BODY_CHARS = 120;

  /** Minimum body / formatted-text ratio for the "substantial body" heuristic. */
  const READABILITY_MIN_CONTENT_RATIO = 0.34;

  /**
   * If Readability output is shorter than this, compare with legacy: full-page text often
   * dwarfs a homepage "article" grab (e.g. link aggregators).
   */
  const READABILITY_SHORT_EXTRACT_MAX = 2200;

  /**
   * When Readability extract is "short", prefer legacy if the DOM walk yields this many times
   * more text (typical feed / homepage vs a thin Readability pick).
   */
  const READABILITY_LEGACY_DOMINANCE_RATIO = 3.25;

  /** Prefer legacy when the grab looks like promos / feed chrome, not prose (case-insensitive). */
  const READABILITY_BIAS_LEGACY_REGEXES = [
    /\bSponsor(?:ed)?\s+Posts?\b/i,
    /\bPromoted\s+(?:Stories?|Posts?|Content)\b/i,
    /\bAdvertiser\s+Content\b/i,
    /\bPaid\s+Partnership\b/i,
  ];
52
53 function readabilityTextBiasLegacy(formattedText) {
54 return READABILITY_BIAS_LEGACY_REGEXES.some((re) => re.test(formattedText));
55 }
56
57 /**
58 * Short Readability extract + huge legacy output ⇒ likely listing/homepage, not an article.
59 * Skip when body is clearly long-form anyway (avoid flipping concise real articles on noisy DOMs).
60 */
61 function tryLegacyWhenShortReadOverwhelmed(readability) {
62 const readLen = readability.text.length;
63 if (readLen > READABILITY_SHORT_EXTRACT_MAX) {
64 return null;
65 }
66 if (readability.bodyTextLen >= READABILITY_STRONG_BODY_CHARS + 400) {
67 return null;
68 }
69 const legacy = extractLegacy();
70 if (legacy.text.length > readLen * READABILITY_LEGACY_DOMINANCE_RATIO) {
71 return legacy;
72 }
73 return null;
74 }
75
  // Tags the legacy extractors never read text from: non-content containers
  // (script/style/noscript), embeds (iframe/embed/object/frame), and page chrome
  // or interactive controls (nav/aside/form/button/input).
  const EXCLUDE_TAGS = [
    "script",
    "style",
    "noscript",
    "iframe",
    "embed",
    "object",
    "frame",
    "nav",
    "aside",
    "form",
    "button",
    "input",
  ];

  /**
   * Subtrees to omit from plain-text collection. `textContent` includes script/style/template
   * bodies, which pulls in Astro/React/Vite inline bundles when walking `main div` etc.
   * Also skip embed-like tags (iframe/object/embed) so news players do not dump URLs or attrs.
   */
  const TEXT_SUBTREE_EXCLUDE_TAGS = new Set([
    ...EXCLUDE_TAGS,
    "template",
  ]);
100
101 function shouldExcludeTextSubtree(tagName) {
102 return TEXT_SUBTREE_EXCLUDE_TAGS.has(tagName.toLowerCase());
103 }
104
105 /**
106 * Strip embed markup that appears as literal text in article bodies (CMS/oEmbed fallbacks).
107 * Complements subtree skipping — some sites still surface tags as visible copy.
108 */
109 function sanitizeLiteralEmbedMarkup(text) {
110 if (!text || typeof text !== "string") {
111 return text;
112 }
113 let t = text;
114 t = t.replace(/<iframe\b[\s\S]{0,20000}?<\/iframe>/gi, "\n");
115 t = t.replace(/<iframe\b[^>]{0,8000}\/?>/gi, "\n");
116 t = t.replace(/<\/iframe>/gi, "");
117 t = t.replace(/<embed\b[^>]{0,8000}\/?>/gi, "\n");
118 t = t.replace(/<object\b[\s\S]{0,20000}?<\/object>/gi, "\n");
119 return t;
120 }
121
122 function extractWithReadability() {
123 const documentClone = document.cloneNode(true);
124 const reader = new Readability(documentClone);
125 const article = reader.parse();
126
127 if (!article) {
128 return {
129 text: "",
130 wasTruncated: false,
131 articleNull: true,
132 bodyTextLen: 0,
133 };
134 }
135
136 let extractedText = "";
137
138 if (article.title) {
139 extractedText += `Title: ${article.title}\n\n`;
140 }
141
142 if (article.byline) {
143 extractedText += `Author: ${article.byline}\n\n`;
144 }
145
146 if (article.excerpt && article.excerpt !== article.title) {
147 extractedText += `Description: ${article.excerpt}\n\n`;
148 }
149
150 if (article.publishedTime) {
151 extractedText += `Published: ${article.publishedTime}\n\n`;
152 }
153
154 if (article.siteName) {
155 extractedText += `Source: ${article.siteName}\n\n`;
156 }
157
158 if (extractedText) {
159 extractedText += "---\n\n";
160 }
161
162 let content = article.textContent || "";
163 content = sanitizeLiteralEmbedMarkup(content);
164
165 content = content
166 .replace(/[^\S\n]+/g, " ")
167 .replace(/\n{3,}/g, "\n\n")
168 .replace(/^\s+|\s+$/g, "");
169
170 extractedText += content;
171
172 const bodyTextLen = content.trim().length;
173
174 let wasTruncated = false;
175 if (extractedText.length > MAX_LENGTH) {
176 wasTruncated = true;
177 extractedText = extractedText.substring(0, MAX_LENGTH);
178 }
179
180 return {
181 text: extractedText,
182 wasTruncated,
183 articleNull: false,
184 bodyTextLen,
185 };
186 }
187
  // --- Legacy extraction (pre-Readability): structured body walk + selector fallback ---

  /**
   * Fallback harvest for thin pages: query common article/content selectors and collect
   * visible, de-duplicated text blocks. If that still yields under 500 chars, sweep every
   * <p> on the page. Returns { text, wasTruncated } with text capped at MAX_LENGTH.
   */
  function extractLegacySelectorsFallback() {
    // Ordered roughly from most to least specific content containers.
    const selectors = [
      "article p",
      "article div",
      ".content p",
      ".content div",
      ".post-content p",
      ".entry-content p",
      ".article-body p",
      "main p",
      "main div",
      '[role="main"] p',
      ".story p",
      ".story-body p",
      "#story p",
    ];

    let text = "";
    let wasTruncated = false;
    // Near-duplicate filter keyed on each block's first 100 characters.
    const seen = new Set();

    for (const selector of selectors) {
      try {
        const elements = document.querySelectorAll(selector);
        for (const el of elements) {
          const content = getTextContent(el).trim();
          // Skip trivial fragments and blocks already collected via an earlier selector.
          if (content.length < 20 || seen.has(content.substring(0, 100))) continue;

          // Skip CSS-hidden elements (display/visibility only; opacity not checked here).
          const style = window.getComputedStyle(el);
          if (style.display === "none" || style.visibility === "hidden") continue;

          seen.add(content.substring(0, 100));
          text += content + "\n\n";

          if (text.length > MAX_LENGTH) {
            wasTruncated = true;
            break;
          }
        }
      } catch (e) {
        // Ignore invalid selectors
      }
      if (wasTruncated) break;
    }

    // Last resort: harvest all paragraphs when the selector pass found little.
    if (text.length < 500 && !wasTruncated) {
      const allParagraphs = document.querySelectorAll("p");
      for (const p of allParagraphs) {
        const content = getTextContent(p).trim();
        // Slightly higher threshold (30 chars) than the selector pass, to cut nav blurbs.
        if (content.length > 30 && !seen.has(content.substring(0, 100))) {
          const style = window.getComputedStyle(p);
          if (style.display === "none" || style.visibility === "hidden") continue;

          seen.add(content.substring(0, 100));
          text += content + "\n\n";

          if (text.length > MAX_LENGTH) {
            wasTruncated = true;
            break;
          }
        }
      }
    }

    return { text: text.substring(0, MAX_LENGTH), wasTruncated };
  }
256
257 function shouldSkipElement(el) {
258 const tag = el.tagName.toLowerCase();
259 if (EXCLUDE_TAGS.includes(tag)) {
260 return true;
261 }
262
263 try {
264 const style = window.getComputedStyle(el);
265 if (
266 style.display === "none" ||
267 style.visibility === "hidden" ||
268 style.opacity === "0"
269 ) {
270 return true;
271 }
272 } catch (e) {
273 // ignore
274 }
275
276 if (isMainContent(el)) return false;
277
278 const role = el.getAttribute("role");
279 if (role === "navigation" || role === "banner" || role === "complementary") {
280 return true;
281 }
282
283 let className = "";
284 let id = "";
285
286 if (el.className) {
287 if (typeof el.className === "string") {
288 className = el.className;
289 } else if (el.className.baseVal) {
290 className = el.className.baseVal;
291 }
292 }
293
294 if (el.id) {
295 if (typeof el.id === "string") {
296 id = el.id;
297 } else if (el.id.baseVal) {
298 id = el.id.baseVal;
299 }
300 }
301
302 const classAndId = (className + " " + id).toLowerCase();
303 const strictNoisePatterns = [
304 /^nav$/,
305 /-nav$/,
306 /^nav-/,
307 /^navigation$/,
308 /^footer$/,
309 /-footer$/,
310 /^footer-/,
311 /^header$/,
312 /^site-header$/,
313 /^page-header$/,
314 /^sidebar$/,
315 /^advertisement$/,
316 /^ad-container$/,
317 ];
318 if (strictNoisePatterns.some((p) => p.test(classAndId.trim()))) {
319 return true;
320 }
321
322 return false;
323 }
324
325 function isMainContent(element) {
326 const role = element.getAttribute("role");
327 const tagName = element.tagName.toLowerCase();
328
329 let className = "";
330 let id = "";
331
332 if (element.className) {
333 if (typeof element.className === "string") {
334 className = element.className.toLowerCase();
335 } else if (element.className.baseVal) {
336 className = element.className.baseVal.toLowerCase();
337 }
338 }
339
340 if (element.id) {
341 if (typeof element.id === "string") {
342 id = element.id.toLowerCase();
343 } else if (element.id.baseVal) {
344 id = element.id.baseVal.toLowerCase();
345 }
346 }
347
348 const contentPatterns = [
349 "content",
350 "main-content",
351 "article-content",
352 "post-content",
353 "entry-content",
354 "page-content",
355 "story-content",
356 "body-content",
357 "article",
358 "post",
359 "entry",
360 "story",
361 "main",
362 ];
363
364 const isContentClass = contentPatterns.some(
365 (p) => className.includes(p) || id.includes(p),
366 );
367
368 return (
369 role === "main" ||
370 role === "article" ||
371 tagName === "main" ||
372 tagName === "article" ||
373 isContentClass
374 );
375 }
376
377 function extractTextFromElement(element, depth = 0) {
378 let text = "";
379 const indent = " ".repeat(depth);
380
381 const directText = getDirectTextContent(element).trim();
382 if (directText.length > 20 && depth > 0) {
383 text += directText + "\n\n";
384 }
385
386 for (const child of element.children) {
387 const childTag = child.tagName.toLowerCase();
388
389 if (shouldSkipElement(child)) continue;
390
391 if (/^h[1-6]$/.test(childTag)) {
392 const headingText = getTextContent(child).trim();
393 if (headingText) {
394 const prefix = "#".repeat(parseInt(childTag[1], 10));
395 text += `\n${prefix} ${headingText}\n\n`;
396 }
397 } else if (childTag === "p") {
398 const pText = getTextContent(child).trim();
399 if (pText.length > 5) {
400 text += `${pText}\n\n`;
401 }
402 } else if (childTag === "li") {
403 const liText = getTextContent(child).trim();
404 if (liText) {
405 text += `${indent}- ${liText}\n`;
406 }
407 } else if (childTag === "pre" || childTag === "code") {
408 const codeText = getTextContent(child).trim();
409 if (codeText) {
410 text += `\n\`\`\`\n${codeText}\n\`\`\`\n\n`;
411 }
412 } else {
413 const childText = extractTextFromElement(child, depth + 1);
414 if (childText.trim()) {
415 text += childText;
416 }
417 }
418 }
419
420 return text;
421 }
422
423 function getDirectTextContent(element) {
424 let text = "";
425 for (const node of element.childNodes) {
426 if (node.nodeType === Node.TEXT_NODE) {
427 text += node.textContent;
428 }
429 }
430 return text.trim();
431 }
432
433 function getTextContent(element) {
434 let text = "";
435
436 for (const node of element.childNodes) {
437 if (node.nodeType === Node.TEXT_NODE) {
438 text += node.textContent;
439 } else if (node.nodeType === Node.ELEMENT_NODE) {
440 const tagName = node.tagName.toLowerCase();
441
442 if (shouldExcludeTextSubtree(tagName)) {
443 continue;
444 }
445
446 if (["br", "p", "div", "li"].includes(tagName)) {
447 text += " " + getTextContent(node) + " ";
448 } else {
449 text += getTextContent(node);
450 }
451 }
452 }
453
454 return text;
455 }
456
457 function cleanExtractedText(text, shouldTruncate = true) {
458 let cleaned = sanitizeLiteralEmbedMarkup(text);
459 cleaned = cleaned
460 .replace(/[^\S\n]+/g, " ")
461 .replace(/\n{3,}/g, "\n\n")
462 .replace(/^\s+|\s+$/g, "");
463
464 if (shouldTruncate && cleaned.length > MAX_LENGTH) {
465 cleaned = cleaned.substring(0, MAX_LENGTH);
466 }
467
468 return cleaned;
469 }
470
471 /**
472 * Full-document extraction used before Readability. Works well for feeds, homepages,
473 * and app-like pages where Readability returns nothing or very little.
474 */
475 function extractLegacy() {
476 if (!document.body) {
477 return { text: document.title || "", wasTruncated: false };
478 }
479
480 let extractedText = "";
481 let wasTruncated = false;
482
483 const title = document.title || "";
484 if (title) {
485 extractedText += `Title: ${title}\n\n`;
486 }
487
488 const metaDesc = document.querySelector('meta[name="description"]');
489 if (metaDesc) {
490 const desc = metaDesc.getAttribute("content");
491 if (desc) {
492 extractedText += `Description: ${desc}\n\n`;
493 }
494 }
495
496 extractedText += extractTextFromElement(document.body);
497
498 extractedText = cleanExtractedText(extractedText, false);
499
500 if (extractedText.length > MAX_LENGTH) {
501 wasTruncated = true;
502 extractedText = extractedText.substring(0, MAX_LENGTH);
503 }
504
505 if (extractedText.length < 1000) {
506 const fallbackResult = extractLegacySelectorsFallback();
507 if (fallbackResult.text.length > extractedText.length) {
508 extractedText = `Title: ${title}\n\n${fallbackResult.text}`;
509 wasTruncated = fallbackResult.wasTruncated;
510 }
511 }
512
513 return { text: extractedText, wasTruncated };
514 }
515
516 /** Legacy debug — uncomment body to log to the tab's DevTools (page context). */
517 function logExtractionDebug(source, text) {
518 // console.log("[Lede DEBUG] extraction:", source);
519 // console.log(
520 // "[Lede DEBUG] raw extracted text (" + text.length + " chars):",
521 // text,
522 // );
523 }
524
525 function tryRedditShredditExtract() {
526 if (typeof window.__webaiTryRedditNew !== "function") {
527 return null;
528 }
529 try {
530 return window.__webaiTryRedditNew(MAX_LENGTH);
531 } catch (err) {
532 console.error("[Lede] Reddit extract error:", err);
533 return null;
534 }
535 }
536
537 function extractContent() {
538 const redditResult = tryRedditShredditExtract();
539 if (redditResult) {
540 logExtractionDebug(redditResult.extractionSource, redditResult.text);
541 return {
542 text: redditResult.text,
543 wasTruncated: Boolean(redditResult.wasTruncated),
544 extractionSource: redditResult.extractionSource,
545 unsupportedReason: redditResult.unsupportedReason,
546 };
547 }
548
549 try {
550 const readability = extractWithReadability();
551 const rbLen = readability.text.trim().length;
552
553 if (readability.articleNull || rbLen === 0) {
554 const legacy = extractLegacy();
555 logExtractionDebug("legacy", legacy.text);
556 return { ...legacy, extractionSource: "legacy" };
557 }
558
559 const bodyLen = readability.bodyTextLen;
560 const contentRatio =
561 bodyLen / Math.max(readability.text.length, 1);
562
563 const biasedToLegacy = readabilityTextBiasLegacy(readability.text);
564
565 const strongArticleBody =
566 bodyLen >= READABILITY_STRONG_BODY_CHARS && !biasedToLegacy;
567 const ratioLooksLikeArticle =
568 bodyLen >= READABILITY_MIN_BODY_CHARS &&
569 contentRatio >= READABILITY_MIN_CONTENT_RATIO &&
570 !biasedToLegacy;
571
572 // Long article body — Readability clearly won (unless sponsor/promo bias sent us to legacy).
573 if (strongArticleBody) {
574 const flip = tryLegacyWhenShortReadOverwhelmed(readability);
575 if (flip) {
576 logExtractionDebug("legacy", flip.text);
577 return { text: flip.text, wasTruncated: flip.wasTruncated, extractionSource: "legacy" };
578 }
579 logExtractionDebug("readability", readability.text);
580 return {
581 text: readability.text,
582 wasTruncated: readability.wasTruncated,
583 extractionSource: "readability",
584 };
585 }
586
587 // Short pages / stubs: body is still most of what we output (not metadata padding).
588 if (ratioLooksLikeArticle) {
589 const flip = tryLegacyWhenShortReadOverwhelmed(readability);
590 if (flip) {
591 logExtractionDebug("legacy", flip.text);
592 return { text: flip.text, wasTruncated: flip.wasTruncated, extractionSource: "legacy" };
593 }
594 logExtractionDebug("readability", readability.text);
595 return {
596 text: readability.text,
597 wasTruncated: readability.wasTruncated,
598 extractionSource: "readability",
599 };
600 }
601
602 // Marginal Readability (feed hero, cookie copy, tiny grab): compare with legacy.
603 const legacy = extractLegacy();
604 const legLen = legacy.text.length;
605 const readLen = readability.text.length;
606
607 if (bodyLen < READABILITY_MIN_BODY_CHARS) {
608 const useLegacy = legLen > readLen;
609 const picked = useLegacy ? legacy : readability;
610 logExtractionDebug(useLegacy ? "legacy" : "readability", picked.text);
611 return {
612 text: picked.text,
613 wasTruncated: picked.wasTruncated,
614 extractionSource: useLegacy ? "legacy" : "readability",
615 };
616 }
617
618 // Body exists but is mostly not "article-like" vs full page (low ratio already failed).
619 if (
620 contentRatio < READABILITY_MIN_CONTENT_RATIO &&
621 legLen > readLen * 1.4
622 ) {
623 logExtractionDebug("legacy", legacy.text);
624 return {
625 text: legacy.text,
626 wasTruncated: legacy.wasTruncated,
627 extractionSource: "legacy",
628 };
629 }
630
631 const useLegacy = legLen > readLen;
632 const picked = useLegacy ? legacy : readability;
633 logExtractionDebug(useLegacy ? "legacy" : "readability", picked.text);
634 return {
635 text: picked.text,
636 wasTruncated: picked.wasTruncated,
637 extractionSource: useLegacy ? "legacy" : "readability",
638 };
639 } catch (error) {
640 console.error("[Lede] Readability error:", error);
641 const legacy = extractLegacy();
642 logExtractionDebug("legacy", legacy.text);
643 return { ...legacy, extractionSource: "legacy" };
644 }
645 }
646
  // Message entry point: the extension (popup/background) sends { action: "extract" }
  // and receives the extracted page content plus metadata about how it was produced.
  chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    if (request.action === "extract") {
      // Extraction is synchronous here, so sendResponse fires before the listener returns.
      const result = extractContent();
      sendResponse({
        content: result.text,
        wasTruncated: result.wasTruncated,
        extractionSource: result.extractionSource,
        unsupportedReason: result.unsupportedReason,
      });
    }
    // `true` keeps the sendResponse channel open for async replies — not strictly needed
    // for the synchronous path above, and harmless for unhandled actions.
    return true;
  });
})();
659})();