scripts/content.js at main · ellioth.co/summarizer-extension

ellioth.co / summarizer-extension
fork atom
A browser extension that lets you summarize any webpage and ask questions using AI.
fork atom
summarizer-extension / scripts / content.js
at main 659 lines 19 kB view raw
wrap content
ellioth.co Rebrand extension to Lede (page summarizer & chat) 1hr ago
94a36526
  1// Content script: Mozilla Readability for articles, legacy DOM walk for feeds / non-articles
  2
  3(function () {
  4  "use strict";
  5
  6  // Prevent multiple injections
  7  if (window.__webaiExtractorInstalled) {
  8    return;
  9  }
 10  window.__webaiExtractorInstalled = true;
 11
 12  const MAX_LENGTH =
 13    typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH
 14      ? CONFIG.EXTRACTION.MAX_LENGTH
 15      : 50000;
 16
 17  /**
 18   * Readability `textContent` length (article body only). Above this, treat as a real article
 19   * extract. Do NOT use total formatted string length — title/metadata can exceed 500 chars alone.
 20   * Kept moderately high so homepages don't classify sponsor blocks as "articles".
 21   */
 22  const READABILITY_STRONG_BODY_CHARS = 560;
 23
 24  /**
 25   * When body is shorter, still trust Readability if the body is a large share of our formatted
 26   * output (not mostly "Title:/Description:" boilerplate from a feed shell).
 27   */
 28  const READABILITY_MIN_BODY_CHARS = 120;
 29
 30  /** Minimum body / formatted-text ratio for the "substantial body" heuristic. */
 31  const READABILITY_MIN_CONTENT_RATIO = 0.34;
 32
 33  /**
 34   * If Readability output is shorter than this, compare with legacy: full-page text often
 35   * dwarfs a homepage "article" grab (e.g. link aggregators).
 36   */
 37  const READABILITY_SHORT_EXTRACT_MAX = 2200;
 38
 39  /**
 40   * When Readability extract is "short", prefer legacy if the DOM walk yields this many times
 41   * more text (typical feed / homepage vs a thin Readability pick).
 42   */
 43  const READABILITY_LEGACY_DOMINANCE_RATIO = 3.25;
 44
 45  /** Prefer legacy when the grab looks like promos / feed chrome, not prose. */
 46  const READABILITY_BIAS_LEGACY_REGEXES = [
 47    /\bSponsor(?:ed)?\s+Posts?\b/i,
 48    /\bPromoted\s+(?:Stories?|Posts?|Content)\b/i,
 49    /\bAdvertiser\s+Content\b/i,
 50    /\bPaid\s+Partnership\b/i,
 51  ];
 52
 53  function readabilityTextBiasLegacy(formattedText) {
 54    return READABILITY_BIAS_LEGACY_REGEXES.some((re) => re.test(formattedText));
 55  }
 56
 57  /**
 58   * Short Readability extract + huge legacy output ⇒ likely listing/homepage, not an article.
 59   * Skip when body is clearly long-form anyway (avoid flipping concise real articles on noisy DOMs).
 60   */
 61  function tryLegacyWhenShortReadOverwhelmed(readability) {
 62    const readLen = readability.text.length;
 63    if (readLen > READABILITY_SHORT_EXTRACT_MAX) {
 64      return null;
 65    }
 66    if (readability.bodyTextLen >= READABILITY_STRONG_BODY_CHARS + 400) {
 67      return null;
 68    }
 69    const legacy = extractLegacy();
 70    if (legacy.text.length > readLen * READABILITY_LEGACY_DOMINANCE_RATIO) {
 71      return legacy;
 72    }
 73    return null;
 74  }
 75
 76  const EXCLUDE_TAGS = [
 77    "script",
 78    "style",
 79    "noscript",
 80    "iframe",
 81    "embed",
 82    "object",
 83    "frame",
 84    "nav",
 85    "aside",
 86    "form",
 87    "button",
 88    "input",
 89  ];
 90
 91  /**
 92   * Subtrees to omit from plain-text collection. `textContent` includes script/style/template
 93   * bodies, which pulls in Astro/React/Vite inline bundles when walking `main div` etc.
 94   * Also skip embed-like tags (iframe/object/embed) so news players do not dump URLs or attrs.
 95   */
 96  const TEXT_SUBTREE_EXCLUDE_TAGS = new Set([
 97    ...EXCLUDE_TAGS,
 98    "template",
 99  ]);
100
101  function shouldExcludeTextSubtree(tagName) {
102    return TEXT_SUBTREE_EXCLUDE_TAGS.has(tagName.toLowerCase());
103  }
104
105  /**
106   * Strip embed markup that appears as literal text in article bodies (CMS/oEmbed fallbacks).
107   * Complements subtree skipping — some sites still surface tags as visible copy.
108   */
109  function sanitizeLiteralEmbedMarkup(text) {
110    if (!text || typeof text !== "string") {
111      return text;
112    }
113    let t = text;
114    t = t.replace(/<iframe\b[\s\S]{0,20000}?<\/iframe>/gi, "\n");
115    t = t.replace(/<iframe\b[^>]{0,8000}\/?>/gi, "\n");
116    t = t.replace(/<\/iframe>/gi, "");
117    t = t.replace(/<embed\b[^>]{0,8000}\/?>/gi, "\n");
118    t = t.replace(/<object\b[\s\S]{0,20000}?<\/object>/gi, "\n");
119    return t;
120  }
121
122  function extractWithReadability() {
123    const documentClone = document.cloneNode(true);
124    const reader = new Readability(documentClone);
125    const article = reader.parse();
126
127    if (!article) {
128      return {
129        text: "",
130        wasTruncated: false,
131        articleNull: true,
132        bodyTextLen: 0,
133      };
134    }
135
136    let extractedText = "";
137
138    if (article.title) {
139      extractedText += `Title: ${article.title}\n\n`;
140    }
141
142    if (article.byline) {
143      extractedText += `Author: ${article.byline}\n\n`;
144    }
145
146    if (article.excerpt && article.excerpt !== article.title) {
147      extractedText += `Description: ${article.excerpt}\n\n`;
148    }
149
150    if (article.publishedTime) {
151      extractedText += `Published: ${article.publishedTime}\n\n`;
152    }
153
154    if (article.siteName) {
155      extractedText += `Source: ${article.siteName}\n\n`;
156    }
157
158    if (extractedText) {
159      extractedText += "---\n\n";
160    }
161
162    let content = article.textContent || "";
163    content = sanitizeLiteralEmbedMarkup(content);
164
165    content = content
166      .replace(/[^\S\n]+/g, " ")
167      .replace(/\n{3,}/g, "\n\n")
168      .replace(/^\s+|\s+$/g, "");
169
170    extractedText += content;
171
172    const bodyTextLen = content.trim().length;
173
174    let wasTruncated = false;
175    if (extractedText.length > MAX_LENGTH) {
176      wasTruncated = true;
177      extractedText = extractedText.substring(0, MAX_LENGTH);
178    }
179
180    return {
181      text: extractedText,
182      wasTruncated,
183      articleNull: false,
184      bodyTextLen,
185    };
186  }
187
188  // --- Legacy extraction (pre-Readability): structured body walk + selector fallback ---
189
190  function extractLegacySelectorsFallback() {
191    const selectors = [
192      "article p",
193      "article div",
194      ".content p",
195      ".content div",
196      ".post-content p",
197      ".entry-content p",
198      ".article-body p",
199      "main p",
200      "main div",
201      '[role="main"] p',
202      ".story p",
203      ".story-body p",
204      "#story p",
205    ];
206
207    let text = "";
208    let wasTruncated = false;
209    const seen = new Set();
210
211    for (const selector of selectors) {
212      try {
213        const elements = document.querySelectorAll(selector);
214        for (const el of elements) {
215          const content = getTextContent(el).trim();
216          if (content.length < 20 || seen.has(content.substring(0, 100))) continue;
217
218          const style = window.getComputedStyle(el);
219          if (style.display === "none" || style.visibility === "hidden") continue;
220
221          seen.add(content.substring(0, 100));
222          text += content + "\n\n";
223
224          if (text.length > MAX_LENGTH) {
225            wasTruncated = true;
226            break;
227          }
228        }
229      } catch (e) {
230        // Ignore invalid selectors
231      }
232      if (wasTruncated) break;
233    }
234
235    if (text.length < 500 && !wasTruncated) {
236      const allParagraphs = document.querySelectorAll("p");
237      for (const p of allParagraphs) {
238        const content = getTextContent(p).trim();
239        if (content.length > 30 && !seen.has(content.substring(0, 100))) {
240          const style = window.getComputedStyle(p);
241          if (style.display === "none" || style.visibility === "hidden") continue;
242
243          seen.add(content.substring(0, 100));
244          text += content + "\n\n";
245
246          if (text.length > MAX_LENGTH) {
247            wasTruncated = true;
248            break;
249          }
250        }
251      }
252    }
253
254    return { text: text.substring(0, MAX_LENGTH), wasTruncated };
255  }
256
257  function shouldSkipElement(el) {
258    const tag = el.tagName.toLowerCase();
259    if (EXCLUDE_TAGS.includes(tag)) {
260      return true;
261    }
262
263    try {
264      const style = window.getComputedStyle(el);
265      if (
266        style.display === "none" ||
267        style.visibility === "hidden" ||
268        style.opacity === "0"
269      ) {
270        return true;
271      }
272    } catch (e) {
273      // ignore
274    }
275
276    if (isMainContent(el)) return false;
277
278    const role = el.getAttribute("role");
279    if (role === "navigation" || role === "banner" || role === "complementary") {
280      return true;
281    }
282
283    let className = "";
284    let id = "";
285
286    if (el.className) {
287      if (typeof el.className === "string") {
288        className = el.className;
289      } else if (el.className.baseVal) {
290        className = el.className.baseVal;
291      }
292    }
293
294    if (el.id) {
295      if (typeof el.id === "string") {
296        id = el.id;
297      } else if (el.id.baseVal) {
298        id = el.id.baseVal;
299      }
300    }
301
302    const classAndId = (className + " " + id).toLowerCase();
303    const strictNoisePatterns = [
304      /^nav$/,
305      /-nav$/,
306      /^nav-/,
307      /^navigation$/,
308      /^footer$/,
309      /-footer$/,
310      /^footer-/,
311      /^header$/,
312      /^site-header$/,
313      /^page-header$/,
314      /^sidebar$/,
315      /^advertisement$/,
316      /^ad-container$/,
317    ];
318    if (strictNoisePatterns.some((p) => p.test(classAndId.trim()))) {
319      return true;
320    }
321
322    return false;
323  }
324
325  function isMainContent(element) {
326    const role = element.getAttribute("role");
327    const tagName = element.tagName.toLowerCase();
328
329    let className = "";
330    let id = "";
331
332    if (element.className) {
333      if (typeof element.className === "string") {
334        className = element.className.toLowerCase();
335      } else if (element.className.baseVal) {
336        className = element.className.baseVal.toLowerCase();
337      }
338    }
339
340    if (element.id) {
341      if (typeof element.id === "string") {
342        id = element.id.toLowerCase();
343      } else if (element.id.baseVal) {
344        id = element.id.baseVal.toLowerCase();
345      }
346    }
347
348    const contentPatterns = [
349      "content",
350      "main-content",
351      "article-content",
352      "post-content",
353      "entry-content",
354      "page-content",
355      "story-content",
356      "body-content",
357      "article",
358      "post",
359      "entry",
360      "story",
361      "main",
362    ];
363
364    const isContentClass = contentPatterns.some(
365      (p) => className.includes(p) || id.includes(p),
366    );
367
368    return (
369      role === "main" ||
370      role === "article" ||
371      tagName === "main" ||
372      tagName === "article" ||
373      isContentClass
374    );
375  }
376
377  function extractTextFromElement(element, depth = 0) {
378    let text = "";
379    const indent = "  ".repeat(depth);
380
381    const directText = getDirectTextContent(element).trim();
382    if (directText.length > 20 && depth > 0) {
383      text += directText + "\n\n";
384    }
385
386    for (const child of element.children) {
387      const childTag = child.tagName.toLowerCase();
388
389      if (shouldSkipElement(child)) continue;
390
391      if (/^h[1-6]$/.test(childTag)) {
392        const headingText = getTextContent(child).trim();
393        if (headingText) {
394          const prefix = "#".repeat(parseInt(childTag[1], 10));
395          text += `\n${prefix} ${headingText}\n\n`;
396        }
397      } else if (childTag === "p") {
398        const pText = getTextContent(child).trim();
399        if (pText.length > 5) {
400          text += `${pText}\n\n`;
401        }
402      } else if (childTag === "li") {
403        const liText = getTextContent(child).trim();
404        if (liText) {
405          text += `${indent}- ${liText}\n`;
406        }
407      } else if (childTag === "pre" || childTag === "code") {
408        const codeText = getTextContent(child).trim();
409        if (codeText) {
410          text += `\n\`\`\`\n${codeText}\n\`\`\`\n\n`;
411        }
412      } else {
413        const childText = extractTextFromElement(child, depth + 1);
414        if (childText.trim()) {
415          text += childText;
416        }
417      }
418    }
419
420    return text;
421  }
422
423  function getDirectTextContent(element) {
424    let text = "";
425    for (const node of element.childNodes) {
426      if (node.nodeType === Node.TEXT_NODE) {
427        text += node.textContent;
428      }
429    }
430    return text.trim();
431  }
432
433  function getTextContent(element) {
434    let text = "";
435
436    for (const node of element.childNodes) {
437      if (node.nodeType === Node.TEXT_NODE) {
438        text += node.textContent;
439      } else if (node.nodeType === Node.ELEMENT_NODE) {
440        const tagName = node.tagName.toLowerCase();
441
442        if (shouldExcludeTextSubtree(tagName)) {
443          continue;
444        }
445
446        if (["br", "p", "div", "li"].includes(tagName)) {
447          text += " " + getTextContent(node) + " ";
448        } else {
449          text += getTextContent(node);
450        }
451      }
452    }
453
454    return text;
455  }
456
457  function cleanExtractedText(text, shouldTruncate = true) {
458    let cleaned = sanitizeLiteralEmbedMarkup(text);
459    cleaned = cleaned
460      .replace(/[^\S\n]+/g, " ")
461      .replace(/\n{3,}/g, "\n\n")
462      .replace(/^\s+|\s+$/g, "");
463
464    if (shouldTruncate && cleaned.length > MAX_LENGTH) {
465      cleaned = cleaned.substring(0, MAX_LENGTH);
466    }
467
468    return cleaned;
469  }
470
471  /**
472   * Full-document extraction used before Readability. Works well for feeds, homepages,
473   * and app-like pages where Readability returns nothing or very little.
474   */
475  function extractLegacy() {
476    if (!document.body) {
477      return { text: document.title || "", wasTruncated: false };
478    }
479
480    let extractedText = "";
481    let wasTruncated = false;
482
483    const title = document.title || "";
484    if (title) {
485      extractedText += `Title: ${title}\n\n`;
486    }
487
488    const metaDesc = document.querySelector('meta[name="description"]');
489    if (metaDesc) {
490      const desc = metaDesc.getAttribute("content");
491      if (desc) {
492        extractedText += `Description: ${desc}\n\n`;
493      }
494    }
495
496    extractedText += extractTextFromElement(document.body);
497
498    extractedText = cleanExtractedText(extractedText, false);
499
500    if (extractedText.length > MAX_LENGTH) {
501      wasTruncated = true;
502      extractedText = extractedText.substring(0, MAX_LENGTH);
503    }
504
505    if (extractedText.length < 1000) {
506      const fallbackResult = extractLegacySelectorsFallback();
507      if (fallbackResult.text.length > extractedText.length) {
508        extractedText = `Title: ${title}\n\n${fallbackResult.text}`;
509        wasTruncated = fallbackResult.wasTruncated;
510      }
511    }
512
513    return { text: extractedText, wasTruncated };
514  }
515
516  /** Legacy debug — uncomment body to log to the tab's DevTools (page context). */
517  function logExtractionDebug(source, text) {
518    // console.log("[Lede DEBUG] extraction:", source);
519    // console.log(
520    //   "[Lede DEBUG] raw extracted text (" + text.length + " chars):",
521    //   text,
522    // );
523  }
524
525  function tryRedditShredditExtract() {
526    if (typeof window.__webaiTryRedditNew !== "function") {
527      return null;
528    }
529    try {
530      return window.__webaiTryRedditNew(MAX_LENGTH);
531    } catch (err) {
532      console.error("[Lede] Reddit extract error:", err);
533      return null;
534    }
535  }
536
537  function extractContent() {
538    const redditResult = tryRedditShredditExtract();
539    if (redditResult) {
540      logExtractionDebug(redditResult.extractionSource, redditResult.text);
541      return {
542        text: redditResult.text,
543        wasTruncated: Boolean(redditResult.wasTruncated),
544        extractionSource: redditResult.extractionSource,
545        unsupportedReason: redditResult.unsupportedReason,
546      };
547    }
548
549    try {
550      const readability = extractWithReadability();
551      const rbLen = readability.text.trim().length;
552
553      if (readability.articleNull || rbLen === 0) {
554        const legacy = extractLegacy();
555        logExtractionDebug("legacy", legacy.text);
556        return { ...legacy, extractionSource: "legacy" };
557      }
558
559      const bodyLen = readability.bodyTextLen;
560      const contentRatio =
561        bodyLen / Math.max(readability.text.length, 1);
562
563      const biasedToLegacy = readabilityTextBiasLegacy(readability.text);
564
565      const strongArticleBody =
566        bodyLen >= READABILITY_STRONG_BODY_CHARS && !biasedToLegacy;
567      const ratioLooksLikeArticle =
568        bodyLen >= READABILITY_MIN_BODY_CHARS &&
569        contentRatio >= READABILITY_MIN_CONTENT_RATIO &&
570        !biasedToLegacy;
571
572      // Long article body — Readability clearly won (unless sponsor/promo bias sent us to legacy).
573      if (strongArticleBody) {
574        const flip = tryLegacyWhenShortReadOverwhelmed(readability);
575        if (flip) {
576          logExtractionDebug("legacy", flip.text);
577          return { text: flip.text, wasTruncated: flip.wasTruncated, extractionSource: "legacy" };
578        }
579        logExtractionDebug("readability", readability.text);
580        return {
581          text: readability.text,
582          wasTruncated: readability.wasTruncated,
583          extractionSource: "readability",
584        };
585      }
586
587      // Short pages / stubs: body is still most of what we output (not metadata padding).
588      if (ratioLooksLikeArticle) {
589        const flip = tryLegacyWhenShortReadOverwhelmed(readability);
590        if (flip) {
591          logExtractionDebug("legacy", flip.text);
592          return { text: flip.text, wasTruncated: flip.wasTruncated, extractionSource: "legacy" };
593        }
594        logExtractionDebug("readability", readability.text);
595        return {
596          text: readability.text,
597          wasTruncated: readability.wasTruncated,
598          extractionSource: "readability",
599        };
600      }
601
602      // Marginal Readability (feed hero, cookie copy, tiny grab): compare with legacy.
603      const legacy = extractLegacy();
604      const legLen = legacy.text.length;
605      const readLen = readability.text.length;
606
607      if (bodyLen < READABILITY_MIN_BODY_CHARS) {
608        const useLegacy = legLen > readLen;
609        const picked = useLegacy ? legacy : readability;
610        logExtractionDebug(useLegacy ? "legacy" : "readability", picked.text);
611        return {
612          text: picked.text,
613          wasTruncated: picked.wasTruncated,
614          extractionSource: useLegacy ? "legacy" : "readability",
615        };
616      }
617
618      // Body exists but is mostly not "article-like" vs full page (low ratio already failed).
619      if (
620        contentRatio < READABILITY_MIN_CONTENT_RATIO &&
621        legLen > readLen * 1.4
622      ) {
623        logExtractionDebug("legacy", legacy.text);
624        return {
625          text: legacy.text,
626          wasTruncated: legacy.wasTruncated,
627          extractionSource: "legacy",
628        };
629      }
630
631      const useLegacy = legLen > readLen;
632      const picked = useLegacy ? legacy : readability;
633      logExtractionDebug(useLegacy ? "legacy" : "readability", picked.text);
634      return {
635        text: picked.text,
636        wasTruncated: picked.wasTruncated,
637        extractionSource: useLegacy ? "legacy" : "readability",
638      };
639    } catch (error) {
640      console.error("[Lede] Readability error:", error);
641      const legacy = extractLegacy();
642      logExtractionDebug("legacy", legacy.text);
643      return { ...legacy, extractionSource: "legacy" };
644    }
645  }
646
647  chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
648    if (request.action === "extract") {
649      const result = extractContent();
650      sendResponse({
651        content: result.text,
652        wasTruncated: result.wasTruncated,
653        extractionSource: result.extractionSource,
654        unsupportedReason: result.unsupportedReason,
655      });
656    }
657    return true;
658  });
659})();