// Content script: Mozilla Readability for articles, legacy DOM walk for feeds / non-articles
(function () {
  "use strict";

  // Prevent multiple injections
  if (window.__webaiExtractorInstalled) {
    return;
  }
  window.__webaiExtractorInstalled = true;

  /** Maximum number of characters returned to the caller (configurable via CONFIG). */
  const MAX_LENGTH =
    typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH
      ? CONFIG.EXTRACTION.MAX_LENGTH
      : 50000;

  /**
   * Readability `textContent` length (article body only). Above this, treat as a real article
   * extract. Do NOT use total formatted string length — title/metadata can exceed 500 chars alone.
   * Kept moderately high so homepages don't classify sponsor blocks as "articles".
   */
  const READABILITY_STRONG_BODY_CHARS = 560;

  /**
   * When body is shorter, still trust Readability if the body is a large share of our formatted
   * output (not mostly "Title:/Description:" boilerplate from a feed shell).
   */
  const READABILITY_MIN_BODY_CHARS = 120;

  /** Minimum body / formatted-text ratio for the "substantial body" heuristic. */
  const READABILITY_MIN_CONTENT_RATIO = 0.34;

  /**
   * If Readability output is shorter than this, compare with legacy: full-page text often
   * dwarfs a homepage "article" grab (e.g. link aggregators).
   */
  const READABILITY_SHORT_EXTRACT_MAX = 2200;

  /**
   * When Readability extract is "short", prefer legacy if the DOM walk yields this many times
   * more text (typical feed / homepage vs a thin Readability pick).
   */
  const READABILITY_LEGACY_DOMINANCE_RATIO = 3.25;

  /** Prefer legacy when the grab looks like promos / feed chrome, not prose. */
  const READABILITY_BIAS_LEGACY_REGEXES = [
    /\bSponsor(?:ed)?\s+Posts?\b/i,
    /\bPromoted\s+(?:Stories?|Posts?|Content)\b/i,
    /\bAdvertiser\s+Content\b/i,
    /\bPaid\s+Partnership\b/i,
  ];

  /**
   * @param {string} formattedText Formatted Readability output (metadata header + body).
   * @returns {boolean} True when any promo/sponsor pattern occurs in the text.
   */
  function readabilityTextBiasLegacy(formattedText) {
    return READABILITY_BIAS_LEGACY_REGEXES.some((re) => re.test(formattedText));
  }

  /**
   * Short Readability extract + huge legacy output ⇒ likely listing/homepage, not an article.
   * Skip when body is clearly long-form anyway (avoid flipping concise real articles on noisy DOMs).
   *
   * @param {{text: string, bodyTextLen: number}} readability Result of extractWithReadability().
   * @returns {{text: string, wasTruncated: boolean} | null} The legacy result when it should
   *   replace the Readability extract, otherwise null.
   */
  function tryLegacyWhenShortReadOverwhelmed(readability) {
    const readLen = readability.text.length;
    if (readLen > READABILITY_SHORT_EXTRACT_MAX) {
      return null;
    }
    if (readability.bodyTextLen >= READABILITY_STRONG_BODY_CHARS + 400) {
      return null;
    }
    const legacy = extractLegacy();
    if (legacy.text.length > readLen * READABILITY_LEGACY_DOMINANCE_RATIO) {
      return legacy;
    }
    return null;
  }

  /** Tags whose elements are skipped entirely by the legacy DOM walk. */
  const EXCLUDE_TAGS = [
    "script",
    "style",
    "noscript",
    "iframe",
    "embed",
    "object",
    "frame",
    "nav",
    "aside",
    "form",
    "button",
    "input",
  ];

  /**
   * Subtrees to omit from plain-text collection. `textContent` includes script/style/template
   * bodies, which pulls in Astro/React/Vite inline bundles when walking `main div` etc.
   * Also skip embed-like tags (iframe/object/embed) so news players do not dump URLs or attrs.
   */
  const TEXT_SUBTREE_EXCLUDE_TAGS = new Set([...EXCLUDE_TAGS, "template"]);

  /** Tags whose text is padded with spaces so adjacent blocks do not run together. */
  const SPACE_PADDED_TAGS = new Set(["br", "p", "div", "li"]);

  /**
   * @param {string} tagName Element tag name in any case.
   * @returns {boolean} True when the element's subtree must not contribute text.
   */
  function shouldExcludeTextSubtree(tagName) {
    return TEXT_SUBTREE_EXCLUDE_TAGS.has(tagName.toLowerCase());
  }

  /**
   * Strip embed markup that appears as literal text in article bodies (CMS/oEmbed fallbacks).
   * Complements subtree skipping — some sites still surface tags as visible copy.
   *
   * Non-string or empty input is returned unchanged.
   *
   * @param {*} text Text to clean.
   * @returns {*} The text with literal iframe/embed/object markup removed.
   */
  function sanitizeLiteralEmbedMarkup(text) {
    if (!text || typeof text !== "string") {
      return text;
    }
    let t = text;
    // Paired <iframe ...>...</iframe> blocks first, then any unpaired opening/closing tags.
    t = t.replace(/<iframe\b[\s\S]*?<\/iframe>/gi, "\n");
    t = t.replace(/<iframe\b[^>]{0,8000}\/?>/gi, "\n");
    t = t.replace(/<\/iframe>/gi, "");
    // Attribute length is bounded so one stray "<embed" cannot swallow the rest of the text.
    t = t.replace(/<embed\b[^>]{0,8000}\/?>/gi, "\n");
    t = t.replace(/<object\b[\s\S]*?<\/object>/gi, "\n");
    return t;
  }

  /**
   * Run Mozilla Readability on a clone of the document and format the result as a metadata
   * header ("Title:", "Author:", ...) followed by the normalized article body.
   *
   * @returns {{text: string, wasTruncated: boolean, articleNull: boolean, bodyTextLen: number}}
   *   `articleNull` is true when Readability produced no article; `bodyTextLen` is the length of
   *   the body alone (metadata header excluded, measured before truncation).
   */
  function extractWithReadability() {
    // Readability is handed a clone rather than the live document.
    const documentClone = document.cloneNode(true);
    const reader = new Readability(documentClone);
    const article = reader.parse();

    if (!article) {
      return {
        text: "",
        wasTruncated: false,
        articleNull: true,
        bodyTextLen: 0,
      };
    }

    let extractedText = "";
    if (article.title) {
      extractedText += `Title: ${article.title}\n\n`;
    }
    if (article.byline) {
      extractedText += `Author: ${article.byline}\n\n`;
    }
    if (article.excerpt && article.excerpt !== article.title) {
      extractedText += `Description: ${article.excerpt}\n\n`;
    }
    if (article.publishedTime) {
      extractedText += `Published: ${article.publishedTime}\n\n`;
    }
    if (article.siteName) {
      extractedText += `Source: ${article.siteName}\n\n`;
    }
    if (extractedText) {
      extractedText += "---\n\n";
    }

    // Same sanitize + whitespace normalization as the legacy path; truncation happens below
    // on the full formatted string.
    const content = cleanExtractedText(article.textContent || "", false);
    extractedText += content;
    const bodyTextLen = content.length;

    let wasTruncated = false;
    if (extractedText.length > MAX_LENGTH) {
      wasTruncated = true;
      extractedText = extractedText.substring(0, MAX_LENGTH);
    }

    return {
      text: extractedText,
      wasTruncated,
      articleNull: false,
      bodyTextLen,
    };
  }

  // --- Legacy extraction (pre-Readability): structured body walk + selector fallback ---

  /**
   * Collect text from a fixed list of content selectors; when that yields under 500 characters,
   * also sweep every `<p>` in the document. Blocks are de-duplicated by their first 100
   * characters and elements with `display: none` / `visibility: hidden` are skipped.
   *
   * @returns {{text: string, wasTruncated: boolean}} Text capped at MAX_LENGTH.
   */
  function extractLegacySelectorsFallback() {
    const selectors = [
      "article p",
      "article div",
      ".content p",
      ".content div",
      ".post-content p",
      ".entry-content p",
      ".article-body p",
      "main p",
      "main div",
      '[role="main"] p',
      ".story p",
      ".story-body p",
      "#story p",
    ];

    let text = "";
    let wasTruncated = false;
    const seen = new Set();

    // Append the element's text when it is long enough, not yet seen, and visible.
    const appendElementText = (el, minLength) => {
      const content = getTextContent(el).trim();
      const dedupeKey = content.substring(0, 100);
      if (content.length < minLength || seen.has(dedupeKey)) {
        return;
      }
      const style = window.getComputedStyle(el);
      if (style.display === "none" || style.visibility === "hidden") {
        return;
      }
      seen.add(dedupeKey);
      text += content + "\n\n";
      if (text.length > MAX_LENGTH) {
        wasTruncated = true;
      }
    };

    for (const selector of selectors) {
      try {
        for (const el of document.querySelectorAll(selector)) {
          appendElementText(el, 20);
          if (wasTruncated) {
            break;
          }
        }
      } catch (e) {
        // Best effort: a selector that fails is skipped and the next one is tried.
      }
      if (wasTruncated) {
        break;
      }
    }

    if (text.length < 500 && !wasTruncated) {
      for (const p of document.querySelectorAll("p")) {
        // Paragraph sweep requires more than 30 characters.
        appendElementText(p, 31);
        if (wasTruncated) {
          break;
        }
      }
    }

    return { text: text.substring(0, MAX_LENGTH), wasTruncated };
  }

  /**
   * Normalize a `className` / `id` property to a plain string. The value may be a string or an
   * object exposing `baseVal`; anything else yields "".
   *
   * @param {*} value Property value read from an element.
   * @returns {string}
   */
  function attributeToString(value) {
    if (!value) {
      return "";
    }
    if (typeof value === "string") {
      return value;
    }
    return value.baseVal || "";
  }

  /**
   * Decide whether the legacy DOM walk should skip an element (and its subtree).
   *
   * Order matters: excluded tags and hidden elements are always skipped; otherwise anything that
   * looks like main content is kept; only then are navigation-like roles and class/id noise
   * patterns applied.
   *
   * @param {Element} el
   * @returns {boolean} True when the element must be skipped.
   */
  function shouldSkipElement(el) {
    const tag = el.tagName.toLowerCase();
    if (EXCLUDE_TAGS.includes(tag)) {
      return true;
    }

    try {
      const style = window.getComputedStyle(el);
      if (
        style.display === "none" ||
        style.visibility === "hidden" ||
        style.opacity === "0"
      ) {
        return true;
      }
    } catch (e) {
      // Best effort: if the computed style cannot be read, treat the element as visible.
    }

    if (isMainContent(el)) {
      return false;
    }

    const role = el.getAttribute("role");
    if (role === "navigation" || role === "banner" || role === "complementary") {
      return true;
    }

    const className = attributeToString(el.className);
    const id = attributeToString(el.id);
    const classAndId = (className + " " + id).toLowerCase().trim();

    // NOTE: patterns are tested against the whole "class id" string, not per token, so the
    // fully anchored ones (e.g. /^nav$/) only match when that string is exactly the name.
    const strictNoisePatterns = [
      /^nav$/,
      /-nav$/,
      /^nav-/,
      /^navigation$/,
      /^footer$/,
      /-footer$/,
      /^footer-/,
      /^header$/,
      /^site-header$/,
      /^page-header$/,
      /^sidebar$/,
      /^advertisement$/,
      /^ad-container$/,
    ];

    return strictNoisePatterns.some((p) => p.test(classAndId));
  }

  /**
   * Heuristic: does the element look like a main-content container? True for `main`/`article`
   * tags or roles, or when the class name or id contains one of the content keywords
   * (substring match).
   *
   * @param {Element} element
   * @returns {boolean}
   */
  function isMainContent(element) {
    const role = element.getAttribute("role");
    const tagName = element.tagName.toLowerCase();
    const className = attributeToString(element.className).toLowerCase();
    const id = attributeToString(element.id).toLowerCase();

    const contentPatterns = [
      "content",
      "main-content",
      "article-content",
      "post-content",
      "entry-content",
      "page-content",
      "story-content",
      "body-content",
      "article",
      "post",
      "entry",
      "story",
      "main",
    ];
    const isContentClass = contentPatterns.some(
      (p) => className.includes(p) || id.includes(p),
    );

    return (
      role === "main" ||
      role === "article" ||
      tagName === "main" ||
      tagName === "article" ||
      isContentClass
    );
  }

  /**
   * Recursively walk an element's children and render them as lightly structured text:
   * headings as `#`-prefixed lines, paragraphs as blocks, list items as `- ` bullets, and
   * `pre`/`code` as fenced blocks. Other elements are descended into.
   *
   * @param {Element} element Root of the subtree to render.
   * @param {number} [depth=0] Recursion depth; also controls list-item indentation.
   * @returns {string}
   */
  function extractTextFromElement(element, depth = 0) {
    let text = "";
    const indent = " ".repeat(depth);

    // Text sitting directly inside a nested container (not in a child element).
    const directText = getDirectTextContent(element).trim();
    if (directText.length > 20 && depth > 0) {
      text += directText + "\n\n";
    }

    for (const child of element.children) {
      const childTag = child.tagName.toLowerCase();
      if (shouldSkipElement(child)) {
        continue;
      }

      if (/^h[1-6]$/.test(childTag)) {
        const headingText = getTextContent(child).trim();
        if (headingText) {
          const prefix = "#".repeat(parseInt(childTag[1], 10));
          text += `\n${prefix} ${headingText}\n\n`;
        }
      } else if (childTag === "p") {
        const pText = getTextContent(child).trim();
        if (pText.length > 5) {
          text += `${pText}\n\n`;
        }
      } else if (childTag === "li") {
        const liText = getTextContent(child).trim();
        if (liText) {
          text += `${indent}- ${liText}\n`;
        }
      } else if (childTag === "pre" || childTag === "code") {
        const codeText = getTextContent(child).trim();
        if (codeText) {
          text += `\n\`\`\`\n${codeText}\n\`\`\`\n\n`;
        }
      } else {
        const childText = extractTextFromElement(child, depth + 1);
        if (childText.trim()) {
          text += childText;
        }
      }
    }

    return text;
  }

  /**
   * @param {Element} element
   * @returns {string} Trimmed concatenation of the element's own text nodes (children excluded).
   */
  function getDirectTextContent(element) {
    let text = "";
    for (const node of element.childNodes) {
      if (node.nodeType === Node.TEXT_NODE) {
        text += node.textContent;
      }
    }
    return text.trim();
  }

  /**
   * Like `textContent`, but omits excluded subtrees (script/style/template/embeds, ...) and
   * pads br/p/div/li content with spaces so neighbouring blocks do not fuse into one word.
   *
   * @param {Element} element
   * @returns {string} Untrimmed text.
   */
  function getTextContent(element) {
    let text = "";
    for (const node of element.childNodes) {
      if (node.nodeType === Node.TEXT_NODE) {
        text += node.textContent;
      } else if (node.nodeType === Node.ELEMENT_NODE) {
        const tagName = node.tagName.toLowerCase();
        if (shouldExcludeTextSubtree(tagName)) {
          continue;
        }
        if (SPACE_PADDED_TAGS.has(tagName)) {
          text += " " + getTextContent(node) + " ";
        } else {
          text += getTextContent(node);
        }
      }
    }
    return text;
  }

  /**
   * Remove literal embed markup, collapse runs of non-newline whitespace to one space, cap
   * consecutive newlines at two, and trim both ends.
   *
   * @param {string} text
   * @param {boolean} [shouldTruncate=true] When true, cap the result at MAX_LENGTH.
   * @returns {string}
   */
  function cleanExtractedText(text, shouldTruncate = true) {
    let cleaned = sanitizeLiteralEmbedMarkup(text);
    cleaned = cleaned
      .replace(/[^\S\n]+/g, " ")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/^\s+|\s+$/g, "");
    if (shouldTruncate && cleaned.length > MAX_LENGTH) {
      cleaned = cleaned.substring(0, MAX_LENGTH);
    }
    return cleaned;
  }

  /**
   * Full-document extraction used before Readability. Works well for feeds, homepages,
   * and app-like pages where Readability returns nothing or very little.
   *
   * @returns {{text: string, wasTruncated: boolean}} Text capped at MAX_LENGTH.
   */
  function extractLegacy() {
    if (!document.body) {
      return { text: document.title || "", wasTruncated: false };
    }

    let extractedText = "";
    let wasTruncated = false;

    const title = document.title || "";
    if (title) {
      extractedText += `Title: ${title}\n\n`;
    }

    const metaDesc = document.querySelector('meta[name="description"]');
    if (metaDesc) {
      const desc = metaDesc.getAttribute("content");
      if (desc) {
        extractedText += `Description: ${desc}\n\n`;
      }
    }

    extractedText += extractTextFromElement(document.body);
    extractedText = cleanExtractedText(extractedText, false);

    if (extractedText.length > MAX_LENGTH) {
      wasTruncated = true;
      extractedText = extractedText.substring(0, MAX_LENGTH);
    }

    // Thin structured walk: try the selector-based fallback and keep whichever found more.
    if (extractedText.length < 1000) {
      const fallbackResult = extractLegacySelectorsFallback();
      if (fallbackResult.text.length > extractedText.length) {
        // Only emit the header when there is a title, and re-apply the cap because the
        // header is added on top of a fallback text that may already be MAX_LENGTH long.
        const titlePrefix = title ? `Title: ${title}\n\n` : "";
        const combined = titlePrefix + fallbackResult.text;
        wasTruncated = fallbackResult.wasTruncated || combined.length > MAX_LENGTH;
        extractedText = combined.substring(0, MAX_LENGTH);
      }
    }

    return { text: extractedText, wasTruncated };
  }

  /** Legacy debug — uncomment body to log to the tab's DevTools (page context). */
  function logExtractionDebug(source, text) {
    // console.log("[Lede DEBUG] extraction:", source);
    // console.log(
    //   "[Lede DEBUG] raw extracted text (" + text.length + " chars):",
    //   text,
    // );
  }

  /**
   * Call the optional `window.__webaiTryRedditNew` hook when one is installed.
   *
   * @returns {object | null} The hook's result, or null when the hook is absent or throws.
   */
  function tryRedditShredditExtract() {
    if (typeof window.__webaiTryRedditNew !== "function") {
      return null;
    }
    try {
      return window.__webaiTryRedditNew(MAX_LENGTH);
    } catch (err) {
      console.error("[Lede] Reddit extract error:", err);
      return null;
    }
  }

  /**
   * Log the chosen extract and shape it into the result returned by extractContent().
   *
   * @param {{text: string, wasTruncated: boolean}} picked
   * @param {string} extractionSource "legacy" or "readability".
   * @returns {{text: string, wasTruncated: boolean, extractionSource: string}}
   */
  function finishExtraction(picked, extractionSource) {
    logExtractionDebug(extractionSource, picked.text);
    return {
      text: picked.text,
      wasTruncated: picked.wasTruncated,
      extractionSource,
    };
  }

  /**
   * Pick the best extraction for the current page: the Reddit hook when it yields a result,
   * otherwise Readability or the legacy DOM walk according to the heuristics above.
   *
   * @returns {{text: string, wasTruncated: boolean, extractionSource: string,
   *   unsupportedReason?: *}}
   */
  function extractContent() {
    const redditResult = tryRedditShredditExtract();
    if (redditResult) {
      logExtractionDebug(redditResult.extractionSource, redditResult.text);
      return {
        text: redditResult.text,
        wasTruncated: Boolean(redditResult.wasTruncated),
        extractionSource: redditResult.extractionSource,
        unsupportedReason: redditResult.unsupportedReason,
      };
    }

    try {
      const readability = extractWithReadability();
      if (readability.articleNull || readability.text.trim().length === 0) {
        return finishExtraction(extractLegacy(), "legacy");
      }

      const bodyLen = readability.bodyTextLen;
      const contentRatio = bodyLen / Math.max(readability.text.length, 1);
      const biasedToLegacy = readabilityTextBiasLegacy(readability.text);

      // Long article body — Readability clearly won (unless sponsor/promo bias sent us to legacy).
      const strongArticleBody =
        bodyLen >= READABILITY_STRONG_BODY_CHARS && !biasedToLegacy;
      // Short pages / stubs: body is still most of what we output (not metadata padding).
      const ratioLooksLikeArticle =
        bodyLen >= READABILITY_MIN_BODY_CHARS &&
        contentRatio >= READABILITY_MIN_CONTENT_RATIO &&
        !biasedToLegacy;

      if (strongArticleBody || ratioLooksLikeArticle) {
        const flip = tryLegacyWhenShortReadOverwhelmed(readability);
        return flip
          ? finishExtraction(flip, "legacy")
          : finishExtraction(readability, "readability");
      }

      // Marginal Readability (feed hero, cookie copy, tiny grab, promo-biased text): compare
      // with legacy and keep the longer extract; Readability wins ties.
      const legacy = extractLegacy();
      return legacy.text.length > readability.text.length
        ? finishExtraction(legacy, "legacy")
        : finishExtraction(readability, "readability");
    } catch (error) {
      console.error("[Lede] Readability error:", error);
      return finishExtraction(extractLegacy(), "legacy");
    }
  }

  chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    // Do not claim messages this script does not handle: returning true would hold the
    // response channel open even though no response is ever sent from here.
    if (request?.action !== "extract") {
      return false;
    }
    const result = extractContent();
    sendResponse({
      content: result.text,
      wasTruncated: result.wasTruncated,
      extractionSource: result.extractionSource,
      unsupportedReason: result.unsupportedReason,
    });
    return true;
  });
})();