// Content script: Mozilla Readability for articles, legacy DOM walk for feeds / non-articles

(function () {
  "use strict";

  // Prevent multiple injections: when an earlier run of this script already set the
  // window flag, leave the IIFE immediately so nothing below (including the message
  // listener registered at the end) is set up a second time.
  if (window.__webaiExtractorInstalled) {
    return;
  }
  window.__webaiExtractorInstalled = true;

  // Character cap applied to extracted text: a truthy CONFIG.EXTRACTION.MAX_LENGTH wins
  // when a CONFIG binding exists, otherwise the built-in default of 50000 is used.
  const MAX_LENGTH =
    (typeof CONFIG !== "undefined" && CONFIG.EXTRACTION?.MAX_LENGTH) || 50000;

  /**
   * Readability `textContent` length (article body only). Above this, treat as a real article
   * extract. Do NOT use total formatted string length — title/metadata can exceed 500 chars alone.
   * Kept moderately high so homepages don't classify sponsor blocks as "articles".
   */
  const READABILITY_STRONG_BODY_CHARS = 560;

  /**
   * When body is shorter, still trust Readability if the body is a large share of our formatted
   * output (not mostly "Title:/Description:" boilerplate from a feed shell).
   */
  const READABILITY_MIN_BODY_CHARS = 120;

  /** Minimum body / formatted-text ratio for the "substantial body" heuristic. */
  const READABILITY_MIN_CONTENT_RATIO = 0.34;

  /**
   * If Readability output is shorter than this, compare with legacy: full-page text often
   * dwarfs a homepage "article" grab (e.g. link aggregators).
   */
  const READABILITY_SHORT_EXTRACT_MAX = 2200;

  /**
   * When Readability extract is "short", prefer legacy if the DOM walk yields this many times
   * more text (typical feed / homepage vs a thin Readability pick).
   */
  const READABILITY_LEGACY_DOMINANCE_RATIO = 3.25;

  /**
   * Phrases that mark a Readability grab as promo / feed chrome rather than prose.
   * Every pattern is case-insensitive and non-global, so `test` keeps no state between calls.
   */
  const READABILITY_BIAS_LEGACY_REGEXES = [
    /\bSponsor(?:ed)?\s+Posts?\b/i,
    /\bPromoted\s+(?:Stories?|Posts?|Content)\b/i,
    /\bAdvertiser\s+Content\b/i,
    /\bPaid\s+Partnership\b/i,
  ];

  /**
   * Report whether the formatted extract contains any promo / sponsored-content marker.
   * @param {string} formattedText - Formatted extraction text to scan.
   * @returns {boolean} True when at least one marker pattern matches.
   */
  function readabilityTextBiasLegacy(formattedText) {
    for (const marker of READABILITY_BIAS_LEGACY_REGEXES) {
      if (marker.test(formattedText)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Decide whether a short Readability extract should be replaced by the legacy DOM walk.
   *
   * A short Readability result next to a much larger legacy result usually means the page is
   * a listing / homepage rather than an article. The legacy walk is only run when the
   * Readability text is within READABILITY_SHORT_EXTRACT_MAX and the body is not already
   * clearly long-form (READABILITY_STRONG_BODY_CHARS + 400 or more).
   *
   * @param {{text: string, bodyTextLen: number}} readability - Result of the Readability pass.
   * @returns {{text: string, wasTruncated: boolean}|null} The legacy result when it exceeds
   *   the Readability text length by READABILITY_LEGACY_DOMINANCE_RATIO, otherwise null.
   */
  function tryLegacyWhenShortReadOverwhelmed(readability) {
    const readabilityChars = readability.text.length;
    const longFormCutoff = READABILITY_STRONG_BODY_CHARS + 400;

    const extractIsLong = readabilityChars > READABILITY_SHORT_EXTRACT_MAX;
    if (extractIsLong || readability.bodyTextLen >= longFormCutoff) {
      return null;
    }

    const legacyResult = extractLegacy();
    const dominanceThreshold = readabilityChars * READABILITY_LEGACY_DOMINANCE_RATIO;
    return legacyResult.text.length > dominanceThreshold ? legacyResult : null;
  }

  /**
   * Lowercase tag names that `shouldSkipElement` rejects outright during the structured
   * body walk. Also forms the base of TEXT_SUBTREE_EXCLUDE_TAGS below.
   */
  const EXCLUDE_TAGS = [
    "script",
    "style",
    "noscript",
    "iframe",
    "embed",
    "object",
    "frame",
    "nav",
    "aside",
    "form",
    "button",
    "input",
  ];

  /**
   * Subtrees to omit from plain-text collection. `textContent` includes script/style/template
   * bodies, which pulls in Astro/React/Vite inline bundles when walking `main div` etc.
   * Also skip embed-like tags (iframe/object/embed) so news players do not dump URLs or attrs.
   * Contains every EXCLUDE_TAGS entry plus "template"; a Set so lookups use `has`.
   */
  const TEXT_SUBTREE_EXCLUDE_TAGS = new Set([
    ...EXCLUDE_TAGS,
    "template",
  ]);

  /**
   * Tell whether an element's whole subtree must be left out of plain-text collection.
   * @param {string} tagName - Element tag name in any letter case.
   * @returns {boolean} True when the lowercased tag is in TEXT_SUBTREE_EXCLUDE_TAGS.
   */
  function shouldExcludeTextSubtree(tagName) {
    const normalizedTag = tagName.toLowerCase();
    return TEXT_SUBTREE_EXCLUDE_TAGS.has(normalizedTag);
  }

  /**
   * Remove embed markup that shows up as literal text in article bodies (CMS / oEmbed
   * fallbacks). This complements subtree skipping, since some sites surface the tags as
   * visible copy.
   *
   * Rules run in order: paired iframes first, then leftover opening and closing iframe tags,
   * then embed tags, then paired object elements. Quantifiers are bounded so a stray opening
   * tag cannot swallow an arbitrarily long stretch of text.
   *
   * @param {*} text - Candidate text; empty strings and non-strings are returned untouched.
   * @returns {*} The text with embed markup replaced (by a newline, or by nothing for a
   *   stray closing iframe tag), or the original value when it is not a non-empty string.
   */
  function sanitizeLiteralEmbedMarkup(text) {
    if (typeof text !== "string" || text === "") {
      return text;
    }

    const stripRules = [
      [/<iframe\b[\s\S]{0,20000}?<\/iframe>/gi, "\n"],
      [/<iframe\b[^>]{0,8000}\/?>/gi, "\n"],
      [/<\/iframe>/gi, ""],
      [/<embed\b[^>]{0,8000}\/?>/gi, "\n"],
      [/<object\b[\s\S]{0,20000}?<\/object>/gi, "\n"],
    ];

    return stripRules.reduce(
      (current, [pattern, replacement]) => current.replace(pattern, replacement),
      text,
    );
  }

  /**
   * Run Readability on a clone of the document and format the result as plain text.
   *
   * Output layout: optional "Label: value" header lines (Title, Author, Description,
   * Published, Source), a "---" separator when any header line exists, then the whitespace-
   * normalized article body. The description is left out when it merely repeats the title.
   *
   * @returns {{text: string, wasTruncated: boolean, articleNull: boolean, bodyTextLen: number}}
   *   `articleNull` is true when Readability produced no article; `bodyTextLen` is the
   *   normalized body length measured before any MAX_LENGTH truncation of `text`.
   */
  function extractWithReadability() {
    // Parse a clone: the parser is handed a copy rather than the live page document.
    const article = new Readability(document.cloneNode(true)).parse();
    if (!article) {
      return { text: "", wasTruncated: false, articleNull: true, bodyTextLen: 0 };
    }

    const headerFields = [
      ["Title", article.title],
      ["Author", article.byline],
      ["Description", article.excerpt !== article.title ? article.excerpt : ""],
      ["Published", article.publishedTime],
      ["Source", article.siteName],
    ];
    const header = headerFields
      .filter(([, value]) => value)
      .map(([label, value]) => `${label}: ${value}\n\n`)
      .join("");
    const separator = header ? "---\n\n" : "";

    // Collapse runs of non-newline whitespace, cap blank-line runs at one, trim the ends.
    const body = sanitizeLiteralEmbedMarkup(article.textContent || "")
      .replace(/[^\S\n]+/g, " ")
      .replace(/\n{3,}/g, "\n\n")
      .trim();

    const formatted = header + separator + body;
    const wasTruncated = formatted.length > MAX_LENGTH;

    return {
      text: wasTruncated ? formatted.substring(0, MAX_LENGTH) : formatted,
      wasTruncated,
      articleNull: false,
      bodyTextLen: body.length,
    };
  }

  // --- Legacy extraction (pre-Readability): structured body walk + selector fallback ---

  /**
   * Selector-driven fallback: gather text from well-known content containers, then, if that
   * yields under 500 characters, from every paragraph on the page.
   *
   * Blocks are de-duplicated by their first 100 characters and skipped when computed style
   * reports `display: none` or `visibility: hidden`. Collection stops once the accumulated
   * text exceeds MAX_LENGTH.
   *
   * @returns {{text: string, wasTruncated: boolean}} Text capped at MAX_LENGTH plus a flag
   *   telling whether the cap was exceeded during collection.
   */
  function extractLegacySelectorsFallback() {
    const CONTENT_SELECTORS = [
      "article p",
      "article div",
      ".content p",
      ".content div",
      ".post-content p",
      ".entry-content p",
      ".article-body p",
      "main p",
      "main div",
      '[role="main"] p',
      ".story p",
      ".story-body p",
      "#story p",
    ];
    const DEDUPE_PREFIX_CHARS = 100;

    const seenPrefixes = new Set();
    let collected = "";
    let overflowed = false;

    // Append one element's text when it is long enough, unseen and visible.
    const consider = (node, minChars) => {
      const body = getTextContent(node).trim();
      const key = body.substring(0, DEDUPE_PREFIX_CHARS);
      if (body.length < minChars || seenPrefixes.has(key)) {
        return;
      }

      const { display, visibility } = window.getComputedStyle(node);
      if (display === "none" || visibility === "hidden") {
        return;
      }

      seenPrefixes.add(key);
      collected += body + "\n\n";
      if (collected.length > MAX_LENGTH) {
        overflowed = true;
      }
    };

    for (const selector of CONTENT_SELECTORS) {
      try {
        for (const node of document.querySelectorAll(selector)) {
          consider(node, 20);
          if (overflowed) {
            break;
          }
        }
      } catch (e) {
        // Ignore invalid selectors (any failure abandons the rest of this selector only)
      }
      if (overflowed) {
        break;
      }
    }

    // Thin result: widen the net to every <p>, with a slightly higher length bar.
    if (collected.length < 500 && !overflowed) {
      for (const paragraph of document.querySelectorAll("p")) {
        consider(paragraph, 31);
        if (overflowed) {
          break;
        }
      }
    }

    return { text: collected.substring(0, MAX_LENGTH), wasTruncated: overflowed };
  }

  /**
   * Anchored patterns for a SINGLE class or id token that marks page chrome
   * (navigation, footers, headers, sidebars, ads).
   */
  const STRICT_NOISE_TOKEN_PATTERNS = [
    /^nav$/,
    /-nav$/,
    /^nav-/,
    /^navigation$/,
    /^footer$/,
    /-footer$/,
    /^footer-/,
    /^header$/,
    /^site-header$/,
    /^page-header$/,
    /^sidebar$/,
    /^advertisement$/,
    /^ad-container$/,
  ];

  /**
   * Decide whether the structured DOM walk should ignore an element and its subtree.
   *
   * Checks, in order: excluded tag names, hidden computed style, the main-content override
   * (`isMainContent` elements are always kept), chrome-like ARIA roles, and finally
   * chrome-like class / id tokens.
   *
   * The noise patterns are anchored, so each class / id token is tested on its own. Testing
   * the joined "class id" string would let `class="sidebar widget"` slip through.
   *
   * @param {Element} el - Element under consideration.
   * @returns {boolean} True when the element should be skipped.
   */
  function shouldSkipElement(el) {
    const tag = el.tagName.toLowerCase();
    if (EXCLUDE_TAGS.includes(tag)) {
      return true;
    }

    try {
      const style = window.getComputedStyle(el);
      if (
        style.display === "none" ||
        style.visibility === "hidden" ||
        style.opacity === "0"
      ) {
        return true;
      }
    } catch (e) {
      // Best effort: when computed style is unavailable, treat the element as visible.
    }

    // Main-content markers win over every noise heuristic below.
    if (isMainContent(el)) {
      return false;
    }

    const role = el.getAttribute("role");
    if (role === "navigation" || role === "banner" || role === "complementary") {
      return true;
    }

    // `className` may be an object exposing `baseVal` (e.g. on SVG elements) instead of a string.
    const attrText = (value) =>
      typeof value === "string" ? value : value?.baseVal || "";

    const tokens = `${attrText(el.className)} ${attrText(el.id)}`
      .toLowerCase()
      .split(/\s+/)
      .filter(Boolean);

    return tokens.some((token) =>
      STRICT_NOISE_TOKEN_PATTERNS.some((pattern) => pattern.test(token)),
    );
  }

  /**
   * Heuristically decide whether an element is a primary content container.
   *
   * An element qualifies when its role or tag is main / article, or when its lowercased
   * class or id CONTAINS any content marker. Matching is by substring, so e.g. "postscript"
   * matches "post".
   *
   * @param {Element} element - Element to classify.
   * @returns {boolean} True when the element looks like main content.
   */
  function isMainContent(element) {
    const CONTENT_MARKERS = [
      "content",
      "main-content",
      "article-content",
      "post-content",
      "entry-content",
      "page-content",
      "story-content",
      "body-content",
      "article",
      "post",
      "entry",
      "story",
      "main",
    ];

    // Accept a plain string or an object exposing `baseVal`; anything else counts as empty.
    const lowered = (raw) => {
      if (!raw) {
        return "";
      }
      if (typeof raw === "string") {
        return raw.toLowerCase();
      }
      return raw.baseVal ? raw.baseVal.toLowerCase() : "";
    };

    const role = element.getAttribute("role");
    const tag = element.tagName.toLowerCase();
    const classText = lowered(element.className);
    const idText = lowered(element.id);

    const hasContentMarker = CONTENT_MARKERS.some(
      (marker) => classText.includes(marker) || idText.includes(marker),
    );
    const isLandmark = [role, tag].some(
      (value) => value === "main" || value === "article",
    );

    return isLandmark || hasContentMarker;
  }

  /**
   * Recursively turn an element subtree into lightly structured plain text.
   *
   * Headings become Markdown-style `#` lines, paragraphs become blank-line separated blocks,
   * list items become `- ` bullets indented by nesting depth, and pre/code children are
   * fenced. Any other child is walked recursively. An element's own direct text is emitted
   * only below the root (depth > 0) and only when longer than 20 characters.
   *
   * @param {Element} element - Subtree root.
   * @param {number} [depth=0] - Nesting depth; 0 for the starting element.
   * @returns {string} Extracted text, possibly empty.
   */
  function extractTextFromElement(element, depth = 0) {
    const pieces = [];
    const bulletIndent = "  ".repeat(depth);

    const ownText = getDirectTextContent(element).trim();
    if (depth > 0 && ownText.length > 20) {
      pieces.push(`${ownText}\n\n`);
    }

    for (const child of element.children) {
      const tag = child.tagName.toLowerCase();
      if (shouldSkipElement(child)) {
        continue;
      }

      const headingLevel = /^h([1-6])$/.exec(tag);
      if (headingLevel) {
        const heading = getTextContent(child).trim();
        if (heading) {
          pieces.push(`\n${"#".repeat(Number(headingLevel[1]))} ${heading}\n\n`);
        }
        continue;
      }

      switch (tag) {
        case "p": {
          const paragraph = getTextContent(child).trim();
          if (paragraph.length > 5) {
            pieces.push(`${paragraph}\n\n`);
          }
          break;
        }
        case "li": {
          const item = getTextContent(child).trim();
          if (item) {
            pieces.push(`${bulletIndent}- ${item}\n`);
          }
          break;
        }
        case "pre":
        case "code": {
          const code = getTextContent(child).trim();
          if (code) {
            pieces.push(`\n\`\`\`\n${code}\n\`\`\`\n\n`);
          }
          break;
        }
        default: {
          const nested = extractTextFromElement(child, depth + 1);
          if (nested.trim()) {
            pieces.push(nested);
          }
        }
      }
    }

    return pieces.join("");
  }

  /**
   * Concatenate the text of an element's own text-node children, ignoring every
   * descendant element.
   * @param {Element} element - Element whose direct text is wanted.
   * @returns {string} Trimmed concatenation of the direct text nodes.
   */
  function getDirectTextContent(element) {
    return Array.from(element.childNodes)
      .reduce(
        (joined, child) =>
          child.nodeType === Node.TEXT_NODE ? joined + child.textContent : joined,
        "",
      )
      .trim();
  }

  /**
   * Collect the text of a subtree, leaving out subtrees rejected by
   * `shouldExcludeTextSubtree`.
   *
   * Text from br / p / div / li children is padded with a space on each side so adjacent
   * blocks do not run together. Nodes that are neither text nor element are ignored.
   *
   * @param {Node} element - Subtree root.
   * @returns {string} Untrimmed text of the subtree.
   */
  function getTextContent(element) {
    const SPACE_PADDED_TAGS = ["br", "p", "div", "li"];
    let collected = "";

    for (const child of element.childNodes) {
      switch (child.nodeType) {
        case Node.TEXT_NODE:
          collected += child.textContent;
          break;
        case Node.ELEMENT_NODE: {
          const tag = child.tagName.toLowerCase();
          if (shouldExcludeTextSubtree(tag)) {
            break;
          }
          const inner = getTextContent(child);
          collected += SPACE_PADDED_TAGS.includes(tag) ? " " + inner + " " : inner;
          break;
        }
        default:
          break;
      }
    }

    return collected;
  }

  /**
   * Normalize extracted text: strip literal embed markup, collapse runs of non-newline
   * whitespace to one space, limit consecutive newlines to two, and trim both ends.
   *
   * @param {string} text - Raw extracted text.
   * @param {boolean} [shouldTruncate=true] - Whether to cap the result at MAX_LENGTH.
   * @returns {string} Cleaned (and optionally capped) text.
   */
  function cleanExtractedText(text, shouldTruncate = true) {
    const normalized = sanitizeLiteralEmbedMarkup(text)
      .replace(/[^\S\n]+/g, " ")
      .replace(/\n{3,}/g, "\n\n")
      .trim();

    const exceedsCap = shouldTruncate && normalized.length > MAX_LENGTH;
    return exceedsCap ? normalized.substring(0, MAX_LENGTH) : normalized;
  }

  /**
   * Full-document extraction used before Readability. Works well for feeds, homepages,
   * and app-like pages where Readability returns nothing or very little.
   *
   * Produces optional `Title:` / `Description:` header lines followed by the structured body
   * walk. When that yields fewer than 1000 characters, the selector fallback is tried and
   * used if it is longer (the title, when present, is kept in front of it).
   *
   * Every path respects MAX_LENGTH: the fallback branch re-applies the cap after the title is
   * prepended and reports the cut via `wasTruncated`. An empty title produces no dangling
   * `Title:` label.
   *
   * @returns {{text: string, wasTruncated: boolean}} Extracted text and truncation flag.
   */
  function extractLegacy() {
    const title = document.title || "";
    if (!document.body) {
      return { text: title, wasTruncated: false };
    }

    let header = "";
    if (title) {
      header += `Title: ${title}\n\n`;
    }
    const description = document
      .querySelector('meta[name="description"]')
      ?.getAttribute("content");
    if (description) {
      header += `Description: ${description}\n\n`;
    }

    // Clean without truncating so the cap below can report whether anything was cut.
    let extractedText = cleanExtractedText(
      header + extractTextFromElement(document.body),
      false,
    );

    let wasTruncated = false;
    if (extractedText.length > MAX_LENGTH) {
      wasTruncated = true;
      extractedText = extractedText.substring(0, MAX_LENGTH);
    }

    if (extractedText.length < 1000) {
      const fallbackResult = extractLegacySelectorsFallback();
      if (fallbackResult.text.length > extractedText.length) {
        const combined = title
          ? `Title: ${title}\n\n${fallbackResult.text}`
          : fallbackResult.text;
        // The fallback text may already fill the cap; the title must not push past it.
        wasTruncated = fallbackResult.wasTruncated || combined.length > MAX_LENGTH;
        extractedText = combined.substring(0, MAX_LENGTH);
      }
    }

    return { text: extractedText, wasTruncated };
  }

  /**
   * Debug hook called by `extractContent` with the chosen extraction source and its text.
   * Currently a no-op: the logging statements below are commented out. Uncomment the body
   * to log to the tab's DevTools (page context).
   * @param {string} source - Label of the extraction path that produced the text.
   * @param {string} text - The extracted text that would be logged.
   */
  function logExtractionDebug(source, text) {
    // console.log("[Lede DEBUG] extraction:", source);
    // console.log(
    //   "[Lede DEBUG] raw extracted text (" + text.length + " chars):",
    //   text,
    // );
  }

  /**
   * Call the optional `window.__webaiTryRedditNew` hook when it is installed on the page.
   *
   * @returns {*} Whatever the hook returns when invoked with MAX_LENGTH, or null when the
   *   hook is not a function or throws (the error is logged to the console).
   */
  function tryRedditShredditExtract() {
    let outcome = null;

    if (typeof window.__webaiTryRedditNew === "function") {
      try {
        outcome = window.__webaiTryRedditNew(MAX_LENGTH);
      } catch (err) {
        console.error("[Lede] Reddit extract error:", err);
        outcome = null;
      }
    }

    return outcome;
  }

  /**
   * Pick the best extraction for the current page.
   *
   * Order of preference:
   *  1. The Reddit hook result, when one is returned.
   *  2. Legacy, when Readability finds no article or produces empty text.
   *  3. Readability, when the body is long enough or makes up enough of the formatted output
   *     and no promo marker is present. A short extract may still flip to legacy via
   *     `tryLegacyWhenShortReadOverwhelmed`.
   *  4. Otherwise (marginal Readability result) whichever of legacy / Readability is longer.
   *
   * Any exception raised along the Readability path is logged and answered with legacy.
   *
   * @returns {{text: string, wasTruncated: boolean, extractionSource: string,
   *   unsupportedReason?: *}} The chosen extraction.
   */
  function extractContent() {
    const fromReddit = tryRedditShredditExtract();
    if (fromReddit) {
      const { text, wasTruncated, extractionSource, unsupportedReason } = fromReddit;
      logExtractionDebug(extractionSource, text);
      return {
        text,
        wasTruncated: Boolean(wasTruncated),
        extractionSource,
        unsupportedReason,
      };
    }

    // Log the pick and shape it into the { text, wasTruncated, extractionSource } result.
    const finish = (picked, extractionSource) => {
      logExtractionDebug(extractionSource, picked.text);
      return {
        text: picked.text,
        wasTruncated: picked.wasTruncated,
        extractionSource,
      };
    };

    // Run legacy and pass its result through whole, tagged with its source.
    const finishWithFreshLegacy = () => {
      const legacy = extractLegacy();
      logExtractionDebug("legacy", legacy.text);
      return { ...legacy, extractionSource: "legacy" };
    };

    try {
      const readability = extractWithReadability();
      if (readability.articleNull || readability.text.trim().length === 0) {
        return finishWithFreshLegacy();
      }

      const bodyLen = readability.bodyTextLen;
      const readLen = readability.text.length;
      const contentRatio = bodyLen / Math.max(readLen, 1);
      const promoBiased = readabilityTextBiasLegacy(readability.text);

      // Long article body, or a short page whose body is still most of the output.
      const hasStrongBody = bodyLen >= READABILITY_STRONG_BODY_CHARS;
      const hasSubstantialShare =
        bodyLen >= READABILITY_MIN_BODY_CHARS &&
        contentRatio >= READABILITY_MIN_CONTENT_RATIO;

      if (!promoBiased && (hasStrongBody || hasSubstantialShare)) {
        const flip = tryLegacyWhenShortReadOverwhelmed(readability);
        return flip ? finish(flip, "legacy") : finish(readability, "readability");
      }

      // Marginal Readability (feed hero, cookie copy, tiny grab): compare with legacy.
      const legacy = extractLegacy();
      const legLen = legacy.text.length;

      // Body exists but is a small share of the output while legacy is clearly larger.
      const thinShareOverwhelmed =
        bodyLen >= READABILITY_MIN_BODY_CHARS &&
        contentRatio < READABILITY_MIN_CONTENT_RATIO &&
        legLen > readLen * 1.4;

      if (thinShareOverwhelmed || legLen > readLen) {
        return finish(legacy, "legacy");
      }
      return finish(readability, "readability");
    } catch (error) {
      console.error("[Lede] Readability error:", error);
      return finishWithFreshLegacy();
    }
  }

  /**
   * Message entry point: answers `{ action: "extract" }` requests with the extracted page
   * content.
   *
   * The reply is sent synchronously, so the listener never needs to return `true` (which
   * would signal a pending asynchronous response). Messages with any other action are left
   * untouched; returning `true` for them would claim a response that never arrives.
   */
  chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    if (request?.action !== "extract") {
      return false;
    }

    const { text, wasTruncated, extractionSource, unsupportedReason } =
      extractContent();
    sendResponse({
      content: text,
      wasTruncated,
      extractionSource,
      unsupportedReason,
    });
    return false;
  });
})();