// Courtesy of Claude :) import * as cssTokenizer from "/code/css_tokenizer.js"; const keywords = new Set([ "abstract", "arguments", "await", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield", "async", "of" ]); const builtins = new Set([ "console", "window", "document", "Array", "Object", "String", "Number", "Boolean", "Date", "RegExp", "Math", "JSON", "parseInt", "parseFloat", "isNaN", "isFinite", "undefined", "NaN", "Infinity", "Promise", "Set", "Map", "WeakSet", "WeakMap", "Symbol", "Proxy", "Reflect" ]); const operators = new Set([ "+", "-", "*", "/", "%", "**", "++", "--", "=", "+=", "-=", "*=", "/=", "%=", "**=", "==", "===", "!=", "!==", "<", ">", "<=", ">=", "&&", "||", "!", "&", "|", "^", "~", "<<", ">>", ">>>", "?", ":", "=>", "...", "??", "??=", "&&=", "||=", "&=", "|=", "^=", "<<=", ">>=", ">>>=" ]); export function tokenize(code) { const tokens = []; let i = 0; while (i < code.length) { const char = code[i]; // Whitespace if (/\s/.test(char)) { const start = i; while (i < code.length && /\s/.test(code[i])) i++; tokens.push({ type: "whitespace", value: code.slice(start, i) }); continue; } // Single-line comment if (char === '/' && code[i + 1] === '/') { const start = i; while (i < code.length && code[i] !== '\n') i++; tokens.push({ type: "comment", value: code.slice(start, i) }); continue; } // Multi-line comment if (char === '/' && code[i + 1] === '*') { const start = i; i += 2; while (i < code.length - 1 && !(code[i] === '*' && code[i + 1] === '/')) i++; if (i < code.length - 1) i += 2; tokens.push({ type: "comment", value: code.slice(start, i) }); continue; } // Template literals if (char === '`') { const templateTokens = tokenizeTemplateLiteral(code, i); tokens.push(...templateTokens.tokens); i = templateTokens.newIndex; continue; } // Strings if (char === '"' || char === '\'') { const stringToken = tokenizeString(code, i, char); tokens.push(stringToken.token); i = stringToken.newIndex; continue; } // Regular expressions if (char === '/' && isRegexContext(tokens)) { const regexToken = tokenizeRegex(code, i); if (regexToken) { tokens.push(regexToken.token); i = regexToken.newIndex; continue; } } // Numbers if (/\d/.test(char) || (char === '.' && /\d/.test(code[i + 1]))) { const numberToken = tokenizeNumber(code, i); tokens.push(numberToken.token); i = numberToken.newIndex; continue; } // Identifiers and keywords if (/[a-zA-Z_$]/.test(char)) { const identifierToken = tokenizeIdentifier(code, i); tokens.push(identifierToken.token); i = identifierToken.newIndex; continue; } // Multi-character operators const twoChar = code.slice(i, i + 2); const threeChar = code.slice(i, i + 3); if (operators.has(threeChar)) { tokens.push({ type: "operator", value: threeChar }); i += 3; continue; } if (operators.has(twoChar)) { tokens.push({ type: "operator", value: twoChar }); i += 2; continue; } // Single-character operators and punctuation if (operators.has(char)) { tokens.push({ type: "operator", value: char }); i++; continue; } if ("{}[]().,;".includes(char)) { tokens.push({ type: "punctuation", value: char }); i++; continue; } // Unknown character tokens.push({ type: "unknown", value: char }); i++; } return tokens; } function tokenizeTemplateLiteral(code, start) { const tokens = []; let i = start; let current = ""; // Opening backtick current += code[i++]; // Check for CSS string const remainingCode = code.slice(i); if (remainingCode.startsWith("/* css */")) { // Find the closing quote let j = i; while (j < code.length && code[j] !== '`') { if (code[j] === '\\') j += 2; else j++; } if (j < code.length) { const cssContent = code.slice(i + 9, j); // Skip "/* css */" const cssTokens = cssTokenizer.tokenize(cssContent); // Create a compound token for the CSS string current += code.slice(i, j + 1); return { tokens: [{ type: "css-string", value: current, cssTokens }], newIndex: j + 1 }; } } while (i < code.length) { const char = code[i]; if (char === '`') { current += char; tokens.push({ type: "template-literal", value: current }); i++; break; } if (char === '$' && code[i + 1] === '{') { if (current) { tokens.push({ type: "template-literal", value: current }); } // Find matching closing brace let braceCount = 1; let j = i + 2; while (j < code.length && braceCount > 0) { if (code[j] === '{') braceCount++; else if (code[j] === '}') braceCount--; j++; } const expression = code.slice(i, j); tokens.push({ type: "template-expression", value: expression }); i = j; current = ""; continue; } if (char === '\\') { current += char; i++; if (i < code.length) { current += code[i]; i++; } continue; } current += char; i++; } return { tokens, newIndex: i }; } function tokenizeString(code, start, quote) { let i = start; let value = ""; value += code[i++]; // Opening quote // Check for CSS string const remainingCode = code.slice(i); if (remainingCode.startsWith("/* css */")) { // Find the closing quote let j = i; while (j < code.length && code[j] !== quote) { if (code[j] === '\\') j += 2; else j++; } if (j < code.length) { const cssContent = code.slice(i + 9, j); // Skip "/* css */" const cssTokens = cssTokenizer.tokenize(cssContent); // Create a compound token for the CSS string value += code.slice(i, j + 1); return { token: { type: "css-string", value, cssTokens }, newIndex: j + 1 }; } } while (i < code.length) { const char = code[i]; if (char === quote) { value += char; i++; break; } if (char === "\\") { value += char; i++; if (i < code.length) { value += code[i]; i++; } continue; } value += char; i++; } return { token: { type: "string", value }, newIndex: i }; } function tokenizeRegex(code, start) { let i = start + 1; // Skip opening / let value = "/"; while (i < code.length) { const char = code[i]; if (char === '/') { value += char; i++; // Parse flags while (i < code.length && /[gimsuvy]/.test(code[i])) { value += code[i]; i++; } return { token: { type: "regex", value }, newIndex: i }; } if (char === '\\') { value += char; i++; if (i < code.length) { value += code[i]; i++; } continue; } if (char === '\n') { return null; // Invalid regex } value += char; i++; } return null; // Unterminated regex } function tokenizeNumber(code, start) { let i = start; let value = ""; // Handle hex numbers if (code[i] === '0' && (code[i + 1] === 'x' || code[i + 1] === 'X')) { value += code[i++]; value += code[i++]; while (i < code.length && /[0-9a-fA-F]/.test(code[i])) { value += code[i++]; } return { token: { type: "number", value }, newIndex: i }; } // Handle decimal numbers let hasDecimal = false; while (i < code.length && (/\d/.test(code[i]) || (code[i] === '.' && !hasDecimal))) { if (code[i] === '.') hasDecimal = true; value += code[i++]; } // Handle scientific notation if (i < code.length && (code[i] === 'e' || code[i] === 'E')) { value += code[i++]; if (i < code.length && (code[i] === '+' || code[i] === '-')) { value += code[i++]; } while (i < code.length && /\d/.test(code[i])) { value += code[i++]; } } return { token: { type: "number", value }, newIndex: i }; } function tokenizeIdentifier(code, start) { let i = start; let value = ""; while (i < code.length && /[a-zA-Z0-9_$]/.test(code[i])) { value += code[i++]; } let type = "identifier"; if (keywords.has(value)) type = "keyword"; else if (builtins.has(value)) type = "builtin"; return { token: { type, value }, newIndex: i }; } function isRegexContext(tokens) { // Look at the last non-whitespace token to determine if / starts a regex for (let i = tokens.length - 1; i >= 0; i--) { const token = tokens[i]; if (token.type === "whitespace") continue; // Regex is likely after these tokens if (["operator", "keyword", "punctuation"].includes(token.type)) { if (token.value === ')' || token.value === ']') return false; return true; } // Not a regex after identifiers or numbers if (["identifier", "number"].includes(token.type)) return false; break; } return true; // Default to regex at start of input }