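// A small JavaScript tokenizer (e.g. for syntax highlighting). It splits source
// text into a flat list of { type, value } tokens — keywords, builtins,
// identifiers, numbers, strings, template literals, regexes, comments,
// operators, punctuation, and whitespace. Strings and template literals whose
// contents begin with a "/* css */" marker are handed to the imported CSS
// tokenizer, and the resulting "css-string" token also carries nested cssTokens.
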
// Courtesy of Claude :)

import * as cssTokenizer from "/code/css_tokenizer.js";

const keywords = new Set([
  "abstract", "arguments", "await", "boolean", "break", "byte", "case", "catch",
  "char", "class", "const", "continue", "debugger", "default", "delete", "do",
  "double", "else", "enum", "eval", "export", "extends", "false", "final",
  "finally", "float", "for", "function", "goto", "if", "implements", "import",
  "in", "instanceof", "int", "interface", "let", "long", "native", "new",
  "null", "package", "private", "protected", "public", "return", "short",
  "static", "super", "switch", "synchronized", "this", "throw", "throws",
  "transient", "true", "try", "typeof", "var", "void", "volatile", "while",
  "with", "yield", "async", "of"
]);

const builtins = new Set([
  "console", "window", "document", "Array", "Object", "String", "Number",
  "Boolean", "Date", "RegExp", "Math", "JSON", "parseInt", "parseFloat",
  "isNaN", "isFinite", "undefined", "NaN", "Infinity", "Promise", "Set",
  "Map", "WeakSet", "WeakMap", "Symbol", "Proxy", "Reflect"
]);

const operators = new Set([
  "+", "-", "*", "/", "%", "**", "++", "--", "=", "+=", "-=", "*=", "/=", "%=",
  "**=", "==", "===", "!=", "!==", "<", ">", "<=", ">=", "&&", "||", "!",
  "&", "|", "^", "~", "<<", ">>", ">>>", "?", ":", "=>", "...", "??", "??=",
  "&&=", "||=", "&=", "|=", "^=", "<<=", ">>=", ">>>="
]);

export function tokenize(code) {
  const tokens = [];
  let i = 0;

  while (i < code.length) {
    const char = code[i];

    // Whitespace
    if (/\s/.test(char)) {
      const start = i;
      while (i < code.length && /\s/.test(code[i])) i++;
      tokens.push({ type: "whitespace", value: code.slice(start, i) });
      continue;
    }

    // Single-line comment
    if (char === '/' && code[i + 1] === '/') {
      const start = i;
      while (i < code.length && code[i] !== '\n') i++;
      tokens.push({ type: "comment", value: code.slice(start, i) });
      continue;
    }

    // Multi-line comment
    if (char === '/' && code[i + 1] === '*') {
      const start = i;
      i += 2;
      while (i < code.length - 1 && !(code[i] === '*' && code[i + 1] === '/')) i++;
      if (i < code.length - 1) i += 2;
      tokens.push({ type: "comment", value: code.slice(start, i) });
      continue;
    }

    // Template literals
    if (char === '`') {
      const templateTokens = tokenizeTemplateLiteral(code, i);
      tokens.push(...templateTokens.tokens);
      i = templateTokens.newIndex;
      continue;
    }

    // Strings
    if (char === '"' || char === '\'') {
      const stringToken = tokenizeString(code, i, char);
      tokens.push(stringToken.token);
      i = stringToken.newIndex;
      continue;
    }

    // Regular expressions
    if (char === '/' && isRegexContext(tokens)) {
      const regexToken = tokenizeRegex(code, i);
      if (regexToken) {
        tokens.push(regexToken.token);
        i = regexToken.newIndex;
        continue;
      }
    }

    // Numbers
    if (/\d/.test(char) || (char === '.' && /\d/.test(code[i + 1]))) {
      const numberToken = tokenizeNumber(code, i);
      tokens.push(numberToken.token);
      i = numberToken.newIndex;
      continue;
    }

    // Identifiers and keywords
    if (/[a-zA-Z_$]/.test(char)) {
      const identifierToken = tokenizeIdentifier(code, i);
      tokens.push(identifierToken.token);
      i = identifierToken.newIndex;
      continue;
    }

    // Multi-character operators (longest match first, so ">>>=" is not split into ">>>" and "=")
    const fourChar = code.slice(i, i + 4);
    const threeChar = code.slice(i, i + 3);
    const twoChar = code.slice(i, i + 2);

    if (operators.has(fourChar)) {
      tokens.push({ type: "operator", value: fourChar });
      i += 4;
      continue;
    }

    if (operators.has(threeChar)) {
      tokens.push({ type: "operator", value: threeChar });
      i += 3;
      continue;
    }

    if (operators.has(twoChar)) {
      tokens.push({ type: "operator", value: twoChar });
      i += 2;
      continue;
    }

    // Single-character operators and punctuation
    if (operators.has(char)) {
      tokens.push({ type: "operator", value: char });
      i++;
      continue;
    }

    if ("{}[]().,;".includes(char)) {
      tokens.push({ type: "punctuation", value: char });
      i++;
      continue;
    }

    // Unknown character
    tokens.push({ type: "unknown", value: char });
    i++;
  }

  return tokens;
}
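
// The token stream is a flat array of { type, value } objects. For example,
// tokenize("let x = 1;") yields:
//
//   { type: "keyword",     value: "let" }
//   { type: "whitespace",  value: " "   }
//   { type: "identifier",  value: "x"   }
//   { type: "whitespace",  value: " "   }
//   { type: "operator",    value: "="   }
//   { type: "whitespace",  value: " "   }
//   { type: "number",      value: "1"   }
//   { type: "punctuation", value: ";"   }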

function tokenizeTemplateLiteral(code, start) {
  const tokens = [];
  let i = start;
  let current = "";

  // Opening backtick
  current += code[i++];

  // Check for CSS string
  const remainingCode = code.slice(i);
  if (remainingCode.startsWith("/* css */")) {
    // Find the closing backtick
    let j = i;
    while (j < code.length && code[j] !== '`') {
      if (code[j] === '\\') j += 2;
      else j++;
    }

    if (j < code.length) {
      const cssContent = code.slice(i + 9, j); // Skip "/* css */"
      const cssTokens = cssTokenizer.tokenize(cssContent);

      // Create a compound token for the CSS string
      current += code.slice(i, j + 1);
      return {
        tokens: [{
          type: "css-string",
          value: current,
          cssTokens
        }],
        newIndex: j + 1
      };
    }
  }

  while (i < code.length) {
    const char = code[i];

    if (char === '`') {
      current += char;
      tokens.push({ type: "template-literal", value: current });
      i++;
      break;
    }

    if (char === '$' && code[i + 1] === '{') {
      if (current) {
        tokens.push({ type: "template-literal", value: current });
      }

      // Find matching closing brace
      let braceCount = 1;
      let j = i + 2;
      while (j < code.length && braceCount > 0) {
        if (code[j] === '{') braceCount++;
        else if (code[j] === '}') braceCount--;
        j++;
      }

      const expression = code.slice(i, j);
      tokens.push({ type: "template-expression", value: expression });
      i = j;
      current = "";
      continue;
    }

    if (char === '\\') {
      current += char;
      i++;
      if (i < code.length) {
        current += code[i];
        i++;
      }
      continue;
    }

    current += char;
    i++;
  }

  return { tokens, newIndex: i };
}

function tokenizeString(code, start, quote) {
  let i = start;
  let value = "";

  value += code[i++]; // Opening quote

  // Check for CSS string
  const remainingCode = code.slice(i);
  if (remainingCode.startsWith("/* css */")) {
    // Find the closing quote
    let j = i;
    while (j < code.length && code[j] !== quote) {
      if (code[j] === '\\') j += 2;
      else j++;
    }

    if (j < code.length) {
      const cssContent = code.slice(i + 9, j); // Skip "/* css */"
      const cssTokens = cssTokenizer.tokenize(cssContent);

      // Create a compound token for the CSS string
      value += code.slice(i, j + 1);
      return {
        token: {
          type: "css-string",
          value,
          cssTokens
        },
        newIndex: j + 1
      };
    }
  }

  while (i < code.length) {
    const char = code[i];

    if (char === quote) {
      value += char;
      i++;
      break;
    }

    if (char === "\\") {
      value += char;
      i++;
      if (i < code.length) {
        value += code[i];
        i++;
      }
      continue;
    }

    value += char;
    i++;
  }

  return { token: { type: "string", value }, newIndex: i };
}

function tokenizeRegex(code, start) {
  // Note: an unescaped "/" inside a character class (e.g. /[/]/) ends the regex early.
  let i = start + 1; // Skip opening /
  let value = "/";

  while (i < code.length) {
    const char = code[i];

    if (char === '/') {
      value += char;
      i++;

      // Parse flags
      while (i < code.length && /[dgimsuvy]/.test(code[i])) {
        value += code[i];
        i++;
      }

      return { token: { type: "regex", value }, newIndex: i };
    }

    if (char === '\\') {
      value += char;
      i++;
      if (i < code.length) {
        value += code[i];
        i++;
      }
      continue;
    }

    if (char === '\n') {
      return null; // Invalid regex
    }

    value += char;
    i++;
  }

  return null; // Unterminated regex
}

function tokenizeNumber(code, start) {
  let i = start;
  let value = "";

  // Handle hex numbers
  if (code[i] === '0' && (code[i + 1] === 'x' || code[i + 1] === 'X')) {
    value += code[i++];
    value += code[i++];
    while (i < code.length && /[0-9a-fA-F]/.test(code[i])) {
      value += code[i++];
    }
    return { token: { type: "number", value }, newIndex: i };
  }

  // Handle decimal numbers
  let hasDecimal = false;
  while (i < code.length && (/\d/.test(code[i]) || (code[i] === '.' && !hasDecimal))) {
    if (code[i] === '.') hasDecimal = true;
    value += code[i++];
  }

  // Handle scientific notation
  if (i < code.length && (code[i] === 'e' || code[i] === 'E')) {
    value += code[i++];
    if (i < code.length && (code[i] === '+' || code[i] === '-')) {
      value += code[i++];
    }
    while (i < code.length && /\d/.test(code[i])) {
      value += code[i++];
    }
  }

  return { token: { type: "number", value }, newIndex: i };
}

function tokenizeIdentifier(code, start) {
  let i = start;
  let value = "";

  while (i < code.length && /[a-zA-Z0-9_$]/.test(code[i])) {
    value += code[i++];
  }

  let type = "identifier";
  if (keywords.has(value)) type = "keyword";
  else if (builtins.has(value)) type = "builtin";

  return { token: { type, value }, newIndex: i };
}

function isRegexContext(tokens) {
  // Look at the last non-whitespace token to determine if / starts a regex
  for (let i = tokens.length - 1; i >= 0; i--) {
    const token = tokens[i];
    if (token.type === "whitespace") continue;

    // Regex is likely after these tokens
    if (["operator", "keyword", "punctuation"].includes(token.type)) {
      if (token.value === ')' || token.value === ']') return false;
      return true;
    }

    // Not a regex after identifiers or numbers
    if (["identifier", "number"].includes(token.type)) return false;

    break;
  }

  return true; // Default to regex at start of input
}
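
// Example usage (an illustrative sketch; the highlight() wrapper, the CSS class
// naming, the module path, and escapeHtml() are assumptions, not something this
// module defines):
//
//   import { tokenize } from "/code/js_tokenizer.js"; // path is an assumption
//
//   function highlight(source) {
//     return tokenize(source)
//       .map(t => `<span class="tok-${t.type}">${escapeHtml(t.value)}</span>`)
//       .join("");
//   }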