// Courtesy of Claude :)

import * as cssTokenizer from "/code/css_tokenizer.js";

const keywords = new Set([
  "abstract", "arguments", "await", "boolean", "break", "byte", "case", "catch",
  "char", "class", "const", "continue", "debugger", "default", "delete", "do",
  "double", "else", "enum", "eval", "export", "extends", "false", "final",
  "finally", "float", "for", "function", "goto", "if", "implements", "import",
  "in", "instanceof", "int", "interface", "let", "long", "native", "new",
  "null", "package", "private", "protected", "public", "return", "short",
  "static", "super", "switch", "synchronized", "this", "throw", "throws",
  "transient", "true", "try", "typeof", "var", "void", "volatile", "while",
  "with", "yield", "async", "of"
]);

const builtins = new Set([
  "console", "window", "document", "Array", "Object", "String", "Number",
  "Boolean", "Date", "RegExp", "Math", "JSON", "parseInt", "parseFloat",
  "isNaN", "isFinite", "undefined", "NaN", "Infinity", "Promise", "Set",
  "Map", "WeakSet", "WeakMap", "Symbol", "Proxy", "Reflect"
]);

const operators = new Set([
  "+", "-", "*", "/", "%", "**", "++", "--", "=", "+=", "-=", "*=", "/=", "%=",
  "**=", "==", "===", "!=", "!==", "<", ">", "<=", ">=", "&&", "||", "!",
  "&", "|", "^", "~", "<<", ">>", ">>>", "?", ":", "=>", "...", "??", "??=",
  "&&=", "||=", "&=", "|=", "^=", "<<=", ">>=", ">>>="
]);

export function tokenize(code) {
  const tokens = [];
  let i = 0;

  while (i < code.length) {
    const char = code[i];

    // Whitespace
    if (/\s/.test(char)) {
      const start = i;
      while (i < code.length && /\s/.test(code[i])) i++;
      tokens.push({ type: "whitespace", value: code.slice(start, i) });
      continue;
    }

    // Single-line comment
    if (char === '/' && code[i + 1] === '/') {
      const start = i;
      while (i < code.length && code[i] !== '\n') i++;
      tokens.push({ type: "comment", value: code.slice(start, i) });
      continue;
    }

    // Multi-line comment
    if (char === '/' && code[i + 1] === '*') {
      const start = i;
      i += 2;
      while (i < code.length - 1 && !(code[i] === '*' && code[i + 1] === '/')) i++;
      if (i < code.length - 1) i += 2;
      tokens.push({ type: "comment", value: code.slice(start, i) });
      continue;
    }

    // Template literals
    if (char === '`') {
      const templateTokens = tokenizeTemplateLiteral(code, i);
      tokens.push(...templateTokens.tokens);
      i = templateTokens.newIndex;
      continue;
    }

    // Strings
    if (char === '"' || char === '\'') {
      const stringToken = tokenizeString(code, i, char);
      tokens.push(stringToken.token);
      i = stringToken.newIndex;
      continue;
    }

    // Regular expressions
    if (char === '/' && isRegexContext(tokens)) {
      const regexToken = tokenizeRegex(code, i);
      if (regexToken) {
        tokens.push(regexToken.token);
        i = regexToken.newIndex;
        continue;
      }
    }

    // Numbers
    if (/\d/.test(char) || (char === '.' && /\d/.test(code[i + 1]))) {
      const numberToken = tokenizeNumber(code, i);
      tokens.push(numberToken.token);
      i = numberToken.newIndex;
      continue;
    }

    // Identifiers and keywords
    if (/[a-zA-Z_$]/.test(char)) {
      const identifierToken = tokenizeIdentifier(code, i);
      tokens.push(identifierToken.token);
      i = identifierToken.newIndex;
      continue;
    }

    // Multi-character operators (longest match first, so e.g. ">>>=" is not split)
    const fourChar = code.slice(i, i + 4);
    const threeChar = code.slice(i, i + 3);
    const twoChar = code.slice(i, i + 2);

    if (operators.has(fourChar)) {
      tokens.push({ type: "operator", value: fourChar });
      i += 4;
      continue;
    }

    if (operators.has(threeChar)) {
      tokens.push({ type: "operator", value: threeChar });
      i += 3;
      continue;
    }

    if (operators.has(twoChar)) {
      tokens.push({ type: "operator", value: twoChar });
      i += 2;
      continue;
    }

    // Single-character operators and punctuation
    if (operators.has(char)) {
      tokens.push({ type: "operator", value: char });
      i++;
      continue;
    }

    if ("{}[]().,;".includes(char)) {
      tokens.push({ type: "punctuation", value: char });
      i++;
      continue;
    }

    // Unknown character
    tokens.push({ type: "unknown", value: char });
    i++;
  }

  return tokens;
}
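
// Illustrative example (derived from the tokenizer above, not a spec):
// whitespace and comments are kept as tokens, so the token values concatenate
// back to the original source.
//
//   tokenize("let x = 5; // five")
//   // -> [
//   //   { type: "keyword",     value: "let" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "identifier",  value: "x" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "operator",    value: "=" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "number",      value: "5" },
//   //   { type: "punctuation", value: ";" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "comment",     value: "// five" }
//   // ]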

function tokenizeTemplateLiteral(code, start) {
  const tokens = [];
  let i = start;
  let current = "";

  // Opening backtick
  current += code[i++];

  // Check for CSS string
  const remainingCode = code.slice(i);
  if (remainingCode.startsWith("/* css */")) {
    // Find the closing backtick
    let j = i;
    while (j < code.length && code[j] !== '`') {
      if (code[j] === '\\') j += 2;
      else j++;
    }

    if (j < code.length) {
      const cssContent = code.slice(i + 9, j); // Skip "/* css */"
      const cssTokens = cssTokenizer.tokenize(cssContent);

      // Create a compound token for the CSS string
      current += code.slice(i, j + 1);
      return {
        tokens: [{
          type: "css-string",
          value: current,
          cssTokens
        }],
        newIndex: j + 1
      };
    }
  }

  while (i < code.length) {
    const char = code[i];

    if (char === '`') {
      current += char;
      tokens.push({ type: "template-literal", value: current });
      current = "";
      i++;
      break;
    }

    if (char === '$' && code[i + 1] === '{') {
      if (current) {
        tokens.push({ type: "template-literal", value: current });
      }

      // Find matching closing brace
      let braceCount = 1;
      let j = i + 2;
      while (j < code.length && braceCount > 0) {
        if (code[j] === '{') braceCount++;
        else if (code[j] === '}') braceCount--;
        j++;
      }

      const expression = code.slice(i, j);
      tokens.push({ type: "template-expression", value: expression });
      i = j;
      current = "";
      continue;
    }

    if (char === '\\') {
      current += char;
      i++;
      if (i < code.length) {
        current += code[i];
        i++;
      }
      continue;
    }

    current += char;
    i++;
  }

  // Flush an unterminated template so its trailing text is not dropped
  if (current) {
    tokens.push({ type: "template-literal", value: current });
  }

  return { tokens, newIndex: i };
}
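
// Illustrative example: a template literal whose body starts with the
// "/* css */" marker is returned as a single compound token; its cssTokens
// field holds whatever cssTokenizer.tokenize returns for the embedded CSS.
//
//   tokenize("const s = `/* css */ body { margin: 0 }`;")
//   // -> [..., { type: "css-string",
//   //            value: "`/* css */ body { margin: 0 }`",
//   //            cssTokens: [/* from css_tokenizer.js */] }, ...]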

function tokenizeString(code, start, quote) {
  let i = start;
  let value = "";

  value += code[i++]; // Opening quote

  // Check for CSS string
  const remainingCode = code.slice(i);
  if (remainingCode.startsWith("/* css */")) {
    // Find the closing quote
    let j = i;
    while (j < code.length && code[j] !== quote) {
      if (code[j] === '\\') j += 2;
      else j++;
    }

    if (j < code.length) {
      const cssContent = code.slice(i + 9, j); // Skip "/* css */"
      const cssTokens = cssTokenizer.tokenize(cssContent);

      // Create a compound token for the CSS string
      value += code.slice(i, j + 1);
      return {
        token: {
          type: "css-string",
          value,
          cssTokens
        },
        newIndex: j + 1
      };
    }
  }

  while (i < code.length) {
    const char = code[i];

    if (char === quote) {
      value += char;
      i++;
      break;
    }

    if (char === "\\") {
      value += char;
      i++;
      if (i < code.length) {
        value += code[i];
        i++;
      }
      continue;
    }

    value += char;
    i++;
  }

  return { token: { type: "string", value }, newIndex: i };
}

function tokenizeRegex(code, start) {
  let i = start + 1; // Skip opening /
  let value = "/";

  while (i < code.length) {
    const char = code[i];

    if (char === '/') {
      value += char;
      i++;

      // Parse flags (d, g, i, m, s, u, v, y)
      while (i < code.length && /[dgimsuvy]/.test(code[i])) {
        value += code[i];
        i++;
      }

      return { token: { type: "regex", value }, newIndex: i };
    }

    if (char === '\\') {
      value += char;
      i++;
      if (i < code.length) {
        value += code[i];
        i++;
      }
      continue;
    }

    if (char === '\n') {
      return null; // Invalid regex
    }

    value += char;
    i++;
  }

  return null; // Unterminated regex
}

function tokenizeNumber(code, start) {
  let i = start;
  let value = "";

  // Handle hex numbers
  if (code[i] === '0' && (code[i + 1] === 'x' || code[i + 1] === 'X')) {
    value += code[i++];
    value += code[i++];
    while (i < code.length && /[0-9a-fA-F]/.test(code[i])) {
      value += code[i++];
    }
    return { token: { type: "number", value }, newIndex: i };
  }

  // Handle decimal numbers
  let hasDecimal = false;
  while (i < code.length && (/\d/.test(code[i]) || (code[i] === '.' && !hasDecimal))) {
    if (code[i] === '.') hasDecimal = true;
    value += code[i++];
  }

  // Handle scientific notation
  if (i < code.length && (code[i] === 'e' || code[i] === 'E')) {
    value += code[i++];
    if (i < code.length && (code[i] === '+' || code[i] === '-')) {
      value += code[i++];
    }
    while (i < code.length && /\d/.test(code[i])) {
      value += code[i++];
    }
  }

  return { token: { type: "number", value }, newIndex: i };
}
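
// Illustrative examples: hexadecimal and exponent forms are kept as single
// number tokens, e.g. "0xFF" -> { type: "number", value: "0xFF" } and
// "3.14e-2" -> { type: "number", value: "3.14e-2" }.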

function tokenizeIdentifier(code, start) {
  let i = start;
  let value = "";

  while (i < code.length && /[a-zA-Z0-9_$]/.test(code[i])) {
    value += code[i++];
  }

  let type = "identifier";
  if (keywords.has(value)) type = "keyword";
  else if (builtins.has(value)) type = "builtin";

  return { token: { type, value }, newIndex: i };
}

function isRegexContext(tokens) {
  // Look at the last non-whitespace token to determine if / starts a regex
  for (let i = tokens.length - 1; i >= 0; i--) {
    const token = tokens[i];
    if (token.type === "whitespace") continue;

    // Regex is likely after these tokens
    if (["operator", "keyword", "punctuation"].includes(token.type)) {
      if (token.value === ')' || token.value === ']') return false;
      return true;
    }

    // Not a regex after identifiers or numbers
    if (["identifier", "number"].includes(token.type)) return false;

    break;
  }

  return true; // Default to regex at start of input
}
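
// Illustrative example: the '/' in "a / b" follows an identifier, so
// isRegexContext() returns false and the slash is emitted as a division
// operator; the '/' in "x = /ab+c/g" follows the "=" operator, so it is
// handed to tokenizeRegex() and emitted as { type: "regex", value: "/ab+c/g" }.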