// Courtesy of Claude :)

import * as cssTokenizer from "/code/css_tokenizer.js";

const keywords = new Set([
  "abstract", "arguments", "await", "boolean", "break", "byte", "case", "catch",
  "char", "class", "const", "continue", "debugger", "default", "delete", "do",
  "double", "else", "enum", "eval", "export", "extends", "false", "final",
  "finally", "float", "for", "function", "goto", "if", "implements", "import",
  "in", "instanceof", "int", "interface", "let", "long", "native", "new",
  "null", "package", "private", "protected", "public", "return", "short",
  "static", "super", "switch", "synchronized", "this", "throw", "throws",
  "transient", "true", "try", "typeof", "var", "void", "volatile", "while",
  "with", "yield", "async", "of"
]);

const builtins = new Set([
  "console", "window", "document", "Array", "Object", "String", "Number",
  "Boolean", "Date", "RegExp", "Math", "JSON", "parseInt", "parseFloat",
  "isNaN", "isFinite", "undefined", "NaN", "Infinity", "Promise", "Set",
  "Map", "WeakSet", "WeakMap", "Symbol", "Proxy", "Reflect"
]);

const operators = new Set([
  "+", "-", "*", "/", "%", "**", "++", "--", "=", "+=", "-=", "*=", "/=", "%=",
  "**=", "==", "===", "!=", "!==", "<", ">", "<=", ">=", "&&", "||", "!",
  "&", "|", "^", "~", "<<", ">>", ">>>", "?", ":", "=>", "...", "??", "??=",
  "&&=", "||=", "&=", "|=", "^=", "<<=", ">>=", ">>>="
]);

export function tokenize(code) {
  const tokens = [];
  let i = 0;

  while (i < code.length) {
    const char = code[i];

    // Whitespace
    if (/\s/.test(char)) {
      const start = i;
      while (i < code.length && /\s/.test(code[i])) i++;
      tokens.push({ type: "whitespace", value: code.slice(start, i) });
      continue;
    }

    // Single-line comment
    if (char === '/' && code[i + 1] === '/') {
      const start = i;
      while (i < code.length && code[i] !== '\n') i++;
      tokens.push({ type: "comment", value: code.slice(start, i) });
      continue;
    }

    // Multi-line comment
    if (char === '/' && code[i + 1] === '*') {
      const start = i;
      i += 2;
      while (i < code.length - 1 && !(code[i] === '*' && code[i + 1] === '/')) i++;
      if (i < code.length - 1) i += 2;
      tokens.push({ type: "comment", value: code.slice(start, i) });
      continue;
    }

    // Template literals
    if (char === '`') {
      const templateTokens = tokenizeTemplateLiteral(code, i);
      tokens.push(...templateTokens.tokens);
      i = templateTokens.newIndex;
      continue;
    }

    // Strings
    if (char === '"' || char === '\'') {
      const stringToken = tokenizeString(code, i, char);
      tokens.push(stringToken.token);
      i = stringToken.newIndex;
      continue;
    }

    // Regular expressions
    if (char === '/' && isRegexContext(tokens)) {
      const regexToken = tokenizeRegex(code, i);
      if (regexToken) {
        tokens.push(regexToken.token);
        i = regexToken.newIndex;
        continue;
      }
    }

    // Numbers
    if (/\d/.test(char) || (char === '.' && /\d/.test(code[i + 1]))) {
      const numberToken = tokenizeNumber(code, i);
      tokens.push(numberToken.token);
      i = numberToken.newIndex;
      continue;
    }

    // Identifiers and keywords
    if (/[a-zA-Z_$]/.test(char)) {
      const identifierToken = tokenizeIdentifier(code, i);
      tokens.push(identifierToken.token);
      i = identifierToken.newIndex;
      continue;
    }

    // Multi-character operators (longest match first, so e.g. ">>>=" is not split)
    const fourChar = code.slice(i, i + 4);
    const threeChar = code.slice(i, i + 3);
    const twoChar = code.slice(i, i + 2);

    if (operators.has(fourChar)) {
      tokens.push({ type: "operator", value: fourChar });
      i += 4;
      continue;
    }

    if (operators.has(threeChar)) {
      tokens.push({ type: "operator", value: threeChar });
      i += 3;
      continue;
    }

    if (operators.has(twoChar)) {
      tokens.push({ type: "operator", value: twoChar });
      i += 2;
      continue;
    }

    // Single-character operators and punctuation
    if (operators.has(char)) {
      tokens.push({ type: "operator", value: char });
      i++;
      continue;
    }

    if ("{}[]().,;".includes(char)) {
      tokens.push({ type: "punctuation", value: char });
      i++;
      continue;
    }

    // Unknown character
    tokens.push({ type: "unknown", value: char });
    i++;
  }

  return tokens;
}
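
// Illustrative example (derived from the tokenizer above, not a spec):
// whitespace and comments are kept as tokens, so the token values concatenate
// back to the original source.
//
//   tokenize("let x = 5; // five")
//   // -> [
//   //   { type: "keyword",     value: "let" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "identifier",  value: "x" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "operator",    value: "=" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "number",      value: "5" },
//   //   { type: "punctuation", value: ";" },
//   //   { type: "whitespace",  value: " " },
//   //   { type: "comment",     value: "// five" }
//   // ]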

function tokenizeTemplateLiteral(code, start) {
  const tokens = [];
  let i = start;
  let current = "";

  // Opening backtick
  current += code[i++];

  // Check for CSS string
  const remainingCode = code.slice(i);
  if (remainingCode.startsWith("/* css */")) {
    // Find the closing backtick
    let j = i;
    while (j < code.length && code[j] !== '`') {
      if (code[j] === '\\') j += 2;
      else j++;
    }

    if (j < code.length) {
      const cssContent = code.slice(i + 9, j); // Skip "/* css */"
      const cssTokens = cssTokenizer.tokenize(cssContent);

      // Create a compound token for the CSS string
      current += code.slice(i, j + 1);
      return {
        tokens: [{
          type: "css-string",
          value: current,
          cssTokens
        }],
        newIndex: j + 1
      };
    }
  }

  while (i < code.length) {
    const char = code[i];

    if (char === '`') {
      current += char;
      tokens.push({ type: "template-literal", value: current });
      current = "";
      i++;
      break;
    }

    if (char === '$' && code[i + 1] === '{') {
      if (current) {
        tokens.push({ type: "template-literal", value: current });
      }

      // Find matching closing brace
      let braceCount = 1;
      let j = i + 2;
      while (j < code.length && braceCount > 0) {
        if (code[j] === '{') braceCount++;
        else if (code[j] === '}') braceCount--;
        j++;
      }

      const expression = code.slice(i, j);
      tokens.push({ type: "template-expression", value: expression });
      i = j;
      current = "";
      continue;
    }

    if (char === '\\') {
      current += char;
      i++;
      if (i < code.length) {
        current += code[i];
        i++;
      }
      continue;
    }

    current += char;
    i++;
  }

  // Flush an unterminated template so its trailing text is not dropped
  if (current) {
    tokens.push({ type: "template-literal", value: current });
  }

  return { tokens, newIndex: i };
}
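
// Illustrative example: a template literal whose body starts with the
// "/* css */" marker is returned as a single compound token; its cssTokens
// field holds whatever cssTokenizer.tokenize returns for the embedded CSS.
//
//   tokenize("const s = `/* css */ body { margin: 0 }`;")
//   // -> [..., { type: "css-string",
//   //            value: "`/* css */ body { margin: 0 }`",
//   //            cssTokens: [/* from css_tokenizer.js */] }, ...]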

function tokenizeString(code, start, quote) {
  let i = start;
  let value = "";

  value += code[i++]; // Opening quote

  // Check for CSS string
  const remainingCode = code.slice(i);
  if (remainingCode.startsWith("/* css */")) {
    // Find the closing quote
    let j = i;
    while (j < code.length && code[j] !== quote) {
      if (code[j] === '\\') j += 2;
      else j++;
    }

    if (j < code.length) {
      const cssContent = code.slice(i + 9, j); // Skip "/* css */"
      const cssTokens = cssTokenizer.tokenize(cssContent);

      // Create a compound token for the CSS string
      value += code.slice(i, j + 1);
      return {
        token: {
          type: "css-string",
          value,
          cssTokens
        },
        newIndex: j + 1
      };
    }
  }

  while (i < code.length) {
    const char = code[i];

    if (char === quote) {
      value += char;
      i++;
      break;
    }

    if (char === "\\") {
      value += char;
      i++;
      if (i < code.length) {
        value += code[i];
        i++;
      }
      continue;
    }

    value += char;
    i++;
  }

  return { token: { type: "string", value }, newIndex: i };
}

function tokenizeRegex(code, start) {
  let i = start + 1; // Skip opening /
  let value = "/";

  while (i < code.length) {
    const char = code[i];

    if (char === '/') {
      value += char;
      i++;

      // Parse flags (d, g, i, m, s, u, v, y)
      while (i < code.length && /[dgimsuvy]/.test(code[i])) {
        value += code[i];
        i++;
      }

      return { token: { type: "regex", value }, newIndex: i };
    }

    if (char === '\\') {
      value += char;
      i++;
      if (i < code.length) {
        value += code[i];
        i++;
      }
      continue;
    }

    if (char === '\n') {
      return null; // Invalid regex
    }

    value += char;
    i++;
  }

  return null; // Unterminated regex
}

function tokenizeNumber(code, start) {
  let i = start;
  let value = "";

  // Handle hex numbers
  if (code[i] === '0' && (code[i + 1] === 'x' || code[i + 1] === 'X')) {
    value += code[i++];
    value += code[i++];
    while (i < code.length && /[0-9a-fA-F]/.test(code[i])) {
      value += code[i++];
    }
    return { token: { type: "number", value }, newIndex: i };
  }

  // Handle decimal numbers
  let hasDecimal = false;
  while (i < code.length && (/\d/.test(code[i]) || (code[i] === '.' && !hasDecimal))) {
    if (code[i] === '.') hasDecimal = true;
    value += code[i++];
  }

  // Handle scientific notation
  if (i < code.length && (code[i] === 'e' || code[i] === 'E')) {
    value += code[i++];
    if (i < code.length && (code[i] === '+' || code[i] === '-')) {
      value += code[i++];
    }
    while (i < code.length && /\d/.test(code[i])) {
      value += code[i++];
    }
  }

  return { token: { type: "number", value }, newIndex: i };
}
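
// Illustrative examples: hexadecimal and exponent forms are kept as single
// number tokens, e.g. "0xFF" -> { type: "number", value: "0xFF" } and
// "3.14e-2" -> { type: "number", value: "3.14e-2" }.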

function tokenizeIdentifier(code, start) {
  let i = start;
  let value = "";

  while (i < code.length && /[a-zA-Z0-9_$]/.test(code[i])) {
    value += code[i++];
  }

  let type = "identifier";
  if (keywords.has(value)) type = "keyword";
  else if (builtins.has(value)) type = "builtin";

  return { token: { type, value }, newIndex: i };
}

function isRegexContext(tokens) {
  // Look at the last non-whitespace token to determine if / starts a regex
  for (let i = tokens.length - 1; i >= 0; i--) {
    const token = tokens[i];
    if (token.type === "whitespace") continue;

    // Regex is likely after these tokens
    if (["operator", "keyword", "punctuation"].includes(token.type)) {
      if (token.value === ')' || token.value === ']') return false;
      return true;
    }

    // Not a regex after identifiers or numbers
    if (["identifier", "number"].includes(token.type)) return false;

    break;
  }

  return true; // Default to regex at start of input
}
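
// Illustrative example: the '/' in "a / b" follows an identifier, so
// isRegexContext() returns false and the slash is emitted as a division
// operator; the '/' in "x = /ab+c/g" follows the "=" operator, so it is
// handed to tokenizeRegex() and emitted as { type: "regex", value: "/ab+c/g" }.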