this repo has no description
1import { ParseResult } from './ParseResult.ts'
2import type {
3 Entity,
4 EntitySchema,
5 ParserConfig,
6 PrimitiveType,
7} from './types.ts'
8
/**
 * TaggedStringParser extracts tagged entities from strings.
 *
 * Supports two parsing modes:
 * - Delimited mode: extract entities surrounded by delimiters (e.g. `[key:value]`)
 * - Delimiter-free mode: extract `key<separator>value` tokens from free text,
 *   where `<separator>` is the configured type separator (e.g. `key=value`
 *   when `typeSeparator` is `'='`)
 *
 * Features:
 * - Configurable delimiters and type separator (both may be multi-character)
 * - Schema-based type parsing with custom formatters
 * - Automatic type inference (string, number, boolean)
 * - Quoted strings with escape sequences (\", \\)
 * - Lenient error handling (skips malformed entities)
 */
export class TaggedStringParser {
  /** Matches one whitespace character; hoisted so scan loops do not rebuild it. */
  private static readonly WHITESPACE = /\s/

  /** Matches integers and decimals with an optional leading minus sign. */
  private static readonly NUMERIC = /^-?\d+(\.\d+)?$/

  private readonly openDelimiter: string
  private readonly closeDelimiter: string
  private readonly typeSeparator: string
  private readonly schema?: EntitySchema
  private readonly isDelimiterFree: boolean

  /**
   * Create a new TaggedStringParser with optional configuration
   * @param config - Parser configuration options
   * @param config.delimiters - Unified delimiter configuration:
   *   - `false` or `[]` enables delimiter-free mode
   *   - `[open, close]` uses specified delimiters (e.g., ['{{', '}}'])
   *   - If omitted, uses openDelimiter and closeDelimiter options
   * @param config.openDelimiter - Opening tag delimiter (default: '[', legacy option)
   * @param config.closeDelimiter - Closing tag delimiter (default: ']', legacy option)
   * @param config.typeSeparator - Separator between type and value (default: ':')
   * @param config.schema - Entity type definitions with optional formatters
   * @throws Error if configuration is invalid
   */
  constructor(config?: ParserConfig) {
    const delimiters = config?.delimiters

    if (delimiters === undefined) {
      // Backward compatible: fall back to the individual delimiter options
      this.isDelimiterFree = false
      this.openDelimiter = config?.openDelimiter ?? '['
      this.closeDelimiter = config?.closeDelimiter ?? ']'
    } else if (
      delimiters === false ||
      (Array.isArray(delimiters) && delimiters.length === 0)
    ) {
      // Delimiter-free mode
      this.isDelimiterFree = true
      this.openDelimiter = ''
      this.closeDelimiter = ''
    } else if (Array.isArray(delimiters) && delimiters.length === 2) {
      // Delimited mode with explicitly specified delimiters
      this.isDelimiterFree = false
      this.openDelimiter = delimiters[0]
      this.closeDelimiter = delimiters[1]
    } else {
      throw new Error('Invalid delimiters configuration')
    }

    this.typeSeparator = config?.typeSeparator ?? ':'
    this.schema = config?.schema

    this.validateConfig()
  }

  /**
   * Validate parser configuration
   * @throws Error if the type separator is empty, or (in delimited mode) if a
   *   delimiter is empty or both delimiters are identical
   */
  private validateConfig(): void {
    // The separator is needed in both modes; an empty one can never split a
    // key from its value, so reject it up front instead of parsing nothing.
    if (this.typeSeparator === '') {
      throw new Error('Type separator cannot be empty')
    }

    // Delimiters are intentionally empty in delimiter-free mode
    if (this.isDelimiterFree) {
      return
    }

    if (this.openDelimiter === '') {
      throw new Error('Open delimiter cannot be empty')
    }
    if (this.closeDelimiter === '') {
      throw new Error('Close delimiter cannot be empty')
    }
    if (this.openDelimiter === this.closeDelimiter) {
      throw new Error('Open and close delimiters cannot be the same')
    }
  }

  /**
   * Parse a string and extract all tagged entities
   * @param message - The string to parse
   * @returns ParseResult containing original message and extracted entities
   */
  parse(message: string): ParseResult {
    if (message === '') {
      return new ParseResult(message, [])
    }

    return this.isDelimiterFree
      ? this.parseDelimiterFree(message)
      : this.parseDelimited(message)
  }

  /**
   * Parse a string in delimited mode. Closing delimiters that appear inside a
   * double-quoted section of a tag do not terminate the tag.
   * @param message - The string to parse
   * @returns ParseResult containing original message and extracted entities
   */
  private parseDelimited(message: string): ParseResult {
    const entities: Entity[] = []
    let pos = 0

    while (pos < message.length) {
      const openIndex = message.indexOf(this.openDelimiter, pos)
      if (openIndex === -1) {
        // No more tags
        break
      }

      const contentStart = openIndex + this.openDelimiter.length
      const closeIndex = this.findCloseDelimiter(message, contentStart)
      if (closeIndex === -1) {
        // Unterminated tag: skip just this opening delimiter so that a later,
        // well-formed tag can still be found.
        pos = contentStart
        continue
      }

      const endPosition = closeIndex + this.closeDelimiter.length
      const tagContent = message.substring(contentStart, closeIndex).trim()

      // Empty tags (e.g. "[]" or "[  ]") are ignored
      if (tagContent !== '') {
        const entity = this.processTag(tagContent, openIndex, endPosition)
        if (entity) {
          entities.push(entity)
        }
      }

      pos = endPosition
    }

    return new ParseResult(message, entities, this.closeDelimiter)
  }

  /**
   * Locate the closing delimiter of a tag, ignoring occurrences inside quotes
   * @param message - The string being parsed
   * @param contentStart - Index of the first character after the opening delimiter
   * @returns Index where the closing delimiter starts, or -1 if the tag is unterminated
   */
  private findCloseDelimiter(message: string, contentStart: number): number {
    let inQuote = false

    for (let i = contentStart; i < message.length; i++) {
      if (message[i] === '"') {
        // A quote preceded by an odd number of backslashes is escaped and
        // does not open or close a quoted section.
        let backslashes = 0
        for (let j = i - 1; j >= contentStart && message[j] === '\\'; j--) {
          backslashes++
        }
        if (backslashes % 2 === 0) {
          inQuote = !inQuote
        }
      } else if (!inQuote && message.startsWith(this.closeDelimiter, i)) {
        return i
      }
    }

    return -1
  }

  /**
   * Parse a string in delimiter-free mode, extracting
   * `key<typeSeparator>value` tokens. Keys and values may be double-quoted.
   * @param message - The string to parse
   * @returns ParseResult containing original message and extracted entities
   */
  private parseDelimiterFree(message: string): ParseResult {
    const entities: Entity[] = []
    const separator = this.typeSeparator
    let pos = 0

    while (pos < message.length) {
      // Skip whitespace between tokens
      while (
        pos < message.length &&
        TaggedStringParser.WHITESPACE.test(message.charAt(pos))
      ) {
        pos++
      }
      if (pos >= message.length) {
        break
      }

      // Extract key (quoted or unquoted)
      const keyStart = pos
      let key: string
      let keyEnd: number

      if (message[pos] === '"') {
        const quotedKey = this.extractQuotedString(message, pos)
        if (!quotedKey) {
          // Unterminated quote - skip the quote character and keep scanning
          pos++
          continue
        }
        key = quotedKey.content
        keyEnd = quotedKey.endPosition
      } else {
        const bareKey = this.extractUnquotedToken(message, pos, [separator])
        if (bareKey.content === '') {
          // The token starts with the separator itself - step over it
          pos += separator.length
          continue
        }
        key = bareKey.content
        keyEnd = bareKey.endPosition
      }

      if (!message.startsWith(separator, keyEnd)) {
        // Plain word, not an entity. Resume exactly at keyEnd: the key token
        // consumed at least one character, so the scan always makes progress
        // and no character following a quoted word is dropped.
        pos = keyEnd
        continue
      }

      // Extract value (quoted or unquoted)
      const valueStart = keyEnd + separator.length
      let value: string
      let valueEnd: number

      if (message[valueStart] === '"') {
        const quotedValue = this.extractQuotedString(message, valueStart)
        if (!quotedValue) {
          // Unterminated quoted value - drop this entity
          pos = valueStart
          continue
        }
        value = quotedValue.content
        valueEnd = quotedValue.endPosition
      } else {
        const bareValue = this.extractUnquotedToken(message, valueStart, [])
        if (bareValue.content === '') {
          // Separator with nothing after it - drop this entity
          pos = valueStart
          continue
        }
        value = bareValue.content
        valueEnd = bareValue.endPosition
      }

      entities.push(this.createEntity(key, value, keyStart, valueEnd))
      pos = valueEnd
    }

    return new ParseResult(message, entities, this.closeDelimiter)
  }

  /**
   * Process a tag's content and create an Entity
   * @param tagContent - The trimmed, non-empty content between delimiters
   * @param position - The position of the tag in the original message
   * @param endPosition - The position after the closing delimiter
   * @returns Entity or null if tag is malformed
   */
  private processTag(
    tagContent: string,
    position: number,
    endPosition: number,
  ): Entity | null {
    const separator = this.typeSeparator
    let type: string
    let separatorIndex: number

    if (tagContent[0] === '"') {
      // Quoted key: the separator must follow the closing quote directly
      const quotedKey = this.extractQuotedString(tagContent, 0)
      if (!quotedKey) {
        return null
      }
      type = quotedKey.content
      separatorIndex = quotedKey.endPosition
      if (!tagContent.startsWith(separator, separatorIndex)) {
        return null
      }
    } else {
      separatorIndex = tagContent.indexOf(separator)
      if (separatorIndex === -1) {
        // No separator - the whole content is a value with an empty type
        return this.createEntity('', tagContent, position, endPosition)
      }
      type = tagContent.substring(0, separatorIndex)
    }

    // Skip the whole separator, which may be longer than one character
    const valueStart = separatorIndex + separator.length
    let value: string

    if (tagContent[valueStart] === '"') {
      const quotedValue = this.extractQuotedString(tagContent, valueStart)
      if (!quotedValue) {
        // Malformed quoted value - skip this tag
        return null
      }
      value = quotedValue.content
    } else {
      // Unquoted value - rest of the content
      value = tagContent.substring(valueStart)
    }

    return this.createEntity(type, value, position, endPosition)
  }

  /**
   * Build an Entity by parsing and formatting a raw value
   * @param type - The entity type (key); may be empty
   * @param value - The raw string value
   * @param position - Start index of the entity in the original message
   * @param endPosition - Index just past the entity in the original message
   * @returns The assembled Entity
   */
  private createEntity(
    type: string,
    value: string,
    position: number,
    endPosition: number,
  ): Entity {
    const { parsedValue, inferredType } = this.parseValue(type, value)
    const formattedValue = this.applyFormatter(type, parsedValue)

    return {
      type,
      value,
      parsedValue,
      formattedValue,
      inferredType,
      position,
      endPosition,
    }
  }

  /**
   * Look up the schema entry declared for an entity type
   *
   * Only the schema's own properties are considered, so entity types that
   * happen to share a name with an inherited Object.prototype member
   * (e.g. "constructor", "toString") are not mistaken for schema entries.
   * @param type - The entity type
   * @returns The schema entry, or undefined if the type is not declared
   */
  private getSchemaEntry(type: string) {
    if (
      this.schema &&
      Object.prototype.hasOwnProperty.call(this.schema, type)
    ) {
      return this.schema[type]
    }
    return undefined
  }

  /**
   * Infer the primitive type from a raw string value
   * @param value - The raw string value
   * @returns The inferred primitive type
   */
  private inferType(value: string): PrimitiveType {
    if (TaggedStringParser.NUMERIC.test(value)) {
      return 'number'
    }

    // Booleans are recognised case-insensitively
    const lowerValue = value.toLowerCase()
    if (lowerValue === 'true' || lowerValue === 'false') {
      return 'boolean'
    }

    return 'string'
  }

  /**
   * Parse a value using schema (if available) or type inference
   * @param type - The entity type
   * @param rawValue - The raw string value
   * @returns Object with parsedValue and inferredType
   */
  private parseValue(
    type: string,
    rawValue: string,
  ): {
    parsedValue: string | number | boolean
    inferredType: PrimitiveType
  } {
    const schemaEntry = this.getSchemaEntry(type)
    let targetType: PrimitiveType

    if (schemaEntry !== undefined) {
      // Handle both shorthand (just the type name) and full definition
      targetType =
        typeof schemaEntry === 'string' ? schemaEntry : schemaEntry.type
    } else {
      // Use inference for types the schema does not declare
      targetType = this.inferType(rawValue)
    }

    let parsedValue: string | number | boolean

    switch (targetType) {
      case 'number':
        parsedValue = parseFloat(rawValue)
        break
      case 'boolean':
        parsedValue = rawValue.toLowerCase() === 'true'
        break
      case 'string':
        parsedValue = rawValue
        break
      default:
        // Unrecognised schema type at runtime - keep the raw string
        parsedValue = rawValue
        break
    }

    return {
      parsedValue,
      inferredType: targetType,
    }
  }

  /**
   * Apply formatter function to a parsed value
   * @param type - The entity type
   * @param parsedValue - The parsed value
   * @returns Formatted string
   */
  private applyFormatter(
    type: string,
    parsedValue: string | number | boolean,
  ): string {
    const schemaEntry = this.getSchemaEntry(type)

    // Only a full entity definition (not the shorthand string) can carry a formatter
    if (
      schemaEntry !== undefined &&
      typeof schemaEntry !== 'string' &&
      schemaEntry.format
    ) {
      return schemaEntry.format(parsedValue)
    }

    // No formatter - convert to string
    return String(parsedValue)
  }

  /**
   * Extract a quoted string starting at the given position
   * Processes escape sequences: \" becomes " and \\ becomes \
   * @param message - The string to extract from
   * @param startPos - The position of the opening quote
   * @returns Object with content and endPosition (index just past the closing
   *   quote), or null if there is no opening quote at startPos or the quote
   *   is never closed
   */
  private extractQuotedString(
    message: string,
    startPos: number,
  ): { content: string; endPosition: number } | null {
    if (message[startPos] !== '"') {
      return null
    }

    let content = ''
    let pos = startPos + 1

    while (pos < message.length) {
      const char = message.charAt(pos)

      if (char === '"') {
        // Found closing quote
        return { content, endPosition: pos + 1 }
      }

      if (char === '\\') {
        // charAt yields '' past the end, so a trailing backslash falls
        // through and is kept as a literal character.
        const next = message.charAt(pos + 1)
        if (next === '"' || next === '\\') {
          content += next
          pos += 2
          continue
        }
      }

      content += char
      pos += 1
    }

    // Reached end of string without finding closing quote
    return null
  }

  /**
   * Extract an unquoted token starting at the given position
   * Stops at whitespace or at the start of any of the specified stop strings
   * @param message - The string to extract from
   * @param startPos - The position to start extraction
   * @param stopChars - Strings (one or more characters each) that end the
   *   token; empty strings are ignored
   * @returns Object with content and endPosition
   */
  private extractUnquotedToken(
    message: string,
    startPos: number,
    stopChars: string[],
  ): { content: string; endPosition: number } {
    const stops = stopChars.filter((stop) => stop !== '')
    const hitsStop = (at: number): boolean =>
      stops.some((stop) => message.startsWith(stop, at))

    let pos = startPos
    while (
      pos < message.length &&
      !TaggedStringParser.WHITESPACE.test(message.charAt(pos)) &&
      !hitsStop(pos)
    ) {
      pos += 1
    }

    return { content: message.substring(startPos, pos), endPosition: pos }
  }
}