this repo has no description
at main 583 lines 17 kB view raw
import { ParseResult } from './ParseResult.ts'
import type {
  Entity,
  EntitySchema,
  ParserConfig,
  PrimitiveType,
} from './types.ts'

/**
 * TaggedStringParser extracts tagged entities from strings
 *
 * Supports two parsing modes:
 * - Delimited mode: Extract entities surrounded by delimiters (e.g., [key:value])
 * - Delimiter-free mode: Extract key<separator>value patterns (key:value with
 *   the default type separator) from natural language strings
 *
 * Features:
 * - Configurable delimiters and type separators (separators may be longer
 *   than one character, e.g. '::' or '=>')
 * - Schema-based type parsing with custom formatters
 * - Automatic type inference (string, number, boolean)
 * - Quoted strings with escape sequences (\", \\)
 * - Lenient error handling (skips malformed entities)
 */
export class TaggedStringParser {
  /** Hoisted so the scanning loops do not allocate a regex per character. */
  private static readonly WHITESPACE = /\s/

  private readonly openDelimiter: string
  private readonly closeDelimiter: string
  private readonly typeSeparator: string
  private readonly schema?: EntitySchema
  private readonly isDelimiterFree: boolean

  /**
   * Create a new TaggedStringParser with optional configuration
   * @param config - Parser configuration options
   * @param config.delimiters - Unified delimiter configuration:
   *   - `false` or `[]` enables delimiter-free mode
   *   - `[open, close]` uses specified delimiters (e.g., ['{{', '}}'])
   *   - If omitted, uses openDelimiter and closeDelimiter options
   * @param config.openDelimiter - Opening tag delimiter (default: '[', legacy option)
   * @param config.closeDelimiter - Closing tag delimiter (default: ']', legacy option)
   * @param config.typeSeparator - Separator between type and value (default: ':').
   *   An empty separator never matches, so no typed entity is ever produced.
   * @param config.schema - Entity type definitions with optional formatters
   * @throws Error if configuration is invalid
   */
  constructor(config?: ParserConfig) {
    if (config?.delimiters !== undefined) {
      // The unified `delimiters` option takes precedence over the legacy pair
      const delimiters = config.delimiters
      if (
        delimiters === false ||
        (Array.isArray(delimiters) && delimiters.length === 0)
      ) {
        // Delimiter-free mode
        this.isDelimiterFree = true
        this.openDelimiter = ''
        this.closeDelimiter = ''
      } else if (Array.isArray(delimiters) && delimiters.length === 2) {
        // Delimited mode with explicitly specified delimiters
        this.isDelimiterFree = false
        this.openDelimiter = delimiters[0]
        this.closeDelimiter = delimiters[1]
      } else {
        throw new Error('Invalid delimiters configuration')
      }
    } else {
      // Backward compatible: use individual delimiter options
      this.isDelimiterFree = false
      this.openDelimiter = config?.openDelimiter ?? '['
      this.closeDelimiter = config?.closeDelimiter ?? ']'
    }

    this.typeSeparator = config?.typeSeparator ?? ':'
    this.schema = config?.schema

    this.validateConfig()
  }

  /**
   * Validate parser configuration
   * @throws Error if, in delimited mode, a delimiter is empty or both
   *   delimiters are identical
   */
  private validateConfig(): void {
    // Delimiters are unused in delimiter-free mode, so there is nothing to check
    if (this.isDelimiterFree) {
      return
    }

    if (this.openDelimiter === '') {
      throw new Error('Open delimiter cannot be empty')
    }
    if (this.closeDelimiter === '') {
      throw new Error('Close delimiter cannot be empty')
    }
    if (this.openDelimiter === this.closeDelimiter) {
      throw new Error('Open and close delimiters cannot be the same')
    }
  }

  /**
   * Parse a string and extract all tagged entities
   * @param message - The string to parse
   * @returns ParseResult containing original message and extracted entities
   */
  parse(message: string): ParseResult {
    if (message === '') {
      return new ParseResult(message, [])
    }

    return this.isDelimiterFree
      ? this.parseDelimiterFree(message)
      : this.parseDelimited(message)
  }

  /**
   * Parse a string in delimited mode, extracting tags with proper quoted string handling
   *
   * An opening delimiter without a matching closing delimiter is skipped and
   * scanning resumes right after it. Tags whose trimmed content is empty are
   * ignored.
   * @param message - The string to parse
   * @returns ParseResult containing original message and extracted entities
   */
  private parseDelimited(message: string): ParseResult {
    const entities: Entity[] = []
    let pos = 0

    while (pos < message.length) {
      const openIndex = message.indexOf(this.openDelimiter, pos)
      if (openIndex === -1) {
        // No more tags
        break
      }

      const contentStart = openIndex + this.openDelimiter.length
      const closeIndex = this.findClosingDelimiter(message, contentStart)

      if (closeIndex === -1) {
        // Unterminated tag: skip only this opening delimiter so that a later
        // opening delimiter still gets its own chance to match
        pos = contentStart
        continue
      }

      const tagEnd = closeIndex + this.closeDelimiter.length
      const tagContent = message.substring(contentStart, closeIndex).trim()

      if (tagContent !== '') {
        const entity = this.processTag(tagContent, openIndex, tagEnd)
        if (entity) {
          entities.push(entity)
        }
      }

      // Move past this tag
      pos = tagEnd
    }

    return new ParseResult(message, entities, this.closeDelimiter)
  }

  /**
   * Find the closing delimiter that terminates the tag starting at contentStart
   *
   * Closing delimiters inside double-quoted sections are ignored. A quote
   * preceded by an odd number of backslashes (counted back no further than
   * contentStart) is treated as escaped and does not toggle the quote state.
   * @param message - The full string being parsed
   * @param contentStart - Index of the first character after the opening delimiter
   * @returns Index of the closing delimiter, or -1 if the tag is unterminated
   */
  private findClosingDelimiter(message: string, contentStart: number): number {
    let inQuote = false

    for (let index = contentStart; index < message.length; index++) {
      if (message[index] === '"') {
        // Count the run of backslashes immediately before the quote
        let backslashCount = 0
        let checkPos = index - 1
        while (checkPos >= contentStart && message[checkPos] === '\\') {
          backslashCount++
          checkPos--
        }
        // Even count (including zero): the quote is real and toggles state
        if (backslashCount % 2 === 0) {
          inQuote = !inQuote
        }
      } else if (!inQuote && message.startsWith(this.closeDelimiter, index)) {
        return index
      }
    }

    return -1
  }

  /**
   * Parse a string in delimiter-free mode, extracting key<separator>value patterns
   *
   * Keys and values may each be quoted or unquoted. Tokens that are not
   * followed by the type separator, and keys with a missing or malformed
   * value, are skipped.
   * @param message - The string to parse
   * @returns ParseResult containing original message and extracted entities
   */
  private parseDelimiterFree(message: string): ParseResult {
    const entities: Entity[] = []
    let pos = 0

    while (pos < message.length) {
      // Skip whitespace between tokens
      while (
        pos < message.length &&
        TaggedStringParser.WHITESPACE.test(message.charAt(pos))
      ) {
        pos++
      }

      if (pos >= message.length) {
        break
      }

      // Try to extract key (quoted or unquoted)
      const keyStart = pos
      let key: string
      let keyEnd: number

      if (message[pos] === '"') {
        const quotedKey = this.extractQuotedString(message, pos)
        if (!quotedKey) {
          // Malformed quoted key - skip this character and continue
          pos++
          continue
        }
        key = quotedKey.content
        keyEnd = quotedKey.endPosition
      } else {
        // Unquoted key - extract until separator or whitespace
        const unquotedKey = this.extractUnquotedToken(message, pos, [
          this.typeSeparator,
        ])
        if (unquotedKey.content === '') {
          // No key found - advance and continue
          pos++
          continue
        }
        key = unquotedKey.content
        keyEnd = unquotedKey.endPosition
      }

      if (!this.hasSeparatorAt(message, keyEnd)) {
        // No separator - this isn't an entity, continue from after the key
        pos = keyEnd + 1
        continue
      }

      // Advance past the whole separator, which may be several characters long
      pos = keyEnd + this.typeSeparator.length

      // Try to extract value (quoted or unquoted)
      let value: string
      let valueEnd: number

      if (pos < message.length && message[pos] === '"') {
        const quotedValue = this.extractQuotedString(message, pos)
        if (!quotedValue) {
          // Malformed quoted value - drop this entity; scanning resumes at
          // the unmatched quote, which the next iteration steps over
          continue
        }
        value = quotedValue.content
        valueEnd = quotedValue.endPosition
      } else {
        // Unquoted value - extract until whitespace
        const unquotedValue = this.extractUnquotedToken(message, pos, [])
        if (unquotedValue.content === '') {
          // No value found - drop this entity
          continue
        }
        value = unquotedValue.content
        valueEnd = unquotedValue.endPosition
      }

      entities.push(this.buildEntity(key, value, keyStart, valueEnd))

      // Update position to continue scanning
      pos = valueEnd
    }

    return new ParseResult(message, entities, this.closeDelimiter)
  }

  /**
   * Process a tag's content and create an Entity
   *
   * Content without any type separator (and not starting with a quote)
   * becomes an entity with an empty type whose value is the whole content.
   * @param tagContent - The trimmed content between delimiters
   * @param position - The position of the tag in the original message
   * @param endPosition - The position after the closing delimiter
   * @returns Entity or null if tag is malformed
   */
  private processTag(
    tagContent: string,
    position: number,
    endPosition: number,
  ): Entity | null {
    let type: string
    let pos: number

    // Extract type (key) - can be quoted or unquoted
    if (tagContent[0] === '"') {
      const quotedKey = this.extractQuotedString(tagContent, 0)
      if (!quotedKey) {
        // Malformed quoted key - skip this tag
        return null
      }
      type = quotedKey.content
      pos = quotedKey.endPosition
    } else {
      const separatorIndex = tagContent.indexOf(this.typeSeparator)
      if (separatorIndex === -1) {
        // No separator - treat entire content as value with empty type
        return this.buildEntity('', tagContent, position, endPosition)
      }
      type = tagContent.substring(0, separatorIndex)
      pos = separatorIndex
    }

    if (!this.hasSeparatorAt(tagContent, pos)) {
      // No separator found after key - malformed
      return null
    }

    // Skip the whole separator, not just its first character
    pos += this.typeSeparator.length

    // Extract value - can be quoted or unquoted
    let value: string
    if (pos < tagContent.length && tagContent[pos] === '"') {
      const quotedValue = this.extractQuotedString(tagContent, pos)
      if (!quotedValue) {
        // Malformed quoted value - skip this tag
        return null
      }
      value = quotedValue.content
    } else {
      // Unquoted value - rest of the content
      value = tagContent.substring(pos)
    }

    return this.buildEntity(type, value, position, endPosition)
  }

  /**
   * Check whether the type separator occurs at the given position
   * @param text - The string to inspect
   * @param pos - The position at which the separator is expected
   * @returns true if the full, non-empty separator starts at pos
   */
  private hasSeparatorAt(text: string, pos: number): boolean {
    return (
      this.typeSeparator !== '' && text.startsWith(this.typeSeparator, pos)
    )
  }

  /**
   * Build an Entity from a raw type/value pair by parsing and formatting the value
   * @param type - The entity type (may be empty)
   * @param value - The raw string value
   * @param position - Start position of the entity in the original message
   * @param endPosition - Position just past the entity in the original message
   * @returns The fully populated Entity
   */
  private buildEntity(
    type: string,
    value: string,
    position: number,
    endPosition: number,
  ): Entity {
    const { parsedValue, inferredType } = this.parseValue(type, value)
    const formattedValue = this.applyFormatter(type, parsedValue)

    return {
      type,
      value,
      parsedValue,
      formattedValue,
      inferredType,
      position,
      endPosition,
    }
  }

  /**
   * Check whether the schema defines the given type as its own property
   *
   * An own-property check is used instead of the `in` operator so that names
   * inherited from Object.prototype (e.g. "constructor", "toString") are not
   * mistaken for schema entries.
   * @param type - The entity type to look up
   * @returns true if a schema exists and declares this type itself
   */
  private schemaHas(type: string): boolean {
    return (
      this.schema !== undefined &&
      Object.prototype.hasOwnProperty.call(this.schema, type)
    )
  }

  /**
   * Infer the primitive type from a raw string value
   * @param value - The raw string value
   * @returns 'number' for optionally negative integers/decimals, 'boolean'
   *   for true/false in any letter case, otherwise 'string'
   */
  private inferType(value: string): PrimitiveType {
    if (/^-?\d+(\.\d+)?$/.test(value)) {
      return 'number'
    }

    const lowerValue = value.toLowerCase()
    if (lowerValue === 'true' || lowerValue === 'false') {
      return 'boolean'
    }

    return 'string'
  }

  /**
   * Parse a value using schema (if available) or type inference
   * @param type - The entity type
   * @param rawValue - The raw string value
   * @returns Object with parsedValue and inferredType
   */
  private parseValue(
    type: string,
    rawValue: string,
  ): {
    parsedValue: string | number | boolean
    inferredType: PrimitiveType
  } {
    let targetType: PrimitiveType

    if (this.schema && this.schemaHas(type)) {
      const schemaEntry = this.schema[type]
      // Handle both shorthand (just type) and full definition
      targetType =
        typeof schemaEntry === 'string' ? schemaEntry : schemaEntry.type
    } else {
      // Use inference for types the schema does not declare
      targetType = this.inferType(rawValue)
    }

    let parsedValue: string | number | boolean

    switch (targetType) {
      case 'number':
        parsedValue = parseFloat(rawValue)
        break
      case 'boolean':
        // Anything other than "true" (in any letter case) parses as false
        parsedValue = rawValue.toLowerCase() === 'true'
        break
      case 'string':
      default:
        parsedValue = rawValue
        break
    }

    return {
      parsedValue,
      inferredType: targetType,
    }
  }

  /**
   * Apply formatter function to a parsed value
   * @param type - The entity type
   * @param parsedValue - The parsed value
   * @returns The schema formatter's output, or String(parsedValue) when the
   *   schema declares no formatter for this type
   */
  private applyFormatter(
    type: string,
    parsedValue: string | number | boolean,
  ): string {
    if (this.schema && this.schemaHas(type)) {
      const schemaEntry = this.schema[type]

      // Only full EntityDefinition can have a formatter
      if (typeof schemaEntry !== 'string' && schemaEntry.format) {
        return schemaEntry.format(parsedValue)
      }
    }

    // No formatter - convert to string
    return String(parsedValue)
  }

  /**
   * Extract a quoted string starting at the given position
   * Processes escape sequences: \" becomes " and \\ becomes \
   * A backslash before any other character (or at the end) is kept literally.
   * @param message - The string to extract from
   * @param startPos - The position of the opening quote
   * @returns Object with content and endPosition (just past the closing
   *   quote), or null if startPos is not a quote or the quote is unclosed
   */
  private extractQuotedString(
    message: string,
    startPos: number,
  ): { content: string; endPosition: number } | null {
    if (message[startPos] !== '"') {
      return null
    }

    let result = ''
    let pos = startPos + 1

    while (pos < message.length) {
      const char = message.charAt(pos)

      if (char === '"') {
        // Found closing quote
        return { content: result, endPosition: pos + 1 }
      }

      if (char === '\\' && pos + 1 < message.length) {
        const nextChar = message.charAt(pos + 1)
        if (nextChar === '"' || nextChar === '\\') {
          // Recognised escape sequence: emit the escaped character only
          result += nextChar
          pos += 2
          continue
        }
      }

      // Regular character, or a backslash that does not start an escape
      result += char
      pos += 1
    }

    // Reached end of string without finding closing quote
    return null
  }

  /**
   * Extract an unquoted token starting at the given position
   * Stops at whitespace or at the start of any of the specified stop strings
   * @param message - The string to extract from
   * @param startPos - The position to start extraction
   * @param stopChars - Strings that end the token when found at the current
   *   position; each may be one or more characters long, and empty strings
   *   are ignored
   * @returns Object with content and endPosition
   */
  private extractUnquotedToken(
    message: string,
    startPos: number,
    stopChars: string[],
  ): { content: string; endPosition: number } {
    // An empty stop string would match everywhere, so it is dropped up front
    const stops = stopChars.filter((stop) => stop !== '')
    let pos = startPos

    scan: while (pos < message.length) {
      if (TaggedStringParser.WHITESPACE.test(message.charAt(pos))) {
        break
      }

      for (const stop of stops) {
        if (message.startsWith(stop, pos)) {
          break scan
        }
      }

      pos += 1
    }

    return { content: message.substring(startPos, pos), endPosition: pos }
  }
}