src/TaggedStringParser.ts at main · tbeseda.com/tagged-string

tbeseda.com / tagged-string
fork atom
this repo has no description
fork atom
tagged-string / src / TaggedStringParser.ts
at main 583 lines 17 kB view raw
wrap content
tbeseda.com isDelimiterFree: docs and steering 3mo ago
ea607bdf
  1import { ParseResult } from './ParseResult.ts'
  2import type {
  3  Entity,
  4  EntitySchema,
  5  ParserConfig,
  6  PrimitiveType,
  7} from './types.ts'
  8
  9/**
 10 * TaggedStringParser extracts tagged entities from strings
 11 *
 12 * Supports two parsing modes:
 13 * - Delimited mode: Extract entities surrounded by delimiters (e.g., [key:value])
 14 * - Delimiter-free mode: Extract key=value patterns from natural language strings
 15 *
 16 * Features:
 17 * - Configurable delimiters and type separators
 18 * - Schema-based type parsing with custom formatters
 19 * - Automatic type inference (string, number, boolean)
 20 * - Quoted strings with escape sequences (\", \\)
 21 * - Lenient error handling (skips malformed entities)
 22 */
 23export class TaggedStringParser {
 24  private readonly openDelimiter: string
 25  private readonly closeDelimiter: string
 26  private readonly typeSeparator: string
 27  private readonly schema?: EntitySchema
 28  private readonly isDelimiterFree: boolean
 29
 30  /**
 31   * Create a new TaggedStringParser with optional configuration
 32   * @param config - Parser configuration options
 33   * @param config.delimiters - Unified delimiter configuration:
 34   *   - `false` or `[]` enables delimiter-free mode (parse key=value patterns)
 35   *   - `[open, close]` uses specified delimiters (e.g., ['{{', '}}'])
 36   *   - If omitted, uses openDelimiter and closeDelimiter options
 37   * @param config.openDelimiter - Opening tag delimiter (default: '[', legacy option)
 38   * @param config.closeDelimiter - Closing tag delimiter (default: ']', legacy option)
 39   * @param config.typeSeparator - Separator between type and value (default: ':')
 40   * @param config.schema - Entity type definitions with optional formatters
 41   * @throws Error if configuration is invalid
 42   */
 43  constructor(config?: ParserConfig) {
 44    // Resolve delimiter configuration
 45    if (config?.delimiters !== undefined) {
 46      // delimiters option takes precedence
 47      if (
 48        config.delimiters === false ||
 49        (Array.isArray(config.delimiters) && config.delimiters.length === 0)
 50      ) {
 51        // Delimiter-free mode
 52        this.isDelimiterFree = true
 53        this.openDelimiter = ''
 54        this.closeDelimiter = ''
 55      } else if (
 56        Array.isArray(config.delimiters) &&
 57        config.delimiters.length === 2
 58      ) {
 59        // Delimited mode with specified delimiters
 60        this.isDelimiterFree = false
 61        this.openDelimiter = config.delimiters[0]
 62        this.closeDelimiter = config.delimiters[1]
 63      } else {
 64        // Invalid configuration
 65        throw new Error('Invalid delimiters configuration')
 66      }
 67    } else {
 68      // Backward compatible: use individual delimiter options
 69      this.isDelimiterFree = false
 70      this.openDelimiter = config?.openDelimiter ?? '['
 71      this.closeDelimiter = config?.closeDelimiter ?? ']'
 72    }
 73
 74    this.typeSeparator = config?.typeSeparator ?? ':'
 75    this.schema = config?.schema
 76
 77    // Validate configuration
 78    this.validateConfig()
 79  }
 80
 81  /**
 82   * Validate parser configuration
 83   * @throws Error if configuration is invalid
 84   */
 85  private validateConfig(): void {
 86    // Skip delimiter validation in delimiter-free mode
 87    if (this.isDelimiterFree) {
 88      return
 89    }
 90
 91    if (this.openDelimiter === '') {
 92      throw new Error('Open delimiter cannot be empty')
 93    }
 94    if (this.closeDelimiter === '') {
 95      throw new Error('Close delimiter cannot be empty')
 96    }
 97    if (this.openDelimiter === this.closeDelimiter) {
 98      throw new Error('Open and close delimiters cannot be the same')
 99    }
100  }
101
102  /**
103   * Parse a string and extract all tagged entities
104   * @param message - The string to parse
105   * @returns ParseResult containing original message and extracted entities
106   */
107  parse(message: string): ParseResult {
108    if (message === '') {
109      return new ParseResult(message, [])
110    }
111
112    // Route to appropriate parsing method based on mode
113    if (this.isDelimiterFree) {
114      return this.parseDelimiterFree(message)
115    }
116
117    return this.parseDelimited(message)
118  }
119
120  /**
121   * Parse a string in delimited mode, extracting tags with proper quoted string handling
122   * @param message - The string to parse
123   * @returns ParseResult containing original message and extracted entities
124   */
125  private parseDelimited(message: string): ParseResult {
126    const entities: Entity[] = []
127    let pos = 0
128
129    while (pos < message.length) {
130      // Find the opening delimiter
131      const openIndex = message.indexOf(this.openDelimiter, pos)
132      if (openIndex === -1) {
133        // No more tags
134        break
135      }
136
137      // Start scanning for tag content after the opening delimiter
138      const contentStart = openIndex + this.openDelimiter.length
139      let contentEnd = contentStart
140      let inQuote = false
141
142      // Scan character by character to find the closing delimiter
143      // while respecting quoted strings
144      while (contentEnd < message.length) {
145        const char = message[contentEnd]
146
147        if (char === '"') {
148          // Toggle quote state (simplified - doesn't handle escapes during scan)
149          // We'll handle escapes properly in processTag
150          if (contentEnd > contentStart && message[contentEnd - 1] === '\\') {
151            // This is an escaped quote, don't toggle
152            // But we need to check if the backslash itself is escaped
153            let backslashCount = 0
154            let checkPos = contentEnd - 1
155            while (checkPos >= contentStart && message[checkPos] === '\\') {
156              backslashCount++
157              checkPos--
158            }
159            // If odd number of backslashes, the quote is escaped
160            if (backslashCount % 2 === 1) {
161              contentEnd++
162              continue
163            }
164          }
165          inQuote = !inQuote
166          contentEnd++
167        } else if (
168          !inQuote &&
169          message.substring(
170            contentEnd,
171            contentEnd + this.closeDelimiter.length,
172          ) === this.closeDelimiter
173        ) {
174          // Found closing delimiter outside of quotes
175          const tagContent = message.substring(contentStart, contentEnd).trim()
176
177          if (tagContent !== '') {
178            const entity = this.processTag(
179              tagContent,
180              openIndex,
181              contentEnd + this.closeDelimiter.length,
182            )
183            if (entity) {
184              entities.push(entity)
185            }
186          }
187
188          // Move past this tag
189          pos = contentEnd + this.closeDelimiter.length
190          break
191        } else {
192          contentEnd++
193        }
194      }
195
196      // If we reached the end without finding a closing delimiter, skip this opening
197      if (contentEnd >= message.length) {
198        pos = openIndex + this.openDelimiter.length
199      }
200    }
201
202    return new ParseResult(message, entities, this.closeDelimiter)
203  }
204
205  /**
206   * Parse a string in delimiter-free mode, extracting key-value patterns
207   * @param message - The string to parse
208   * @returns ParseResult containing original message and extracted entities
209   */
210  private parseDelimiterFree(message: string): ParseResult {
211    const entities: Entity[] = []
212    let pos = 0
213
214    while (pos < message.length) {
215      // Skip whitespace
216      while (pos < message.length && /\s/.test(message[pos])) {
217        pos++
218      }
219
220      if (pos >= message.length) {
221        break
222      }
223
224      // Try to extract key (quoted or unquoted)
225      const keyStart = pos
226      let key: string
227      let keyEnd: number
228
229      if (message[pos] === '"') {
230        // Quoted key
231        const quotedKey = this.extractQuotedString(message, pos)
232        if (!quotedKey) {
233          // Malformed quoted key - skip this character and continue
234          pos++
235          continue
236        }
237        key = quotedKey.content
238        keyEnd = quotedKey.endPosition
239      } else {
240        // Unquoted key - extract until separator or whitespace
241        const unquotedKey = this.extractUnquotedToken(message, pos, [
242          this.typeSeparator,
243        ])
244        if (unquotedKey.content === '') {
245          // No key found - advance and continue
246          pos++
247          continue
248        }
249        key = unquotedKey.content
250        keyEnd = unquotedKey.endPosition
251      }
252
253      // Check for type separator
254      if (keyEnd >= message.length || message[keyEnd] !== this.typeSeparator) {
255        // No separator - this isn't an entity, continue from after the key
256        pos = keyEnd + 1
257        continue
258      }
259
260      // Advance past separator
261      pos = keyEnd + 1
262
263      // Try to extract value (quoted or unquoted)
264      let value: string
265      let valueEnd: number
266
267      if (pos < message.length && message[pos] === '"') {
268        // Quoted value
269        const quotedValue = this.extractQuotedString(message, pos)
270        if (!quotedValue) {
271          // Malformed quoted value - skip this entity
272          continue
273        }
274        value = quotedValue.content
275        valueEnd = quotedValue.endPosition
276      } else {
277        // Unquoted value - extract until whitespace
278        const unquotedValue = this.extractUnquotedToken(message, pos, [])
279        if (unquotedValue.content === '') {
280          // No value found - skip this entity
281          continue
282        }
283        value = unquotedValue.content
284        valueEnd = unquotedValue.endPosition
285      }
286
287      // Create entity
288      const { parsedValue, inferredType } = this.parseValue(key, value)
289      const formattedValue = this.applyFormatter(key, parsedValue)
290
291      entities.push({
292        type: key,
293        value,
294        parsedValue,
295        formattedValue,
296        inferredType,
297        position: keyStart,
298        endPosition: valueEnd,
299      })
300
301      // Update position to continue scanning
302      pos = valueEnd
303    }
304
305    return new ParseResult(message, entities, this.closeDelimiter)
306  }
307
308  /**
309   * Process a tag's content and create an Entity
310   * @param tagContent - The content between delimiters
311   * @param position - The position of the tag in the original message
312   * @param endPosition - The position after the closing delimiter
313   * @returns Entity or null if tag is malformed
314   */
315  private processTag(
316    tagContent: string,
317    position: number,
318    endPosition: number,
319  ): Entity | null {
320    let type: string
321    let value: string
322    let pos = 0
323
324    // Extract type (key) - can be quoted or unquoted
325    if (tagContent[pos] === '"') {
326      // Quoted key
327      const quotedKey = this.extractQuotedString(tagContent, pos)
328      if (!quotedKey) {
329        // Malformed quoted key - skip this tag
330        return null
331      }
332      type = quotedKey.content
333      pos = quotedKey.endPosition
334    } else {
335      // Unquoted key - find separator
336      const separatorIndex = tagContent.indexOf(this.typeSeparator)
337      if (separatorIndex === -1) {
338        // No separator - treat entire content as value with empty type
339        type = ''
340        value = tagContent
341
342        // Parse the value and get typed result
343        const { parsedValue, inferredType } = this.parseValue(type, value)
344
345        // Apply formatter to get formatted value
346        const formattedValue = this.applyFormatter(type, parsedValue)
347
348        return {
349          type,
350          value,
351          parsedValue,
352          formattedValue,
353          inferredType,
354          position,
355          endPosition,
356        }
357      }
358      type = tagContent.substring(0, separatorIndex)
359      pos = separatorIndex
360    }
361
362    // Check for separator
363    if (pos >= tagContent.length || tagContent[pos] !== this.typeSeparator) {
364      // No separator found after key - malformed
365      return null
366    }
367
368    // Skip separator
369    pos++
370
371    // Extract value - can be quoted or unquoted
372    if (pos < tagContent.length && tagContent[pos] === '"') {
373      // Quoted value
374      const quotedValue = this.extractQuotedString(tagContent, pos)
375      if (!quotedValue) {
376        // Malformed quoted value - skip this tag
377        return null
378      }
379      value = quotedValue.content
380    } else {
381      // Unquoted value - rest of the content
382      value = tagContent.substring(pos)
383    }
384
385    // Parse the value and get typed result
386    const { parsedValue, inferredType } = this.parseValue(type, value)
387
388    // Apply formatter to get formatted value
389    const formattedValue = this.applyFormatter(type, parsedValue)
390
391    return {
392      type,
393      value,
394      parsedValue,
395      formattedValue,
396      inferredType,
397      position,
398      endPosition,
399    }
400  }
401
402  /**
403   * Infer the primitive type from a raw string value
404   * @param value - The raw string value
405   * @returns The inferred primitive type
406   */
407  private inferType(value: string): PrimitiveType {
408    // Check for number (including decimals and negatives)
409    if (/^-?\d+(\.\d+)?$/.test(value)) {
410      return 'number'
411    }
412
413    // Check for boolean (case-insensitive)
414    const lowerValue = value.toLowerCase()
415    if (lowerValue === 'true' || lowerValue === 'false') {
416      return 'boolean'
417    }
418
419    // Default to string
420    return 'string'
421  }
422
423  /**
424   * Parse a value using schema (if available) or type inference
425   * @param type - The entity type
426   * @param rawValue - The raw string value
427   * @returns Object with parsedValue and inferredType
428   */
429  private parseValue(
430    type: string,
431    rawValue: string,
432  ): {
433    parsedValue: string | number | boolean
434    inferredType: PrimitiveType
435  } {
436    let targetType: PrimitiveType
437
438    // Check if type is in schema
439    if (this.schema && type in this.schema) {
440      const schemaEntry = this.schema[type]
441      // Handle both shorthand (just type) and full definition
442      targetType =
443        typeof schemaEntry === 'string' ? schemaEntry : schemaEntry.type
444    } else {
445      // Use inference for unknown types
446      targetType = this.inferType(rawValue)
447    }
448
449    // Parse based on target type
450    let parsedValue: string | number | boolean
451
452    switch (targetType) {
453      case 'number':
454        parsedValue = parseFloat(rawValue)
455        break
456      case 'boolean':
457        parsedValue = rawValue.toLowerCase() === 'true'
458        break
459      case 'string':
460        parsedValue = rawValue
461        break
462      default:
463        parsedValue = rawValue
464        break
465    }
466
467    return {
468      parsedValue,
469      inferredType: targetType,
470    }
471  }
472
473  /**
474   * Apply formatter function to a parsed value
475   * @param type - The entity type
476   * @param parsedValue - The parsed value
477   * @returns Formatted string
478   */
479  private applyFormatter(
480    type: string,
481    parsedValue: string | number | boolean,
482  ): string {
483    // Check if schema has a formatter for this type
484    if (this.schema && type in this.schema) {
485      const schemaEntry = this.schema[type]
486
487      // Only full EntityDefinition can have a formatter
488      if (typeof schemaEntry !== 'string' && schemaEntry.format) {
489        return schemaEntry.format(parsedValue)
490      }
491    }
492
493    // No formatter - convert to string
494    return String(parsedValue)
495  }
496
497  /**
498   * Extract a quoted string starting at the given position
499   * Processes escape sequences: \" becomes " and \\ becomes \
500   * @param message - The string to extract from
501   * @param startPos - The position of the opening quote
502   * @returns Object with content and endPosition, or null if unclosed
503   */
504  private extractQuotedString(
505    message: string,
506    startPos: number,
507  ): { content: string; endPosition: number } | null {
508    // Verify we're starting at a quote
509    if (message[startPos] !== '"') {
510      return null
511    }
512
513    let result = ''
514    let pos = startPos + 1
515
516    while (pos < message.length) {
517      const char = message[pos]
518
519      if (char === '\\') {
520        // Check if there's a next character
521        if (pos + 1 < message.length) {
522          const nextChar = message[pos + 1]
523          // Process escape sequences for quote and backslash
524          if (nextChar === '"' || nextChar === '\\') {
525            result += nextChar
526            pos += 2
527            continue
528          }
529        }
530        // Backslash at end or before non-escapable char - treat as literal
531        result += char
532        pos += 1
533      } else if (char === '"') {
534        // Found closing quote
535        return { content: result, endPosition: pos + 1 }
536      } else {
537        // Regular character
538        result += char
539        pos += 1
540      }
541    }
542
543    // Reached end of string without finding closing quote
544    return null
545  }
546
547  /**
548   * Extract an unquoted token starting at the given position
549   * Stops at whitespace or any of the specified stop characters
550   * @param message - The string to extract from
551   * @param startPos - The position to start extraction
552   * @param stopChars - Array of characters that should stop extraction
553   * @returns Object with content and endPosition
554   */
555  private extractUnquotedToken(
556    message: string,
557    startPos: number,
558    stopChars: string[],
559  ): { content: string; endPosition: number } {
560    let result = ''
561    let pos = startPos
562
563    while (pos < message.length) {
564      const char = message[pos]
565
566      // Check if we hit whitespace
567      if (/\s/.test(char)) {
568        break
569      }
570
571      // Check if we hit a stop character
572      if (stopChars.includes(char)) {
573        break
574      }
575
576      // Add character to result
577      result += char
578      pos += 1
579    }
580
581    return { content: result, endPosition: pos }
582  }
583}