// Package gott is a Go template renderer based on Perl's Template Toolkit.
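//
// A minimal usage sketch (Token and the TokenType constants are defined
// elsewhere in this package):
//
//	for _, tok := range NewLexer("Hello, [% name %]!").Tokens() {
//		fmt.Println(tok.Type, tok.Value)
//	}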
package gott

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

const eof = -1

// Lexer tokenizes a template string into a sequence of tokens.
type Lexer struct {
	input        string     // the string being scanned
	start        int        // start position of current token
	pos          int        // current position in input
	width        int        // width of last rune read
	line         int        // current line number (1-based)
	linePos      int        // offset of the start of the current line
	startLine    int        // line number at the start of the current token
	startLinePos int        // line start offset at the start of the current token
	tokens       chan Token // channel of scanned tokens
}

// stateFn represents a lexer state function. Each state consumes input,
// emits any tokens it completes, and returns the next state; a nil
// return tells run to stop.
type stateFn func(*Lexer) stateFn

// NewLexer creates a new lexer for the given input and starts scanning.
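// Scanning runs in its own goroutine and tokens are delivered through a
// buffered channel, so the lexer can run slightly ahead of its consumer
// and blocks once the two-token buffer is full.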
func NewLexer(input string) *Lexer {
	l := &Lexer{
		input:     input,
		line:      1,
		startLine: 1,
		tokens:    make(chan Token, 2),
	}
	go l.run()
	return l
}

// run executes the state machine until a state function returns nil.
func (l *Lexer) run() {
	for state := lexText; state != nil; {
		state = state(l)
	}
	close(l.tokens)
}

// NextToken returns the next token from the lexer, blocking until one
// is available.
func (l *Lexer) NextToken() Token {
	return <-l.tokens
}

// Tokens returns all tokens as a slice by draining the channel; it
// blocks until scanning completes.
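// For example, NewLexer("Hi [% 1 + 2 %]").Tokens() yields
//
//	TokenText("Hi "), TokenTagOpen, TokenNumber("1"), TokenPlus,
//	TokenNumber("2"), TokenTagClose, TokenEOF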
func (l *Lexer) Tokens() []Token {
	var tokens []Token
	for tok := range l.tokens {
		tokens = append(tokens, tok)
	}
	return tokens
}

// emit sends a token to the tokens channel.
func (l *Lexer) emit(t TokenType) {
	l.tokens <- Token{
		Type:  t,
		Value: l.input[l.start:l.pos],
		Pos:   l.currentPos(),
	}
	l.ignore()
}

// emitValue sends a token with a specific value.
func (l *Lexer) emitValue(t TokenType, value string) {
	l.tokens <- Token{
		Type:  t,
		Value: value,
		Pos:   l.currentPos(),
	}
	l.ignore()
}

// currentPos returns the position of the start of the current token for
// error reporting.
func (l *Lexer) currentPos() Position {
	return Position{
		Line:   l.startLine,
		Column: l.start - l.startLinePos + 1,
		Offset: l.start,
	}
}

// next returns the next rune in the input and advances the position.
func (l *Lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = w
	l.pos += l.width
	if r == '\n' {
		l.line++
		l.linePos = l.pos
	}
	return r
}

// backup steps back one rune (can only be called once per call to next).
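// Backing up after reading eof is a no-op, because next leaves width at 0.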
func (l *Lexer) backup() {
	l.pos -= l.width
	// If we backed up over a newline, restore the previous line count
	// and find the previous line start
	if l.pos < len(l.input) && l.input[l.pos] == '\n' {
		l.line--
		l.linePos = strings.LastIndex(l.input[:l.pos], "\n") + 1
	}
}

// peek returns the next rune without advancing.
func (l *Lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// ignore skips over the pending input, marking the current position as
// the start of the next token.
func (l *Lexer) ignore() {
	l.start = l.pos
	l.startLine = l.line
	l.startLinePos = l.linePos
}

// errorf emits an error token and terminates scanning.
func (l *Lexer) errorf(format string, args ...any) stateFn {
	l.tokens <- Token{
		Type:  TokenError,
		Value: fmt.Sprintf(format, args...),
		Pos:   l.currentPos(),
	}
	return nil
}

// skipWhitespace advances past any whitespace characters.
func (l *Lexer) skipWhitespace() {
	for {
		r := l.next()
		if r == eof || !unicode.IsSpace(r) {
			l.backup()
			break
		}
	}
	l.ignore()
}

// hasPrefix reports whether the remaining input starts with the given prefix.
func (l *Lexer) hasPrefix(prefix string) bool {
	return strings.HasPrefix(l.input[l.pos:], prefix)
}

// ---- State Functions ----
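
// The states form a small machine:
//
//	lexText -> lexTagOpen -> lexInsideTag -> lexText
//	lexInsideTag -> lexString | lexNumber | lexIdentifier -> lexInsideTag
//
// Any state may instead return nil (via errorf, or at EOF) to stop run.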

// lexText scans text outside of tags until "[%" or EOF.
func lexText(l *Lexer) stateFn {
	for {
		if l.hasPrefix("[%") {
			if l.pos > l.start {
				l.emit(TokenText)
			}
			return lexTagOpen
		}
		if l.next() == eof {
			break
		}
	}
	// Emit any remaining text
	if l.pos > l.start {
		l.emit(TokenText)
	}
	l.emit(TokenEOF)
	return nil
}

// lexTagOpen scans the [% opening delimiter.
func lexTagOpen(l *Lexer) stateFn {
	l.pos += 2 // skip the two-byte "[%"
	l.emit(TokenTagOpen)
	return lexInsideTag
}

// lexInsideTag scans inside a [% ... %] tag.
func lexInsideTag(l *Lexer) stateFn {
	l.skipWhitespace()

	// Check for the closing tag
	if l.hasPrefix("%]") {
		l.pos += 2
		l.emit(TokenTagClose)
		return lexText
	}

	// Check for two-character operators before single-character ones,
	// so that ">=" is not scanned as ">" followed by "=".
	twoCharOps := []struct {
		str string
		tok TokenType
	}{
		{"==", TokenEq},
		{"!=", TokenNe},
		{">=", TokenGe},
		{"<=", TokenLe},
		{"&&", TokenAnd},
		{"||", TokenOr},
	}
	for _, op := range twoCharOps {
		if l.hasPrefix(op.str) {
			l.pos += 2
			l.emit(op.tok)
			return lexInsideTag
		}
	}

	// Check for single-character operators/delimiters
	r := l.next()
	switch r {
	case eof:
		return l.errorf("unclosed tag")
	case '>':
		l.emit(TokenGt)
		return lexInsideTag
	case '<':
		l.emit(TokenLt)
		return lexInsideTag
	case '+':
		l.emit(TokenPlus)
		return lexInsideTag
	case '-':
		// A '-' directly followed by a digit begins a negative number
		if unicode.IsDigit(l.peek()) {
			l.pos-- // un-read the one-byte '-' so lexNumber sees the sign
			return lexNumber
		}
		l.emit(TokenMinus)
		return lexInsideTag
	case '*':
		l.emit(TokenMul)
		return lexInsideTag
	case '/':
		l.emit(TokenDiv)
		return lexInsideTag
	case '%':
		// "%]" closes the tag; a lone '%' is the modulo operator
		if l.peek() == ']' {
			l.next() // consume the ']'
			l.emit(TokenTagClose)
			return lexText
		}
		l.emit(TokenMod)
		return lexInsideTag
	case '.':
		l.emit(TokenDot)
		return lexInsideTag
	case '|':
		l.emit(TokenPipe)
		return lexInsideTag
	case '(':
		l.emit(TokenLParen)
		return lexInsideTag
	case ')':
		l.emit(TokenRParen)
		return lexInsideTag
	case ',':
		l.emit(TokenComma)
		return lexInsideTag
	case '=':
		l.emit(TokenAssign)
		return lexInsideTag
	case '$':
		l.emit(TokenDollar)
		return lexInsideTag
	case '"', '\'':
		l.backup()
		return lexString
	}

	// Check for a number
	if unicode.IsDigit(r) {
		l.backup()
		return lexNumber
	}

	// Must be an identifier or keyword
	if isAlpha(r) || r == '_' {
		l.backup()
		return lexIdentifier
	}

	return l.errorf("unexpected character: %q", string(r))
}

// lexString scans a quoted string literal (single or double quotes).
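// The surrounding quotes are not included in the token value; escape
// sequences are kept verbatim for the consumer to interpret.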
func lexString(l *Lexer) stateFn {
	quote := l.next() // consume the opening quote
	l.ignore()        // don't include the quote in the value

	for {
		r := l.next()
		if r == eof {
			return l.errorf("unterminated string")
		}
		if r == '\\' {
			// Keep the escaped character without interpreting it
			l.next()
			continue
		}
		if r == quote {
			// Don't include the closing quote in the value
			l.backup()
			l.emit(TokenString)
			l.next() // consume the closing quote
			l.ignore()
			return lexInsideTag
		}
	}
}

// lexNumber scans a number (integer or float, possibly negative).
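// Accepted forms include 7, -7, and 3.14.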
func lexNumber(l *Lexer) stateFn {
	// Optional leading minus
	if l.peek() == '-' {
		l.next()
	}

	// Integer part
	digits := false
	for unicode.IsDigit(l.peek()) {
		l.next()
		digits = true
	}

	if !digits {
		return l.errorf("expected digits in number")
	}

	// Optional fractional part: consume the '.' only when a digit
	// follows, so "1.size" lexes as a number, a dot, and an identifier
	// rather than as "1." and "size".
	if l.peek() == '.' {
		dot := l.pos
		l.next()
		if unicode.IsDigit(l.peek()) {
			for unicode.IsDigit(l.peek()) {
				l.next()
			}
		} else {
			l.pos = dot // put the '.' back for lexInsideTag to emit
		}
	}

	l.emit(TokenNumber)
	return lexInsideTag
}

// lexIdentifier scans an identifier or keyword.
func lexIdentifier(l *Lexer) stateFn {
	for {
		r := l.next()
		if !isAlphaNumeric(r) && r != '_' {
			l.backup()
			break
		}
	}

	word := l.input[l.start:l.pos]
	tokType := LookupKeyword(word)
	l.emit(tokType)
	return lexInsideTag
}

// isAlpha reports whether r is an alphabetic character.
func isAlpha(r rune) bool {
	return unicode.IsLetter(r)
}

// isAlphaNumeric reports whether r is a letter or a digit.
func isAlphaNumeric(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}