//// An Erlang lexer and syntax highlighter in Gleam.
1import gleam/int
2import gleam/list
3import gleam/option.{type Option, None, Some}
4import gleam/string
5import gleam_community/ansi
6import houdini
7import splitter.{type Splitter}
8
/// The lexer state: the remaining source text plus configuration and the
/// errors accumulated so far. Construct one with `new`, configure it with
/// `ignore_comments`/`ignore_whitespace`, then run it with `tokenise`.
pub opaque type Lexer {
  Lexer(
    // The source text that has not yet been consumed.
    source: String,
    // When True, comment tokens are dropped from the output.
    ignore_comments: Bool,
    // When True, whitespace tokens are dropped from the output.
    ignore_whitespace: Bool,
    // Errors collected so far, most recent first (reversed on output).
    errors: List(Error),
    // Pre-built splitters, shared by all the lexing functions.
    splitters: Splitters,
  )
}
18
/// Pre-constructed `Splitter` values, built once in `make_splitters` and
/// reused for every token, so they are not rebuilt in the lexing hot path.
type Splitters {
  Splitters(
    // Splits at the next line ending (comments).
    until_end_of_line: Splitter,
    // Splits at a closing `"` or a `\` escape (string literals).
    string: Splitter,
    // Splits at a closing `'` or a `\` escape (quoted atoms).
    quoted_atom: Splitter,
    // Splits at `}` or a line ending (`\x{...}` escapes).
    brace_escape_sequence: Splitter,
    // Splits at any possible sigil closing delimiter or `\` escape.
    sigil: Splitter,
    // As `sigil`, but without `\` — verbatim sigils have no escapes.
    sigil_verbatim: Splitter,
    // Splits at line endings or the closing `"""`.
    triple_quoted_string: Splitter,
  )
}
30
/// The lexing errors that can be recorded while tokenising. Lexing always
/// continues after an error; errors are returned alongside the tokens.
pub type Error {
  /// A character that begins no known token.
  UnknownCharacter(character: String)
  UnterminatedStringLiteral
  UnterminatedQuotedAtom
  /// The `Radix#` prefix of a number was not an integer in the range 2–36.
  InvalidRadix(radix: String)
  /// A `_` separator appeared somewhere other than directly after a digit.
  NumericSeparatorNotAllowed
  /// `e`/`E` in a float was not followed by exponent digits.
  ExpectedExponent
  NumberCannotEndAfterRadix
  /// A `$` character literal at the end of input.
  UnterminatedCharacter
  UnterminatedEscapeSequence
  /// `~` was not followed by a valid sigil delimiter.
  ExpectedSigilDelimiter
  ExpectedWhitespaceAfterTripleQuote
  /// A line of a triple-quoted string did not start with the indentation
  /// of the closing `"""`.
  InvalidTripleQuotedStringIndentation(
    expected_indentation: String,
    line: String,
  )
}
48
/// Render a lexing `Error` as a human-readable English message.
pub fn stringify_error(error: Error) -> String {
  // Arms follow the declaration order of the `Error` type.
  case error {
    UnknownCharacter(character:) ->
      "Unexpected character: `" <> character <> "`"
    UnterminatedStringLiteral -> "Unterminated string literal"
    UnterminatedQuotedAtom -> "Unterminated quoted atom"
    InvalidRadix(radix:) -> "Invalid numeric radix: " <> radix
    NumericSeparatorNotAllowed -> "Numeric separator is not allowed here"
    ExpectedExponent -> "Expected an exponent"
    NumberCannotEndAfterRadix ->
      "Number cannot end directly after radix specification"
    UnterminatedCharacter -> "Unterminated character literal"
    UnterminatedEscapeSequence -> "Unterminated escape sequence"
    ExpectedSigilDelimiter -> "Expected a valid sigil delimiter after `~`"
    ExpectedWhitespaceAfterTripleQuote ->
      "Expected whitespace after a triple quote"
    InvalidTripleQuotedStringIndentation(expected_indentation:, line:) ->
      "Invalid triple-quoted string: Expected the indentation `"
      <> expected_indentation
      <> "` preceding the line `"
      <> line
      <> "`"
  }
}
73
/// An Erlang token. `token_to_source` converts any token back to the exact
/// source text it was lexed from, so lexing is lossless.
pub type Token {
  // Whitespace and comments
  Whitespace(String)
  /// A `%` comment; the payload excludes the leading `%`.
  Comment(String)
  /// A `%%` comment; the payload excludes the leading `%%`.
  DocComment(String)
  /// A `%%%` comment; the payload excludes the leading `%%%`.
  ModuleComment(String)
  EndOfFile

  /// A `$c` character literal; the payload excludes the leading `$`.
  Character(String)
  Integer(String)
  Float(String)
  /// An atom; `quoted` records whether it was written as `'...'`.
  Atom(name: String, quoted: Bool)
  /// A `"..."` string; the payload excludes the surrounding quotes.
  String(String)
  /// A `"""` string. Fields hold everything needed to reconstruct the
  /// source exactly: the optional sigil letter, how many quotes opened it
  /// (3 or more), the whitespace after the opening quotes, the lines with
  /// their shared indentation stripped, and that shared indentation.
  TripleQuotedString(
    sigil: option.Option(String),
    number_of_quotes: Int,
    beginning_whitespace: String,
    lines: List(String),
    end_indentation: String,
  )
  /// A `~` sigil literal such as `~b"..."`, excluding its delimiters.
  Sigil(sigil: String, delimiter: SigilDelimiter, contents: String)
  Variable(String)

  // Keywords
  After
  Begin
  Case
  Catch
  Cond
  Else
  End
  Fun
  If
  Let
  Maybe
  Of
  Receive
  Try
  When

  // Grouping
  LeftParen
  RightParen
  LeftBrace
  RightBrace
  LeftSquare
  RightSquare

  // Punctuation
  Comma
  Semicolon
  Colon
  Dot
  MinusGreater
  DoubleLess
  DoubleGreater
  Hash
  DoubleColon
  DoubleDot
  TripleDot
  DoublePipe
  EqualGreater
  ColonEqual
  LessMinus
  LessEqual

  // Operators
  Pipe
  DoubleEqual
  SlashEqual
  EqualLess
  Less
  GreaterEqual
  Greater
  EqualColonEqual
  EqualSlashEqual
  Plus
  Minus
  Star
  Slash
  Bnot
  Div
  Rem
  Band
  Bor
  Bxor
  Bsl
  Bsr
  Not
  And
  Or
  Xor
  Andalso
  Orelse
  DoublePlus
  DoubleMinus
  QuestionEqual
  Question
  Bang
  Equal

  // Invalid tokens. These keep whatever text was consumed so that
  // `token_to_source` stays lossless even for malformed input.
  Unknown(String)
  UnterminatedString(String)
  UnterminatedSigil(sigil: String, delimiter: SigilDelimiter, contents: String)
  UnterminatedAtom(String)
  InvalidTripleQuotedString(contents: String)
}
182
/// Convert a token back to its source code representation.
///
/// This is the inverse of lexing: concatenating the results for every token
/// (see `to_source`) reproduces the original input, including delimiters and
/// prefixes that the token payloads themselves exclude.
pub fn token_to_source(token: Token) -> String {
  case token {
    // Whitespace and comments
    Whitespace(space) -> space
    Comment(contents) -> "%" <> contents
    DocComment(contents) -> "%%" <> contents
    ModuleComment(contents) -> "%%%" <> contents
    EndOfFile -> ""

    Character(char) -> "$" <> char
    Integer(int) -> int
    Float(float) -> float
    Atom(name:, quoted: True) -> "'" <> name <> "'"
    Atom(name:, quoted: False) -> name
    String(contents) -> "\"" <> contents <> "\""
    // Rebuild a triple-quoted string: optional sigil, opening quotes,
    // the whitespace after them, then each line re-prefixed with the
    // shared indentation, and finally the indented closing quotes.
    TripleQuotedString(
      sigil:,
      number_of_quotes:,
      beginning_whitespace:,
      lines:,
      end_indentation:,
    ) ->
      case sigil {
        option.None -> ""
        option.Some(sigil) -> "~" <> sigil
      }
      <> string.repeat("\"", number_of_quotes)
      <> beginning_whitespace
      <> string.join(
        list.map(lines, fn(line) { end_indentation <> line }),
        "\n",
      )
      <> "\n"
      <> end_indentation
      <> string.repeat("\"", number_of_quotes)
    Sigil(sigil:, delimiter:, contents:) -> {
      let #(opening, closing) = sigil_delimiters(delimiter)
      "~" <> sigil <> opening <> contents <> closing
    }
    Variable(name) -> name

    // Keywords
    After -> "after"
    Begin -> "begin"
    Case -> "case"
    Catch -> "catch"
    Cond -> "cond"
    Else -> "else"
    End -> "end"
    Fun -> "fun"
    If -> "if"
    Let -> "let"
    Maybe -> "maybe"
    Of -> "of"
    Receive -> "receive"
    Try -> "try"
    When -> "when"

    // Grouping
    LeftParen -> "("
    RightParen -> ")"
    LeftBrace -> "{"
    RightBrace -> "}"
    LeftSquare -> "["
    RightSquare -> "]"

    // Punctuation
    Comma -> ","
    Semicolon -> ";"
    Colon -> ":"
    Dot -> "."
    MinusGreater -> "->"
    DoubleLess -> "<<"
    DoubleGreater -> ">>"
    Hash -> "#"
    DoubleColon -> "::"
    DoubleDot -> ".."
    TripleDot -> "..."
    DoublePipe -> "||"
    EqualGreater -> "=>"
    ColonEqual -> ":="
    LessMinus -> "<-"
    LessEqual -> "<="

    // Operators
    Pipe -> "|"
    DoubleEqual -> "=="
    SlashEqual -> "/="
    EqualLess -> "=<"
    Less -> "<"
    GreaterEqual -> ">="
    Greater -> ">"
    EqualColonEqual -> "=:="
    EqualSlashEqual -> "=/="
    Plus -> "+"
    Minus -> "-"
    Star -> "*"
    Slash -> "/"
    Bnot -> "bnot"
    Div -> "div"
    Rem -> "rem"
    Band -> "band"
    Bor -> "bor"
    Bxor -> "bxor"
    Bsl -> "bsl"
    Bsr -> "bsr"
    Not -> "not"
    And -> "and"
    Or -> "or"
    Xor -> "xor"
    Andalso -> "andalso"
    Orelse -> "orelse"
    DoublePlus -> "++"
    DoubleMinus -> "--"
    QuestionEqual -> "?="
    Question -> "?"
    Bang -> "!"
    Equal -> "="

    // Invalid tokens: emit whatever was consumed, without the missing
    // closing delimiter.
    Unknown(char) -> char
    UnterminatedString(contents) -> "\"" <> contents
    UnterminatedSigil(sigil:, contents:, delimiter:) -> {
      let #(opening, _closing) = sigil_delimiters(delimiter)
      "~" <> sigil <> opening <> contents
    }
    UnterminatedAtom(contents) -> "'" <> contents
    InvalidTripleQuotedString(contents) -> "\"\"\"" <> contents <> "\"\"\""
  }
}
314
/// Convert a list of tokens back to their original source code by
/// concatenating each token's source representation in order.
pub fn to_source(tokens: List(Token)) -> String {
  tokens
  |> list.map(token_to_source)
  |> string.concat
}
319
/// The delimiter style used by a `~` sigil literal. `SigilNone` marks an
/// invalid sigil whose delimiter was missing; the bracket-like variants
/// have distinct opening/closing characters, the rest use the same
/// character on both ends (see `sigil_delimiters`).
pub type SigilDelimiter {
  SigilNone
  SigilParen
  SigilSquare
  SigilBrace
  SigilAngle
  SigilSlash
  SigilPipe
  SigilSingleQuote
  SigilDoubleQuote
  SigilBacktick
  SigilHash
}
333
/// Get the beginning and ending characters for a sigil delimiter as an
/// `#(opening, closing)` pair. `SigilNone` has no delimiters at all.
pub fn sigil_delimiters(delimiter: SigilDelimiter) -> #(String, String) {
  // Arms follow the declaration order of `SigilDelimiter`: first the
  // paired brackets, then the symmetric delimiters.
  case delimiter {
    SigilNone -> #("", "")
    SigilParen -> #("(", ")")
    SigilSquare -> #("[", "]")
    SigilBrace -> #("{", "}")
    SigilAngle -> #("<", ">")
    SigilSlash -> #("/", "/")
    SigilPipe -> #("|", "|")
    SigilSingleQuote -> #("'", "'")
    SigilDoubleQuote -> #("\"", "\"")
    SigilBacktick -> #("`", "`")
    SigilHash -> #("#", "#")
  }
}
350
/// Create a lexer for the given source with default settings: whitespace
/// and comment tokens are kept, no errors recorded, splitters pre-built.
pub fn new(source: String) -> Lexer {
  Lexer(
    source:,
    ignore_comments: False,
    ignore_whitespace: False,
    errors: [],
    splitters: make_splitters(),
  )
}
360
/// Build the splitters used throughout lexing. They are constructed once
/// per lexer (in `new`) and reused, rather than being rebuilt per token.
fn make_splitters() -> Splitters {
  Splitters(
    until_end_of_line: splitter.new(["\n", "\r\n"]),
    string: splitter.new(["\"", "\\"]),
    quoted_atom: splitter.new(["'", "\\"]),
    brace_escape_sequence: splitter.new(["}", "\n", "\r\n"]),
    // All possible sigil closers plus `\` so escapes can be handled.
    sigil: splitter.new([
      ")", "]", "}", ">", "/", "|", "'", "\"", "`", "#", "\\",
    ]),
    // Verbatim sigils have no escapes, so `\` is not split on.
    sigil_verbatim: splitter.new([
      ")", "]", "}", ">", "/", "|", "'", "\"", "`", "#",
    ]),
    triple_quoted_string: splitter.new(["\n", "\r\n", "\"\"\""]),
  )
}
376
/// Configure the lexer to drop comment tokens from its output.
pub fn ignore_comments(lexer: Lexer) -> Lexer {
  Lexer(..lexer, ignore_comments: True)
}
380
/// Configure the lexer to drop whitespace tokens from its output.
pub fn ignore_whitespace(lexer: Lexer) -> Lexer {
  Lexer(..lexer, ignore_whitespace: True)
}
384
/// Tokenise the whole source, returning the tokens (always terminated by
/// `EndOfFile`) and any errors encountered, both in source order.
pub fn tokenise(lexer: Lexer) -> #(List(Token), List(Error)) {
  do_tokenise(lexer, [])
}
388
/// Tail-recursive tokenising loop: accumulate tokens in reverse until
/// `next` yields `EndOfFile`, then reverse both tokens and errors.
fn do_tokenise(lexer: Lexer, tokens: List(Token)) -> #(List(Token), List(Error)) {
  case next(lexer) {
    #(lexer, EndOfFile) -> #(
      list.reverse([EndOfFile, ..tokens]),
      list.reverse(lexer.errors),
    )
    #(lexer, token) -> do_tokenise(lexer, [token, ..tokens])
  }
}
398
/// Consume and return the next token from the remaining source.
///
/// The patterns are order-sensitive: longer prefixes are listed before
/// their shorter prefixes (`%%%` before `%%` before `%`, `...` before
/// `..`, `<<` before `<-`/`<=` before `<`, `"""` before `"`), so do not
/// reorder arms without checking prefix overlap.
fn next(lexer: Lexer) -> #(Lexer, Token) {
  case lexer.source {
    "" -> #(lexer, EndOfFile)

    // A run of whitespace is collected into one token by lex_whitespace.
    " " as space <> source
    | "\n" as space <> source
    | "\r" as space <> source
    | "\t" as space <> source
    | "\f" as space <> source -> lex_whitespace(advance(lexer, source), space)

    // Comments: `%%%` module, `%%` doc, `%` ordinary. Tokens may be
    // suppressed by the ignore_comments setting via maybe_token.
    "%%%" <> source -> {
      let #(lexer, contents) = lex_until_end_of_line(advance(lexer, source))
      maybe_token(lexer, ModuleComment(contents), !lexer.ignore_comments)
    }
    "%%" <> source -> {
      let #(lexer, contents) = lex_until_end_of_line(advance(lexer, source))
      maybe_token(lexer, DocComment(contents), !lexer.ignore_comments)
    }
    "%" <> source -> {
      let #(lexer, contents) = lex_until_end_of_line(advance(lexer, source))
      maybe_token(lexer, Comment(contents), !lexer.ignore_comments)
    }

    "::" <> source -> #(advance(lexer, source), DoubleColon)
    ":=" <> source -> #(advance(lexer, source), ColonEqual)
    ":" <> source -> #(advance(lexer, source), Colon)
    "..." <> source -> #(advance(lexer, source), TripleDot)
    ".." <> source -> #(advance(lexer, source), DoubleDot)

    "(" <> source -> #(advance(lexer, source), LeftParen)
    ")" <> source -> #(advance(lexer, source), RightParen)
    "{" <> source -> #(advance(lexer, source), LeftBrace)
    "}" <> source -> #(advance(lexer, source), RightBrace)
    "[" <> source -> #(advance(lexer, source), LeftSquare)
    "]" <> source -> #(advance(lexer, source), RightSquare)

    "," <> source -> #(advance(lexer, source), Comma)
    ";" <> source -> #(advance(lexer, source), Semicolon)
    "." <> source -> #(advance(lexer, source), Dot)
    "->" <> source -> #(advance(lexer, source), MinusGreater)
    "<<" <> source -> #(advance(lexer, source), DoubleLess)
    ">>" <> source -> #(advance(lexer, source), DoubleGreater)
    "#" <> source -> #(advance(lexer, source), Hash)
    "||" <> source -> #(advance(lexer, source), DoublePipe)
    "=>" <> source -> #(advance(lexer, source), EqualGreater)
    "<-" <> source -> #(advance(lexer, source), LessMinus)
    "<=" <> source -> #(advance(lexer, source), LessEqual)
    "|" <> source -> #(advance(lexer, source), Pipe)

    "++" <> source -> #(advance(lexer, source), DoublePlus)
    "--" <> source -> #(advance(lexer, source), DoubleMinus)
    "==" <> source -> #(advance(lexer, source), DoubleEqual)
    "/=" <> source -> #(advance(lexer, source), SlashEqual)
    "=<" <> source -> #(advance(lexer, source), EqualLess)
    "<" <> source -> #(advance(lexer, source), Less)
    ">=" <> source -> #(advance(lexer, source), GreaterEqual)
    ">" <> source -> #(advance(lexer, source), Greater)
    "=:=" <> source -> #(advance(lexer, source), EqualColonEqual)
    "=/=" <> source -> #(advance(lexer, source), EqualSlashEqual)
    "+" <> source -> #(advance(lexer, source), Plus)
    "-" <> source -> #(advance(lexer, source), Minus)
    "*" <> source -> #(advance(lexer, source), Star)
    "/" <> source -> #(advance(lexer, source), Slash)
    "?=" <> source -> #(advance(lexer, source), QuestionEqual)
    "?" <> source -> #(advance(lexer, source), Question)
    "!" <> source -> #(advance(lexer, source), Bang)
    "=" <> source -> #(advance(lexer, source), Equal)

    // A lowercase ASCII letter starts an atom (or keyword, decided in
    // lex_atom).
    "a" as char <> source
    | "b" as char <> source
    | "c" as char <> source
    | "d" as char <> source
    | "e" as char <> source
    | "f" as char <> source
    | "g" as char <> source
    | "h" as char <> source
    | "i" as char <> source
    | "j" as char <> source
    | "k" as char <> source
    | "l" as char <> source
    | "m" as char <> source
    | "n" as char <> source
    | "o" as char <> source
    | "p" as char <> source
    | "q" as char <> source
    | "r" as char <> source
    | "s" as char <> source
    | "t" as char <> source
    | "u" as char <> source
    | "v" as char <> source
    | "w" as char <> source
    | "x" as char <> source
    | "y" as char <> source
    | "z" as char <> source -> lex_atom(advance(lexer, source), char)

    // An uppercase ASCII letter or underscore starts a variable.
    "A" as char <> source
    | "B" as char <> source
    | "C" as char <> source
    | "D" as char <> source
    | "E" as char <> source
    | "F" as char <> source
    | "G" as char <> source
    | "H" as char <> source
    | "I" as char <> source
    | "J" as char <> source
    | "K" as char <> source
    | "L" as char <> source
    | "M" as char <> source
    | "N" as char <> source
    | "O" as char <> source
    | "P" as char <> source
    | "Q" as char <> source
    | "R" as char <> source
    | "S" as char <> source
    | "T" as char <> source
    | "U" as char <> source
    | "V" as char <> source
    | "W" as char <> source
    | "X" as char <> source
    | "Y" as char <> source
    | "Z" as char <> source
    | "_" as char <> source -> lex_variable(advance(lexer, source), char)

    // A digit starts a number.
    "0" as char <> source
    | "1" as char <> source
    | "2" as char <> source
    | "3" as char <> source
    | "4" as char <> source
    | "5" as char <> source
    | "6" as char <> source
    | "7" as char <> source
    | "8" as char <> source
    | "9" as char <> source ->
      lex_number(advance(lexer, source), char, Initial, AfterNumber)

    "\"\"\"" <> source -> lex_triple_quoted_string(advance(lexer, source), None)

    "\"" <> source -> lex_string(advance(lexer, source), "")
    "'" <> source -> lex_quoted_atom(advance(lexer, source), "")

    "$" <> source -> lex_character(advance(lexer, source))

    "~" <> source -> lex_sigil(advance(lexer, source))

    // Anything else is an unknown character: record the error and emit
    // the character so output remains lossless.
    _ ->
      case string.pop_grapheme(lexer.source) {
        Error(_) -> #(lexer, EndOfFile)
        Ok(#(char, source)) -> #(
          advance(error(lexer, UnknownCharacter(char)), source),
          Unknown(char),
        )
      }
  }
}
553
/// Lex a character literal after the `$` has been consumed: either a `\`
/// escape sequence or a single grapheme. If the input ends right after `$`
/// an UnterminatedCharacter error is recorded and an empty Character is
/// produced.
fn lex_character(lexer: Lexer) -> #(Lexer, Token) {
  case lexer.source {
    "\\" <> source -> {
      let #(lexer, escape_sequence) =
        lex_escape_sequence(advance(lexer, source))
      #(lexer, Character("\\" <> escape_sequence))
    }
    _ ->
      case string.pop_grapheme(lexer.source) {
        Ok(#(char, source)) -> #(advance(lexer, source), Character(char))
        Error(_) -> #(error(lexer, UnterminatedCharacter), Character(""))
      }
  }
}
568
/// Lex the body of an escape sequence (the leading `\` has already been
/// consumed). Returns the consumed text without the backslash.
///
/// Handles, in order: `^c` control escapes, `x{...}` and `xHH` hex
/// escapes, octal escapes of up to three digits, and otherwise a single
/// escaped grapheme. An empty source records UnterminatedEscapeSequence.
fn lex_escape_sequence(lexer: Lexer) -> #(Lexer, String) {
  case lexer.source {
    // Control escapes: `^` followed by a letter or one of @ [ \ ] ^ _ ?.
    "^a" as sequence <> source
    | "^b" as sequence <> source
    | "^c" as sequence <> source
    | "^d" as sequence <> source
    | "^e" as sequence <> source
    | "^f" as sequence <> source
    | "^g" as sequence <> source
    | "^h" as sequence <> source
    | "^i" as sequence <> source
    | "^j" as sequence <> source
    | "^k" as sequence <> source
    | "^l" as sequence <> source
    | "^m" as sequence <> source
    | "^n" as sequence <> source
    | "^o" as sequence <> source
    | "^p" as sequence <> source
    | "^q" as sequence <> source
    | "^r" as sequence <> source
    | "^s" as sequence <> source
    | "^t" as sequence <> source
    | "^u" as sequence <> source
    | "^v" as sequence <> source
    | "^w" as sequence <> source
    | "^x" as sequence <> source
    | "^y" as sequence <> source
    | "^z" as sequence <> source
    | "^A" as sequence <> source
    | "^B" as sequence <> source
    | "^C" as sequence <> source
    | "^D" as sequence <> source
    | "^E" as sequence <> source
    | "^F" as sequence <> source
    | "^G" as sequence <> source
    | "^H" as sequence <> source
    | "^I" as sequence <> source
    | "^J" as sequence <> source
    | "^K" as sequence <> source
    | "^L" as sequence <> source
    | "^M" as sequence <> source
    | "^N" as sequence <> source
    | "^O" as sequence <> source
    | "^P" as sequence <> source
    | "^Q" as sequence <> source
    | "^R" as sequence <> source
    | "^S" as sequence <> source
    | "^T" as sequence <> source
    | "^U" as sequence <> source
    | "^V" as sequence <> source
    | "^W" as sequence <> source
    | "^X" as sequence <> source
    | "^Y" as sequence <> source
    | "^Z" as sequence <> source
    | "^@" as sequence <> source
    | "^[" as sequence <> source
    | "^\\" as sequence <> source
    | "^]" as sequence <> source
    | "^^" as sequence <> source
    | "^_" as sequence <> source
    | "^?" as sequence <> source -> #(advance(lexer, source), sequence)

    // `x{...}` must be checked before the plain `xHH` form.
    "x{" <> _source -> lex_brace_escape_sequence(lexer)
    "x" <> source -> lex_hex_escape_sequence(advance(lexer, source))

    // Octal escape: one to three octal digits.
    "0" as char <> source
    | "1" as char <> source
    | "2" as char <> source
    | "3" as char <> source
    | "4" as char <> source
    | "5" as char <> source
    | "6" as char <> source
    | "7" as char <> source ->
      lex_octal_escape_sequence(advance(lexer, source), char)

    // Any other single grapheme is escaped verbatim.
    _ ->
      case string.pop_grapheme(lexer.source) {
        Error(_) -> #(error(lexer, UnterminatedEscapeSequence), "")
        Ok(#(char, source)) -> #(advance(lexer, source), char)
      }
  }
}
651
/// Finish an octal escape: after the first digit, consume up to two more
/// octal digits, returning however many digits were present (1–3).
fn lex_octal_escape_sequence(lexer: Lexer, first: String) -> #(Lexer, String) {
  case extract_octal_digit(lexer) {
    Ok(#(lexer, second)) ->
      case extract_octal_digit(lexer) {
        Ok(#(lexer, third)) -> #(lexer, first <> second <> third)
        Error(_) -> #(lexer, first <> second)
      }
    Error(_) -> #(lexer, first)
  }
}
662
/// Consume a single octal digit (0–7) if one is next, otherwise leave the
/// lexer untouched and return Error.
fn extract_octal_digit(lexer: Lexer) -> Result(#(Lexer, String), Nil) {
  case lexer.source {
    "0" as char <> source
    | "1" as char <> source
    | "2" as char <> source
    | "3" as char <> source
    | "4" as char <> source
    | "5" as char <> source
    | "6" as char <> source
    | "7" as char <> source -> Ok(#(advance(lexer, source), char))
    _ -> Error(Nil)
  }
}
676
/// Lex the two hex digits of an `\xHH` escape (the `x` has been consumed).
/// A missing digit records UnterminatedEscapeSequence and returns whatever
/// was consumed so far, `x` included.
fn lex_hex_escape_sequence(lexer: Lexer) -> #(Lexer, String) {
  case extract_hex_digit(lexer) {
    Error(_) -> #(error(lexer, UnterminatedEscapeSequence), "x")
    Ok(#(lexer, first)) ->
      case extract_hex_digit(lexer) {
        Error(_) -> #(error(lexer, UnterminatedEscapeSequence), "x" <> first)
        Ok(#(lexer, second)) -> #(lexer, "x" <> first <> second)
      }
  }
}
687
/// Consume a single hex digit (0–9, a–f, A–F) if one is next, otherwise
/// leave the lexer untouched and return Error.
fn extract_hex_digit(lexer: Lexer) -> Result(#(Lexer, String), Nil) {
  case lexer.source {
    "0" as char <> source
    | "1" as char <> source
    | "2" as char <> source
    | "3" as char <> source
    | "4" as char <> source
    | "5" as char <> source
    | "6" as char <> source
    | "7" as char <> source
    | "8" as char <> source
    | "9" as char <> source
    | "a" as char <> source
    | "b" as char <> source
    | "c" as char <> source
    | "d" as char <> source
    | "e" as char <> source
    | "f" as char <> source
    | "A" as char <> source
    | "B" as char <> source
    | "C" as char <> source
    | "D" as char <> source
    | "E" as char <> source
    | "F" as char <> source -> Ok(#(advance(lexer, source), char))
    _ -> Error(Nil)
  }
}
715
/// Lex an `x{...}` escape by splitting just past the next `}` or line
/// ending; only an input with neither records UnterminatedEscapeSequence.
/// NOTE(review): a split on a newline (rather than `}`) is accepted the
/// same as a proper close here — confirm an unclosed `x{` at end of line
/// should not be an error.
fn lex_brace_escape_sequence(lexer: Lexer) -> #(Lexer, String) {
  case
    splitter.split_after(lexer.splitters.brace_escape_sequence, lexer.source)
  {
    #(before, "") -> #(error(lexer, UnterminatedEscapeSequence), before)
    #(before, after) -> #(advance(lexer, after), before)
  }
}
724
/// Which syntactic part of a numeric literal is being lexed.
type LexNumberMode {
  /// The leading integer part, before any `#`, `.`, or exponent.
  Initial
  /// After `base#`; digits are valid up to the given base (2–36).
  Radix(Int)
  /// After the decimal point of a float.
  Decimal
  /// After the `e`/`E` (and optional `-`) of a float exponent.
  Exponent
}
731
/// What the most recently consumed character of a number was. This decides
/// which continuations (`_`, `.`, `#`, exponents) are valid next, and
/// which error to report when the number ends here.
type DelimitedPosition {
  /// Directly after the `.` — no fraction digit consumed yet.
  AfterDecimal
  /// Directly after a digit — any continuation is allowed.
  AfterNumber
  /// Directly after a `_` separator — a digit must follow.
  AfterSeparator
  /// Directly after `e`/`E`/`e-`/`E-` — exponent digits must follow.
  AfterExponent
  /// Directly after `base#` — radix digits must follow.
  AfterRadix
}
739
/// Lex the remainder of a numeric literal, given the text consumed so far.
///
/// `mode` tracks which part of the number we are in and supplies the
/// radix; `position` tracks the previous character, which gates `_`
/// separators, `.`, `#`, and exponents. The digit arms use guards of the
/// form `radix >= n` so that, e.g., `f` is only a digit in base 16+.
fn lex_number(
  lexer: Lexer,
  lexed: String,
  mode: LexNumberMode,
  position: DelimitedPosition,
) -> #(Lexer, Token) {
  // Plain integers, decimals, and exponents are all base 10.
  let radix = case mode {
    Radix(r) -> r
    Initial | Decimal | Exponent -> 10
  }

  case lexer.source {
    "0" as char <> source | "1" as char <> source ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "2" as char <> source if radix >= 3 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "3" as char <> source if radix >= 4 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "4" as char <> source if radix >= 5 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "5" as char <> source if radix >= 6 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "6" as char <> source if radix >= 7 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "7" as char <> source if radix >= 8 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "8" as char <> source if radix >= 9 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "9" as char <> source if radix >= 10 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "a" as char <> source | "A" as char <> source if radix >= 11 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "b" as char <> source | "B" as char <> source if radix >= 12 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "c" as char <> source | "C" as char <> source if radix >= 13 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "d" as char <> source | "D" as char <> source if radix >= 14 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "e" as char <> source | "E" as char <> source if radix >= 15 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "f" as char <> source | "F" as char <> source if radix >= 16 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "g" as char <> source | "G" as char <> source if radix >= 17 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "h" as char <> source | "H" as char <> source if radix >= 18 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "i" as char <> source | "I" as char <> source if radix >= 19 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "j" as char <> source | "J" as char <> source if radix >= 20 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "k" as char <> source | "K" as char <> source if radix >= 21 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "l" as char <> source | "L" as char <> source if radix >= 22 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "m" as char <> source | "M" as char <> source if radix >= 23 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "n" as char <> source | "N" as char <> source if radix >= 24 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "o" as char <> source | "O" as char <> source if radix >= 25 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "p" as char <> source | "P" as char <> source if radix >= 26 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "q" as char <> source | "Q" as char <> source if radix >= 27 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "r" as char <> source | "R" as char <> source if radix >= 28 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "s" as char <> source | "S" as char <> source if radix >= 29 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "t" as char <> source | "T" as char <> source if radix >= 30 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "u" as char <> source | "U" as char <> source if radix >= 31 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "v" as char <> source | "V" as char <> source if radix >= 32 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "w" as char <> source | "W" as char <> source if radix >= 33 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "x" as char <> source | "X" as char <> source if radix >= 34 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "y" as char <> source | "Y" as char <> source if radix >= 35 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)
    "z" as char <> source | "Z" as char <> source if radix >= 36 ->
      lex_number(advance(lexer, source), lexed <> char, mode, AfterNumber)

    // `base#` radix prefix: only valid once, directly after the integer
    // part. The base itself may contain `_` separators; it must parse to
    // an integer in 2..36.
    "#" <> source if mode == Initial && position == AfterNumber ->
      case int.parse(string.replace(in: lexed, each: "_", with: "")) {
        Error(_) -> #(
          error(advance(lexer, source), InvalidRadix(lexed)),
          Integer(lexed),
        )
        Ok(radix) if radix < 2 || radix > 36 -> #(
          error(advance(lexer, source), InvalidRadix(lexed)),
          Integer(lexed),
        )
        Ok(radix) ->
          lex_number(
            advance(lexer, source),
            lexed <> "#",
            Radix(radix),
            AfterRadix,
          )
      }

    // `_` separators are only valid directly after a digit.
    "_" <> source if position == AfterNumber ->
      lex_number(advance(lexer, source), lexed <> "_", mode, AfterSeparator)

    "_" <> _ -> #(error(lexer, NumericSeparatorNotAllowed), Integer(lexed))

    // A decimal point starts the fractional part of a float.
    "." <> source if mode == Initial && position == AfterNumber ->
      lex_number(advance(lexer, source), lexed <> ".", Decimal, AfterDecimal)

    // An exponent marker, with an optional leading minus sign.
    "e-" as prefix <> source
    | "e" as prefix <> source
    | "E-" as prefix <> source
    | "E" as prefix <> source
      if mode == Decimal && position == AfterNumber
    ->
      lex_number(
        advance(lexer, source),
        lexed <> prefix,
        Exponent,
        AfterExponent,
      )

    // The number ends here; decide the token and whether the ending
    // position is valid.
    _ -> {
      let token = case mode {
        Decimal | Exponent -> Float(lexed)
        Initial | Radix(_) -> Integer(lexed)
      }
      case position {
        // If we have some code that looks like `15.`, that is valid syntax,
        // but it's an integer followed by a dot, not a float. Push the `.`
        // back onto the source so it is lexed as a Dot token.
        AfterDecimal -> #(
          advance(lexer, "." <> lexer.source),
          Integer(string.drop_end(lexed, 1)),
        )
        AfterExponent -> #(error(lexer, ExpectedExponent), token)
        AfterRadix -> #(error(lexer, NumberCannotEndAfterRadix), token)
        AfterNumber -> #(lexer, token)
        AfterSeparator -> #(error(lexer, NumericSeparatorNotAllowed), token)
      }
    }
  }
}
883
/// Lex a `~` sigil (the `~` has been consumed): an optional sigil letter
/// — `b`/`s` (escapes processed) or `B`/`S` (verbatim) — followed by
/// either a triple-quoted string or a delimited literal. A missing or
/// invalid delimiter records ExpectedSigilDelimiter.
fn lex_sigil(lexer: Lexer) -> #(Lexer, Token) {
  let #(lexer, sigil, verbatim) = case lexer.source {
    "b" as sigil <> source | "s" as sigil <> source -> #(
      advance(lexer, source),
      sigil,
      False,
    )

    // Uppercase sigils are verbatim: backslashes are not escapes.
    "B" as sigil <> source | "S" as sigil <> source -> #(
      advance(lexer, source),
      sigil,
      True,
    )
    _ -> #(lexer, "", False)
  }

  case lexer.source {
    // A sigil followed by `"""` lexes as a triple-quoted string.
    "\"\"\"" <> source ->
      lex_triple_quoted_string(advance(lexer, source), Some(sigil))
    _ -> {
      let #(lexer, delimiter, closing_char) = case lexer.source {
        "(" <> source -> #(advance(lexer, source), SigilParen, ")")
        "[" <> source -> #(advance(lexer, source), SigilSquare, "]")
        "{" <> source -> #(advance(lexer, source), SigilBrace, "}")
        "<" <> source -> #(advance(lexer, source), SigilAngle, ">")

        "/" <> source -> #(advance(lexer, source), SigilSlash, "/")
        "|" <> source -> #(advance(lexer, source), SigilPipe, "|")
        "'" <> source -> #(advance(lexer, source), SigilSingleQuote, "'")
        "\"" <> source -> #(advance(lexer, source), SigilDoubleQuote, "\"")
        "`" <> source -> #(advance(lexer, source), SigilBacktick, "`")
        "#" <> source -> #(advance(lexer, source), SigilHash, "#")

        _ -> #(error(lexer, ExpectedSigilDelimiter), SigilNone, "")
      }

      case delimiter {
        // No valid delimiter: emit an unterminated sigil with no contents.
        SigilNone -> #(
          lexer,
          UnterminatedSigil(sigil:, delimiter:, contents: ""),
        )
        _ -> {
          // Verbatim sigils use the splitter without `\`, so backslashes
          // are treated as ordinary content.
          let splitter = case verbatim {
            False -> lexer.splitters.sigil
            True -> lexer.splitters.sigil_verbatim
          }

          do_lex_sigil(lexer, sigil, delimiter, closing_char, splitter, "")
        }
      }
    }
  }
}
937
/// Consume sigil contents until the matching closing delimiter.
///
/// The splitter stops at every possible delimiter (and `\` when escapes
/// are enabled); non-matching delimiters are kept as content and lexing
/// continues. Running out of input records UnterminatedStringLiteral.
fn do_lex_sigil(
  lexer: Lexer,
  sigil: String,
  delimiter: SigilDelimiter,
  closing_char: String,
  splitter: Splitter,
  contents: String,
) -> #(Lexer, Token) {
  let #(before, split, after) = splitter.split(splitter, lexer.source)
  case split {
    // No delimiter found before end of input: unterminated.
    "" -> #(
      error(advance(lexer, after), UnterminatedStringLiteral),
      UnterminatedSigil(sigil:, delimiter:, contents: contents <> before),
    )

    // An escape: keep the backslash and the following grapheme verbatim,
    // which also prevents an escaped closing delimiter from ending the
    // sigil.
    "\\" ->
      case string.pop_grapheme(after) {
        Error(_) -> #(
          error(advance(lexer, after), UnterminatedStringLiteral),
          UnterminatedSigil(
            sigil:,
            delimiter:,
            contents: contents <> before <> "\\",
          ),
        )
        Ok(#(character, source)) ->
          do_lex_sigil(
            advance(lexer, source),
            sigil,
            delimiter,
            closing_char,
            splitter,
            contents <> before <> "\\" <> character,
          )
      }

    _ if split == closing_char -> #(
      advance(lexer, after),
      Sigil(sigil:, delimiter:, contents: contents <> before),
    )

    // Here, we've split on a delimiter which doesn't match the current sigil.
    // In this case, we must continue lexing until we find a delimiter of the
    // correct kind.
    _ ->
      do_lex_sigil(
        advance(lexer, after),
        sigil,
        delimiter,
        closing_char,
        splitter,
        contents <> before <> split,
      )
  }
}
993
/// Lex a `"..."` string literal (opening quote already consumed),
/// accumulating contents across `\` escapes until the closing quote. A
/// missing close records UnterminatedStringLiteral.
fn lex_string(lexer: Lexer, contents: String) -> #(Lexer, Token) {
  let #(before, split, after) =
    splitter.split(lexer.splitters.string, lexer.source)
  case split {
    // End of input reached before a closing quote.
    "" -> #(
      error(advance(lexer, after), UnterminatedStringLiteral),
      UnterminatedString(contents <> before),
    )

    // An escape sequence: consume it and continue lexing the string.
    "\\" -> {
      let #(lexer, escape) = lex_escape_sequence(advance(lexer, after))
      lex_string(lexer, contents <> before <> "\\" <> escape)
    }

    // The only other split is the closing `"`.
    _ -> #(advance(lexer, after), String(contents <> before))
  }
}
1011
/// Lex a triple-quoted string (the opening `"""` already consumed),
/// optionally as part of a sigil.
///
/// Extra opening quotes are counted (so `""""` etc. are supported), the
/// rest of the opening line must be whitespace, and every content line
/// must begin with the indentation of the closing quotes; that shared
/// indentation is stripped from the stored lines.
fn lex_triple_quoted_string(
  lexer: Lexer,
  sigil: Option(String),
) -> #(Lexer, Token) {
  let #(lexer, extra_quotes) = count_extra_quotes(lexer, 0)

  // The opening quotes may only be followed by whitespace on their line.
  let #(lexer, beginning_whitespace) = case
    splitter.split(lexer.splitters.until_end_of_line, lexer.source)
  {
    #(_, "", _) -> #(error(lexer, ExpectedWhitespaceAfterTripleQuote), "")
    #(before, newline, after) ->
      case is_whitespace(before) {
        True -> #(advance(lexer, after), before <> newline)
        False -> #(error(lexer, ExpectedWhitespaceAfterTripleQuote), "")
      }
  }

  let #(lexer, lines, end_indentation) =
    lex_triple_quoted_string_contents(lexer, [], "", extra_quotes)

  // Strip the closing indentation from every line; a line missing it
  // makes the whole literal invalid.
  case strip_line_prefixes(lines, end_indentation, []) {
    Error(line) -> {
      let contents =
        beginning_whitespace
        <> string.join(list.reverse(lines), "\n")
        <> "\n"
        <> end_indentation
      #(
        error(
          lexer,
          InvalidTripleQuotedStringIndentation(
            expected_indentation: end_indentation,
            line:,
          ),
        ),
        InvalidTripleQuotedString(contents),
      )
    }
    Ok(lines) -> #(
      lexer,
      TripleQuotedString(
        sigil:,
        number_of_quotes: extra_quotes + 3,
        beginning_whitespace:,
        lines:,
        end_indentation:,
      ),
    )
  }
}
1062
/// Count (and consume) any `"` characters beyond the opening `"""` of a
/// triple-quoted string delimiter, adding them to `extra`.
fn count_extra_quotes(lexer: Lexer, extra: Int) -> #(Lexer, Int) {
  case lexer.source {
    "\"" <> rest -> count_extra_quotes(advance(lexer, rest), extra + 1)
    _ -> #(lexer, extra)
  }
}
1069
/// Report whether `string` contains nothing but whitespace characters
/// (spaces, newlines, carriage returns, tabs and form feeds).
/// The empty string counts as whitespace.
fn is_whitespace(string: String) -> Bool {
  case string {
    "" -> True
    " " <> rest -> is_whitespace(rest)
    "\n" <> rest -> is_whitespace(rest)
    "\r" <> rest -> is_whitespace(rest)
    "\t" <> rest -> is_whitespace(rest)
    "\f" <> rest -> is_whitespace(rest)
    _ -> False
  }
}
1081
/// Remove `end_indentation` from the front of every line, prepending results
/// onto `acc` (so the output order is the reverse of the input order).
/// The first line that does not start with the prefix is returned as the
/// error value.
fn strip_line_prefixes(
  lines: List(String),
  end_indentation: String,
  acc: List(String),
) -> Result(List(String), String) {
  case lines {
    [first, ..rest] ->
      case strip_prefix(first, end_indentation) {
        Ok(stripped) ->
          strip_line_prefixes(rest, end_indentation, [stripped, ..acc])
        Error(_) -> Error(first)
      }
    [] -> Ok(acc)
  }
}
1096
/// Strip `prefix` from the start of `string`, implemented natively for each
/// target in the `pearl_ffi` modules. Callers treat `Error` as "string does
/// not begin with prefix" — behaviour defined by the FFI, verify there.
@external(erlang, "pearl_ffi", "strip_prefix")
@external(javascript, "./pearl_ffi.mjs", "strip_prefix")
fn strip_prefix(string: String, prefix: String) -> Result(String, Nil)
1100
/// Lex the lines of a triple-quoted string until a closing `"""` (followed
/// by any extra quotes matching the opening delimiter) preceded only by
/// whitespace on its line. Returns the lexer, the lines in reverse order,
/// and the whitespace indentation preceding the closing quotes.
fn lex_triple_quoted_string_contents(
  lexer: Lexer,
  lines: List(String),
  current_line: String,
  extra_quotes: Int,
) -> #(Lexer, List(String), String) {
  let #(before, split, after) =
    splitter.split(lexer.splitters.triple_quoted_string, lexer.source)

  let before = current_line <> before

  case split {
    "\"\"\"" -> {
      let lexer = advance(lexer, after)
      case is_whitespace(before) {
        // `"""` preceded by non-whitespace is part of the string's contents,
        // not the closing delimiter.
        False ->
          lex_triple_quoted_string_contents(
            lexer,
            lines,
            before <> "\"\"\"",
            extra_quotes,
          )
        True if extra_quotes == 0 -> #(lexer, lines, before)
        // The opening delimiter had extra quotes, so the closing delimiter
        // must have the same number; otherwise this is still content.
        True ->
          case consume_extra_quotes(lexer, extra_quotes) {
            Ok(lexer) -> #(lexer, lines, before)
            Error(Nil) ->
              lex_triple_quoted_string_contents(
                lexer,
                lines,
                before <> "\"\"\"",
                extra_quotes,
              )
          }
      }
    }

    "\n" | "\r\n" ->
      lex_triple_quoted_string_contents(
        advance(lexer, after),
        [before, ..lines],
        "",
        extra_quotes,
      )

    // End of input with no closing delimiter. Advance the lexer past the
    // consumed text (`after` is empty here) so the string's contents are not
    // lexed again as separate tokens — consistent with `lex_string`, which
    // previously advanced here while this function did not.
    _ -> #(
      error(advance(lexer, after), UnterminatedStringLiteral),
      [before, ..lines],
      "",
    )
  }
}
1149
/// Attempt to consume exactly `extra_quotes` additional `"` characters from
/// the source, returning `Error(Nil)` if fewer are available (the caller
/// then keeps its original lexer, discarding anything consumed here).
fn consume_extra_quotes(lexer: Lexer, extra_quotes: Int) -> Result(Lexer, Nil) {
  case extra_quotes == 0 {
    True -> Ok(lexer)
    False ->
      case lexer.source {
        "\"" <> rest ->
          consume_extra_quotes(advance(lexer, rest), extra_quotes - 1)
        _ -> Error(Nil)
      }
  }
}
1158
/// Lex the rest of a quoted atom (`'...'`), `contents` holding everything
/// consumed so far. Escaped characters are kept verbatim in the token text.
fn lex_quoted_atom(lexer: Lexer, contents: String) -> #(Lexer, Token) {
  let #(before, split, after) =
    splitter.split(lexer.splitters.quoted_atom, lexer.source)
  case split {
    // No closing quote anywhere in the remaining source.
    "" -> #(
      error(advance(lexer, after), UnterminatedQuotedAtom),
      UnterminatedAtom(contents <> before),
    )

    // A backslash escapes the following character, kept verbatim.
    "\\" ->
      case string.pop_grapheme(after) {
        // The source ends immediately after the backslash: the atom is
        // unterminated. Previously this raised the string-literal error and
        // dropped the text before the backslash.
        Error(_) -> #(
          error(advance(lexer, after), UnterminatedQuotedAtom),
          UnterminatedAtom(contents <> before <> "\\"),
        )
        // Continue lexing the atom. Previously this wrongly recursed into
        // `lex_string`, which looks for a closing `"` instead of `'`.
        Ok(#(character, source)) ->
          lex_quoted_atom(
            advance(lexer, source),
            contents <> before <> "\\" <> character,
          )
      }

    _ -> #(advance(lexer, after), Atom(contents <> before, True))
  }
}
1184
/// Consume a run of name characters — ASCII letters, digits, `_` and `@` —
/// appending them to `lexed`. Variables and unquoted atoms share this
/// character set after their (already consumed) first character, so both
/// `lex_variable` and `lex_atom` delegate here.
fn lex_variable_or_atom(lexer: Lexer, lexed: String) -> #(Lexer, String) {
  case lexer.source {
    "a" as char <> source
    | "b" as char <> source
    | "c" as char <> source
    | "d" as char <> source
    | "e" as char <> source
    | "f" as char <> source
    | "g" as char <> source
    | "h" as char <> source
    | "i" as char <> source
    | "j" as char <> source
    | "k" as char <> source
    | "l" as char <> source
    | "m" as char <> source
    | "n" as char <> source
    | "o" as char <> source
    | "p" as char <> source
    | "q" as char <> source
    | "r" as char <> source
    | "s" as char <> source
    | "t" as char <> source
    | "u" as char <> source
    | "v" as char <> source
    | "w" as char <> source
    | "x" as char <> source
    | "y" as char <> source
    | "z" as char <> source
    | "A" as char <> source
    | "B" as char <> source
    | "C" as char <> source
    | "D" as char <> source
    | "E" as char <> source
    | "F" as char <> source
    | "G" as char <> source
    | "H" as char <> source
    | "I" as char <> source
    | "J" as char <> source
    | "K" as char <> source
    | "L" as char <> source
    | "M" as char <> source
    | "N" as char <> source
    | "O" as char <> source
    | "P" as char <> source
    | "Q" as char <> source
    | "R" as char <> source
    | "S" as char <> source
    | "T" as char <> source
    | "U" as char <> source
    | "V" as char <> source
    | "W" as char <> source
    | "X" as char <> source
    | "Y" as char <> source
    | "Z" as char <> source
    | "0" as char <> source
    | "1" as char <> source
    | "2" as char <> source
    | "3" as char <> source
    | "4" as char <> source
    | "5" as char <> source
    | "6" as char <> source
    | "7" as char <> source
    | "8" as char <> source
    | "9" as char <> source
    | "_" as char <> source
    | "@" as char <> source ->
      lex_variable_or_atom(advance(lexer, source), lexed <> char)

    // Any other character (or end of input) ends the name.
    _ -> #(lexer, lexed)
  }
}
1256
/// Lex a variable, `char` being its already-consumed first character.
fn lex_variable(lexer: Lexer, char: String) -> #(Lexer, Token) {
  let lexed = lex_variable_or_atom(lexer, char)
  #(lexed.0, Variable(lexed.1))
}
1261
/// Lex an unquoted atom, `char` being its already-consumed first character.
/// Erlang's reserved words and named operators get their dedicated tokens;
/// anything else becomes a plain unquoted `Atom`.
fn lex_atom(lexer: Lexer, char: String) -> #(Lexer, Token) {
  let #(lexer, name) = lex_variable_or_atom(lexer, char)
  #(lexer, atom_or_keyword_token(name))
}

/// Map a lexed name to its keyword or operator token, falling back to an
/// unquoted atom for ordinary names.
fn atom_or_keyword_token(name: String) -> Token {
  case name {
    // Reserved words
    "after" -> After
    "begin" -> Begin
    "case" -> Case
    "catch" -> Catch
    "cond" -> Cond
    "else" -> Else
    "end" -> End
    "fun" -> Fun
    "if" -> If
    "let" -> Let
    "maybe" -> Maybe
    "of" -> Of
    "receive" -> Receive
    "try" -> Try
    "when" -> When

    // Named operators
    "bnot" -> Bnot
    "div" -> Div
    "rem" -> Rem
    "band" -> Band
    "bor" -> Bor
    "bxor" -> Bxor
    "bsl" -> Bsl
    "bsr" -> Bsr
    "not" -> Not
    "and" -> And
    "or" -> Or
    "xor" -> Xor
    "andalso" -> Andalso
    "orelse" -> Orelse

    _ -> Atom(name, False)
  }
}
1300
/// Consume source through the next end-of-line marker (using `split_after`,
/// so the returned text includes the terminator when one is present).
fn lex_until_end_of_line(lexer: Lexer) -> #(Lexer, String) {
  let split =
    splitter.split_after(lexer.splitters.until_end_of_line, lexer.source)
  #(advance(lexer, split.1), split.0)
}
1306
/// Lex a run of whitespace, `lexed` holding what has been consumed so far.
/// The resulting token is dropped when the lexer was configured with
/// `ignore_whitespace`.
fn lex_whitespace(lexer: Lexer, lexed: String) -> #(Lexer, Token) {
  case lexer.source {
    " " <> rest -> lex_whitespace(advance(lexer, rest), lexed <> " ")
    "\n" <> rest -> lex_whitespace(advance(lexer, rest), lexed <> "\n")
    "\r" <> rest -> lex_whitespace(advance(lexer, rest), lexed <> "\r")
    "\t" <> rest -> lex_whitespace(advance(lexer, rest), lexed <> "\t")
    "\f" <> rest -> lex_whitespace(advance(lexer, rest), lexed <> "\f")
    _ -> maybe_token(lexer, Whitespace(lexed), !lexer.ignore_whitespace)
  }
}
1318
/// Emit `token` when `condition` holds, otherwise discard it and lex the
/// next token instead (e.g. when `ignore_whitespace` is set).
fn maybe_token(lexer: Lexer, token: Token, condition: Bool) -> #(Lexer, Token) {
  case condition {
    False -> next(lexer)
    True -> #(lexer, token)
  }
}
1325
/// Replace the lexer's remaining source with `source`, keeping all other
/// state untouched.
fn advance(lexer: Lexer, source: String) -> Lexer {
  Lexer(..lexer, source: source)
}
1329
/// Record `error` against the lexer. Errors accumulate in reverse order of
/// discovery.
fn error(lexer: Lexer, error: Error) -> Lexer {
  let errors = [error, ..lexer.errors]
  Lexer(..lexer, errors:)
}
1333
/// A highlighting token, containing information about the kind of syntax
/// being used. Many similar tokens (e.g. all keywords) are grouped together
/// to simplify them.
///
/// For syntax tokens, see [`Token`](#Token).
///
pub type HighlightToken {
  /// Runs of whitespace, emitted verbatim by the highlighters.
  HighlightWhitespace(String)
  /// Reserved words such as `case` and `end`.
  HighlightKeyword(String)
  /// Variable names.
  HighlightVariable(String)
  /// String literals, including sigils and triple-quoted strings
  /// (delimiters included in the text).
  HighlightString(String)
  /// Atoms, with quotes included for quoted atoms.
  HighlightAtom(String)
  /// Integer and float literals, and arities in `name/arity` references.
  HighlightNumber(String)
  /// An atom used in module position (`module:function`).
  HighlightModule(String)
  /// An atom used in call/function position, or a `?MACRO` name.
  HighlightFunction(String)
  /// Operators, both symbolic (`=:=`) and named (`andalso`).
  HighlightOperator(String)
  /// Comments, including their leading `%` characters.
  HighlightComment(String)
  /// Brackets, commas and other punctuation.
  HighlightPunctuation(String)
  /// Characters the lexer could not recognise, emitted verbatim.
  HighlightOther(String)
}
1354
/// Convert a string of Erlang source code into ansi highlighting.
///
/// Colours taken from [`contour`](https://hexdocs.pm/contour):
/// | Token                  | Colour      |
/// | ---------------------- | ----------- |
/// | Keyword                | Yellow      |
/// | Module                 | Cyan        |
/// | Function               | Blue        |
/// | Operator               | Magenta     |
/// | Comment                | Italic grey |
/// | String, Number, Atom   | Green       |
/// | Whitespace, Variable   | No colour   |
///
/// If you wish to use other colours or another format, use `to_tokens`.
///
pub fn highlight_ansi(code: String) -> String {
  highlight_tokens(code)
  |> list.map(ansi_highlight_token)
  |> string.concat
}

/// Apply the ansi colour for a single highlight token.
fn ansi_highlight_token(token: HighlightToken) -> String {
  case token {
    HighlightKeyword(s) -> ansi.yellow(s)
    HighlightModule(s) -> ansi.cyan(s)
    HighlightFunction(s) -> ansi.blue(s)
    HighlightOperator(s) -> ansi.magenta(s)
    HighlightComment(s) -> ansi.italic(ansi.gray(s))
    HighlightString(s) | HighlightAtom(s) | HighlightNumber(s) ->
      ansi.green(s)
    HighlightWhitespace(s)
    | HighlightVariable(s)
    | HighlightPunctuation(s)
    | HighlightOther(s) -> ansi.reset(s)
  }
}
1390
/// Convert a string of Erlang source code into an HTML string.
/// Each token is wrapped in a `<span>` with a class indicating the type of
/// token.
///
/// Class names taken from [`contour`](https://hexdocs.pm/contour):
/// | Token       | CSS class      |
/// | ----------- | -------------- |
/// | Keyword     | hl-keyword     |
/// | Variable    | hl-variable    |
/// | Module      | hl-module      |
/// | Function    | hl-function    |
/// | Operator    | hl-operator    |
/// | Punctuation | hl-punctuation |
/// | Comment     | hl-comment     |
/// | String      | hl-string      |
/// | Atom        | hl-atom        |
/// | Number      | hl-number      |
/// | Whitespace  | no class       |
///
/// Place the output within a `<pre><code>...</code></pre>` and add styling for
/// these CSS classes to get highlighting on your website. Here's some CSS you
/// could use:
///
/// ```css
/// pre code .hl-comment { color: #d4d4d4; font-style: italic }
/// pre code .hl-function { color: #9ce7ff }
/// pre code .hl-keyword { color: #ffd596 }
/// pre code .hl-operator { color: #ffaff3 }
/// pre code .hl-string { color: #c8ffa7 }
/// pre code .hl-number { color: #c8ffa7 }
/// pre code .hl-atom { color: #c8ffa7 }
/// pre code .hl-module { color: #ffddfa }
/// ```
///
/// If you wish to use another format see `to_ansi` or `to_tokens`.
///
pub fn highlight_html(code: String) -> String {
  highlight_tokens(code)
  |> list.map(html_highlight_token)
  |> string.concat
}

/// Render one highlight token as HTML. Whitespace and unknown characters are
/// emitted bare; everything else is escaped and wrapped in a classed span.
fn html_highlight_token(token: HighlightToken) -> String {
  case token {
    HighlightWhitespace(s) | HighlightOther(s) -> s
    HighlightKeyword(s) -> html_span("hl-keyword", s)
    HighlightVariable(s) -> html_span("hl-variable", s)
    HighlightString(s) -> html_span("hl-string", s)
    HighlightAtom(s) -> html_span("hl-atom", s)
    HighlightNumber(s) -> html_span("hl-number", s)
    HighlightModule(s) -> html_span("hl-module", s)
    HighlightFunction(s) -> html_span("hl-function", s)
    HighlightOperator(s) -> html_span("hl-operator", s)
    HighlightComment(s) -> html_span("hl-comment", s)
    HighlightPunctuation(s) -> html_span("hl-punctuation", s)
  }
}

/// Wrap `text` in an HTML span carrying `class`, escaping the text. The
/// class attribute is intentionally unquoted, matching the original output.
fn html_span(class: String, text: String) -> String {
  "<span class=" <> class <> ">" <> houdini.escape(text) <> "</span>"
}
1455
/// Convert a string of Erlang source code into highlighting tokens.
/// Highlighting tokens only contain information about the kind of syntax
/// being used, grouping similar tokens (e.g. all keywords) into one category.
///
/// To convert code into syntax tokens, see `pearl.tokenise`.
///
pub fn highlight_tokens(code: String) -> List(HighlightToken) {
  // Lexing errors are ignored here: invalid input still highlights on a
  // best-effort basis via the Unknown/Unterminated tokens.
  let #(tokens, _errors) = code |> new |> tokenise
  do_highlight_tokens(tokens, [])
}
1466
/// Convert syntax tokens into highlight tokens, accumulating into `out` in
/// reverse and reversing once the input is exhausted.
///
/// Multi-token patterns (calls, `name/arity` references, `module:function`
/// qualifiers and `?MACRO` uses) are matched before the single-token arms so
/// their atoms are highlighted as functions/modules rather than plain atoms —
/// the order of the arms below is significant.
fn do_highlight_tokens(
  in: List(Token),
  out: List(HighlightToken),
) -> List(HighlightToken) {
  case in {
    [] -> list.reverse(out)

    // Specific constructs
    [Atom(value, quoted: False), LeftParen, ..in] ->
      do_highlight_tokens(in, [
        HighlightPunctuation("("),
        HighlightFunction(value),
        ..out
      ])
    [Atom(function, quoted: False), Slash, Integer(arity), ..in] ->
      do_highlight_tokens(in, [
        HighlightNumber(arity),
        HighlightPunctuation("/"),
        HighlightFunction(function),
        ..out
      ])
    [
      Atom(module, quoted: False),
      Colon,
      Atom(function, quoted: False),
      Slash,
      Integer(arity),
      ..in
    ] ->
      do_highlight_tokens(in, [
        HighlightNumber(arity),
        HighlightPunctuation("/"),
        HighlightFunction(function),
        HighlightPunctuation(":"),
        HighlightModule(module),
        ..out
      ])
    [Atom(module, quoted: False), Colon, Atom(function, quoted: False), ..in] ->
      do_highlight_tokens(in, [
        HighlightFunction(function),
        HighlightPunctuation(":"),
        HighlightModule(module),
        ..out
      ])
    [Question, Variable(macro_name), ..in] ->
      do_highlight_tokens(in, [
        HighlightFunction(macro_name),
        HighlightPunctuation("?"),
        ..out
      ])

    // Whitespace and comments. The lexer strips the leading `%` characters,
    // so they are restored here.
    [Whitespace(space), ..in] ->
      do_highlight_tokens(in, [HighlightWhitespace(space), ..out])
    [Comment(contents), ..in] ->
      do_highlight_tokens(in, [HighlightComment("%" <> contents), ..out])
    [DocComment(contents), ..in] ->
      do_highlight_tokens(in, [HighlightComment("%%" <> contents), ..out])
    [ModuleComment(contents), ..in] ->
      do_highlight_tokens(in, [HighlightComment("%%%" <> contents), ..out])
    // End-of-file produces no highlight output.
    [EndOfFile, ..in] -> do_highlight_tokens(in, out)

    // Literals
    [Character(char), ..in] ->
      do_highlight_tokens(in, [HighlightString("$" <> char), ..out])
    [Integer(int), ..in] ->
      do_highlight_tokens(in, [HighlightNumber(int), ..out])
    [Float(float), ..in] ->
      do_highlight_tokens(in, [HighlightNumber(float), ..out])
    [Atom(name:, quoted: True), ..in] ->
      do_highlight_tokens(in, [HighlightAtom("'" <> name <> "'"), ..out])
    [Atom(name:, quoted: False), ..in] ->
      do_highlight_tokens(in, [HighlightAtom(name), ..out])
    [String(contents), ..in] ->
      do_highlight_tokens(in, [HighlightString("\"" <> contents <> "\""), ..out])
    // Triple-quoted strings are reconstructed from their parts: optional
    // sigil, opening quotes, the stripped lines with their shared
    // indentation re-applied, and the closing quotes.
    [
      TripleQuotedString(
        sigil:,
        number_of_quotes:,
        beginning_whitespace:,
        lines:,
        end_indentation:,
      ),
      ..in
    ] ->
      do_highlight_tokens(in, [
        HighlightString(
          case sigil {
            option.None -> ""
            option.Some(sigil) -> "~" <> sigil
          }
          <> string.repeat("\"", number_of_quotes)
          <> beginning_whitespace
          <> string.join(
            list.map(lines, fn(line) { end_indentation <> line }),
            "\n",
          )
          <> "\n"
          <> end_indentation
          <> string.repeat("\"", number_of_quotes),
        ),
        ..out
      ])
    [Sigil(sigil:, delimiter:, contents:), ..in] ->
      do_highlight_tokens(in, [
        HighlightString({
          let #(opening, closing) = sigil_delimiters(delimiter)
          "~" <> sigil <> opening <> contents <> closing
        }),
        ..out
      ])
    [Variable(name), ..in] ->
      do_highlight_tokens(in, [HighlightVariable(name), ..out])

    // Keywords
    [After, ..in] -> do_highlight_tokens(in, [HighlightKeyword("after"), ..out])
    [Begin, ..in] -> do_highlight_tokens(in, [HighlightKeyword("begin"), ..out])
    [Case, ..in] -> do_highlight_tokens(in, [HighlightKeyword("case"), ..out])
    [Catch, ..in] -> do_highlight_tokens(in, [HighlightKeyword("catch"), ..out])
    [Cond, ..in] -> do_highlight_tokens(in, [HighlightKeyword("cond"), ..out])
    [Else, ..in] -> do_highlight_tokens(in, [HighlightKeyword("else"), ..out])
    [End, ..in] -> do_highlight_tokens(in, [HighlightKeyword("end"), ..out])
    [Fun, ..in] -> do_highlight_tokens(in, [HighlightKeyword("fun"), ..out])
    [If, ..in] -> do_highlight_tokens(in, [HighlightKeyword("if"), ..out])
    [Let, ..in] -> do_highlight_tokens(in, [HighlightKeyword("let"), ..out])
    [Maybe, ..in] -> do_highlight_tokens(in, [HighlightKeyword("maybe"), ..out])
    [Of, ..in] -> do_highlight_tokens(in, [HighlightKeyword("of"), ..out])
    [Receive, ..in] ->
      do_highlight_tokens(in, [HighlightKeyword("receive"), ..out])
    [Try, ..in] -> do_highlight_tokens(in, [HighlightKeyword("try"), ..out])
    [When, ..in] -> do_highlight_tokens(in, [HighlightKeyword("when"), ..out])

    // Punctuation
    [LeftParen, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("("), ..out])
    [RightParen, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation(")"), ..out])
    [LeftBrace, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("{"), ..out])
    [RightBrace, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("}"), ..out])
    [LeftSquare, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("["), ..out])
    [RightSquare, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("]"), ..out])
    [Comma, ..in] -> do_highlight_tokens(in, [HighlightPunctuation(","), ..out])
    [Semicolon, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation(";"), ..out])
    [Colon, ..in] -> do_highlight_tokens(in, [HighlightPunctuation(":"), ..out])
    [Dot, ..in] -> do_highlight_tokens(in, [HighlightPunctuation("."), ..out])
    [MinusGreater, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("->"), ..out])
    [DoubleLess, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("<<"), ..out])
    [DoubleGreater, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation(">>"), ..out])
    [Hash, ..in] -> do_highlight_tokens(in, [HighlightPunctuation("#"), ..out])
    [DoubleColon, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("::"), ..out])
    [DoubleDot, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation(".."), ..out])
    [TripleDot, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("..."), ..out])
    [Question, ..in] ->
      do_highlight_tokens(in, [HighlightPunctuation("?"), ..out])

    // Operators
    [DoublePipe, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("||"), ..out])
    [EqualGreater, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("=>"), ..out])
    [ColonEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator(":="), ..out])
    [LessMinus, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("<-"), ..out])
    [LessEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("<="), ..out])
    [Pipe, ..in] -> do_highlight_tokens(in, [HighlightOperator("|"), ..out])
    [DoubleEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("=="), ..out])
    [SlashEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("/="), ..out])
    [EqualLess, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("=<"), ..out])
    [Less, ..in] -> do_highlight_tokens(in, [HighlightOperator("<"), ..out])
    [GreaterEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator(">="), ..out])
    [Greater, ..in] -> do_highlight_tokens(in, [HighlightOperator(">"), ..out])
    [EqualColonEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("=:="), ..out])
    [EqualSlashEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("=/="), ..out])
    [Plus, ..in] -> do_highlight_tokens(in, [HighlightOperator("+"), ..out])
    [Minus, ..in] -> do_highlight_tokens(in, [HighlightOperator("-"), ..out])
    [Star, ..in] -> do_highlight_tokens(in, [HighlightOperator("*"), ..out])
    [Slash, ..in] -> do_highlight_tokens(in, [HighlightOperator("/"), ..out])
    [Bnot, ..in] -> do_highlight_tokens(in, [HighlightOperator("bnot"), ..out])
    [Div, ..in] -> do_highlight_tokens(in, [HighlightOperator("div"), ..out])
    [Rem, ..in] -> do_highlight_tokens(in, [HighlightOperator("rem"), ..out])
    [Band, ..in] -> do_highlight_tokens(in, [HighlightOperator("band"), ..out])
    [Bor, ..in] -> do_highlight_tokens(in, [HighlightOperator("bor"), ..out])
    [Bxor, ..in] -> do_highlight_tokens(in, [HighlightOperator("bxor"), ..out])
    [Bsl, ..in] -> do_highlight_tokens(in, [HighlightOperator("bsl"), ..out])
    [Bsr, ..in] -> do_highlight_tokens(in, [HighlightOperator("bsr"), ..out])
    [Not, ..in] -> do_highlight_tokens(in, [HighlightOperator("not"), ..out])
    [And, ..in] -> do_highlight_tokens(in, [HighlightOperator("and"), ..out])
    [Or, ..in] -> do_highlight_tokens(in, [HighlightOperator("or"), ..out])
    [Xor, ..in] -> do_highlight_tokens(in, [HighlightOperator("xor"), ..out])
    [Andalso, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("andalso"), ..out])
    [Orelse, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("orelse"), ..out])
    [DoublePlus, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("++"), ..out])
    [DoubleMinus, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("--"), ..out])
    [QuestionEqual, ..in] ->
      do_highlight_tokens(in, [HighlightOperator("?="), ..out])
    [Bang, ..in] -> do_highlight_tokens(in, [HighlightOperator("!"), ..out])
    [Equal, ..in] -> do_highlight_tokens(in, [HighlightOperator("="), ..out])

    // Invalid tokens, reproduced with whichever delimiters were present so
    // the highlighted output still round-trips the original source text.
    [Unknown(char), ..in] ->
      do_highlight_tokens(in, [HighlightOther(char), ..out])
    [UnterminatedString(contents), ..in] ->
      do_highlight_tokens(in, [HighlightString("\"" <> contents), ..out])
    [UnterminatedSigil(sigil:, contents:, delimiter:), ..in] ->
      do_highlight_tokens(in, [
        HighlightString({
          let #(opening, _closing) = sigil_delimiters(delimiter)
          "~" <> sigil <> opening <> contents
        }),
        ..out
      ])
    [UnterminatedAtom(contents), ..in] ->
      do_highlight_tokens(in, [HighlightAtom("'" <> contents), ..out])
    [InvalidTripleQuotedString(contents), ..in] ->
      do_highlight_tokens(in, [
        HighlightString("\"\"\"" <> contents <> "\"\"\""),
        ..out
      ])
  }
}