/* Lexer for the Hito embeddable programming language. */
1#include "lexer.h"
2#include "source.h"
3#include "error.h"
4#include "util.h"
5#include <stdlib.h>
6#include <stdbool.h>
7#include <string.h>
8#include <stdio.h>
9#include <ctype.h>
/* Lexer state: the source being scanned plus the position of the next
 * unconsumed character.  Created by lexer_alloc(), released by
 * lexer_dealloc(). */
struct lexer {
 source_t* source;  /* borrowed; not freed by lexer_dealloc() */
 pos_t cur_pos;     /* position of the next character to examine */
};
14
15lexer_t* lexer_alloc(source_t *source) {
16 lexer_t *it = malloc(sizeof (struct lexer));
17 if (it == NULL) {
18 die("out of memory: lexer allocation failed");
19 }
20 it->source = source;
21 it->cur_pos.line = 0;
22 it->cur_pos.col = 0;
23 it->cur_pos.idx = 0;
24 return it;
25}
26pos_t lexer_cur_pos(lexer_t* lexer) {
27 return lexer->cur_pos;
28}
29source_t* lexer_source(lexer_t *lexer) {
30 return lexer->source;
31}
32token_t lexer_lex(lexer_t* lexer) {
33 const char* current;
34 start: current = source_contents_from(lexer->source, lexer->cur_pos);
35 if (*current == '\0') {
36 token_t tok;
37 tok.len = 0;
38 tok.start = current;
39 tok.type = TOKEN_EOF;
40 return tok;
41 }
42 if (starts_with_any(current, " ","\t","\n","\r")) {
43 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
44 goto start;
45 }
46 if (starts_with_any(current,"//")) {
47 while (*current != '\n') {
48 current++;
49 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
50 }
51 goto start;
52 }
53
54 token_t tok;
55 tok.len = 1;
56 tok.start = current;
57 tok.pos = lexer->cur_pos;
58 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
59 switch (*current) {
60 case '(': tok.type = TOKEN_LPAREN; return tok;
61 case ')': tok.type = TOKEN_RPAREN; return tok;
62 case '{': tok.type = TOKEN_LBRACE; return tok;
63 case '}': tok.type = TOKEN_RBRACE; return tok;
64 case ':': if (*(current+1) == '=') {
65 tok.type = TOKEN_ASSIGN;
66 tok.len = 2;
67 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
68 return tok;
69 } else {
70 tok.type = TOKEN_COLON;
71 return tok;
72 }
73 case '>': if (*(current+1) == '=') {
74 tok.type = TOKEN_GREATEREQ;
75 tok.len = 2;
76 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
77 return tok;
78 } else {
79 tok.type = TOKEN_GREATER;
80 return tok;
81 }
82 case '<': if (*(current+1) == '=') {
83 tok.type = TOKEN_LESSEQ;
84 tok.len = 2;
85 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
86 return tok;
87 } else {
88 tok.type = TOKEN_LESS;
89 return tok;
90 }
91 case '/': if (*(current+1) == '=') {
92 tok.type = TOKEN_NOTEQ;
93 tok.len = 2;
94 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
95 return tok;
96 } else {
97 tok.type = TOKEN_DIV;
98 return tok;
99 }
100 if (*(current+1) == '=') {
101 tok.type = TOKEN_NOTEQ;
102 tok.len = 2;
103 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
104 return tok;
105 } else {
106 tok.type = TOKEN_DIV;
107 return tok;
108 }
109 case '|': tok.type = TOKEN_PIPE; return tok;
110 case ',': tok.type = TOKEN_COMMA; return tok;
111 case ';': tok.type = TOKEN_SEMI; return tok;
112 case '-': if (isdigit(*(current+1))) {
113 current++;
114 tok.len++;
115 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
116 break;
117 } else {
118 tok.type = TOKEN_MINUS; return tok;
119 }
120 case '+': tok.type = TOKEN_PLUS; return tok;
121 case '=': tok.type = TOKEN_EQ; return tok;
122 case '^': tok.type = TOKEN_EXP; return tok;
123 case '*': tok.type = TOKEN_MULT; return tok;
124 case '.': if (isalnum(*(current+1)) || *(current+1) == '_') {
125 current++;
126 while (isalnum(*current) || *current == '_') {
127 tok.len++;
128 current++;
129 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
130 }
131 tok.type = TOKEN_CONSTRUCTOR;
132 return tok;
133 } else break;
134 case '\"': {
135 current++;
136 bool escaped = false;
137 while (*current != '\"' || escaped) {
138 if (*current == '\0') {
139 tok.type = TOKEN_UNRECOGNISED;
140 error_report(
141 error(ERROR_UNTERMINATED_STRING_LITERAL,lexer->source,tok.pos,1)
142 );
143 return tok;
144 }
145 escaped = false;
146 if (*current == '\\') escaped = true;
147 current++; tok.len++;
148 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
149 }
150 current++; tok.len++;
151 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
152 tok.type = TOKEN_STRING_LIT;
153 return tok;
154 }
155 }
156 if (isdigit(*current)) {
157 current++;
158 bool dot_seen = false;
159 bool e_seen = false;
160 tok.type = TOKEN_INT_LIT;
161 while (isdigit(*current)
162 || !dot_seen && *current == '.'
163 || !e_seen && *current == 'E'
164 || !e_seen && *current == 'e') {
165 if (*current == '.') {
166 dot_seen = true;
167 tok.type = TOKEN_FLOAT_LIT;
168 }
169 if (*current == 'e' || *current == 'E') {
170 e_seen = true;
171 tok.type = TOKEN_FLOAT_LIT;
172 }
173 current++; tok.len++;
174 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
175 }
176
177 return tok;
178 } else if (isalpha(*current) || *current == '_') {
179 current++;
180 while (isalnum(*current) || *current == '_') {
181 current++; tok.len++;
182 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos);
183 }
184 if (tok.len == 2 && starts_with_any(tok.start,"if"))
185 tok.type = TOKEN_IF;
186 else if (tok.len == 3 && starts_with_any(tok.start,"and"))
187 tok.type = TOKEN_AND;
188 else if (tok.len == 3 && starts_with_any(tok.start,"mod"))
189 tok.type = TOKEN_MOD;
190 else if (tok.len == 3 && starts_with_any(tok.start,"div"))
191 tok.type = TOKEN_IDIV;
192 else if (tok.len == 2 && starts_with_any(tok.start,"or"))
193 tok.type = TOKEN_OR;
194 else
195 tok.type = TOKEN_IDENT;
196 return tok;
197 }
198
199 tok.type = TOKEN_UNRECOGNISED;
200 error_report(
201 error(ERROR_UNRECOGNISED_TOKEN,lexer->source,tok.pos,1)
202 );
203 return tok;
204}
205
206
207token_t lexer_peek(lexer_t* lexer) {
208 pos_t pos = lexer->cur_pos;
209 token_t tok = lexer_lex(lexer);
210 printf("%s\n",lexer_token_type_to_string(tok));
211 lexer->cur_pos = pos;
212 return tok;
213}
/* Free the lexer itself.  The attached source is NOT freed here — it is
 * borrowed (see lexer_alloc) and remains owned by the caller. */
void lexer_dealloc(lexer_t* lexer) {
 free(lexer);
}
217
218#define enum_case_str(n) case n: return #n
219const char* lexer_token_type_to_string(token_t to_print) {
220 switch (to_print.type) {
221 enum_case_str(TOKEN_LPAREN);
222 enum_case_str(TOKEN_RPAREN);
223 enum_case_str(TOKEN_LBRACE);
224 enum_case_str(TOKEN_RBRACE);
225 enum_case_str(TOKEN_STRING_LIT);
226 enum_case_str(TOKEN_INT_LIT);
227 enum_case_str(TOKEN_FLOAT_LIT);
228 enum_case_str(TOKEN_IDENT);
229 enum_case_str(TOKEN_ASSIGN);
230 enum_case_str(TOKEN_SEMI);
231 enum_case_str(TOKEN_PIPE);
232 enum_case_str(TOKEN_IF);
233 enum_case_str(TOKEN_AND);
234 enum_case_str(TOKEN_OR);
235 enum_case_str(TOKEN_MULT);
236 enum_case_str(TOKEN_PLUS);
237 enum_case_str(TOKEN_EXP);
238 enum_case_str(TOKEN_MINUS);
239 enum_case_str(TOKEN_DIV);
240 enum_case_str(TOKEN_IDIV);
241 enum_case_str(TOKEN_EQ);
242 enum_case_str(TOKEN_NOTEQ);
243 enum_case_str(TOKEN_MOD);
244 enum_case_str(TOKEN_LESSEQ);
245 enum_case_str(TOKEN_GREATEREQ);
246 enum_case_str(TOKEN_LESS);
247 enum_case_str(TOKEN_GREATER);
248 enum_case_str(TOKEN_COMMA);
249 enum_case_str(TOKEN_COLON);
250 enum_case_str(TOKEN_CONSTRUCTOR);
251 enum_case_str(TOKEN_UNRECOGNISED);
252 enum_case_str(TOKEN_EOF);
253 }
254}