/* lexer.c — lexer for the hito embeddable programming language */
1#include "lexer.h" 2#include "source.h" 3#include "error.h" 4#include "util.h" 5#include <stdlib.h> 6#include <stdbool.h> 7#include <string.h> 8#include <stdio.h> 9#include <ctype.h> 10struct lexer { 11 source_t* source; 12 pos_t cur_pos; 13}; 14 15lexer_t* lexer_alloc(source_t *source) { 16 lexer_t *it = malloc(sizeof (struct lexer)); 17 if (it == NULL) { 18 die("out of memory: lexer allocation failed"); 19 } 20 it->source = source; 21 it->cur_pos.line = 0; 22 it->cur_pos.col = 0; 23 it->cur_pos.idx = 0; 24 return it; 25} 26pos_t lexer_cur_pos(lexer_t* lexer) { 27 return lexer->cur_pos; 28} 29source_t* lexer_source(lexer_t *lexer) { 30 return lexer->source; 31} 32token_t lexer_lex(lexer_t* lexer) { 33 const char* current; 34 start: current = source_contents_from(lexer->source, lexer->cur_pos); 35 if (*current == '\0') { 36 token_t tok; 37 tok.len = 0; 38 tok.start = current; 39 tok.type = TOKEN_EOF; 40 return tok; 41 } 42 if (starts_with_any(current, " ","\t","\n","\r")) { 43 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 44 goto start; 45 } 46 if (starts_with_any(current,"//")) { 47 while (*current != '\n') { 48 current++; 49 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 50 } 51 goto start; 52 } 53 54 token_t tok; 55 tok.len = 1; 56 tok.start = current; 57 tok.pos = lexer->cur_pos; 58 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 59 switch (*current) { 60 case '(': tok.type = TOKEN_LPAREN; return tok; 61 case ')': tok.type = TOKEN_RPAREN; return tok; 62 case '{': tok.type = TOKEN_LBRACE; return tok; 63 case '}': tok.type = TOKEN_RBRACE; return tok; 64 case ':': if (*(current+1) == '=') { 65 tok.type = TOKEN_ASSIGN; 66 tok.len = 2; 67 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 68 return tok; 69 } else { 70 tok.type = TOKEN_COLON; 71 return tok; 72 } 73 case '>': if (*(current+1) == '=') { 74 tok.type = TOKEN_GREATEREQ; 75 tok.len = 2; 76 lexer->cur_pos = source_next_pos(lexer->source, 
lexer->cur_pos); 77 return tok; 78 } else { 79 tok.type = TOKEN_GREATER; 80 return tok; 81 } 82 case '<': if (*(current+1) == '=') { 83 tok.type = TOKEN_LESSEQ; 84 tok.len = 2; 85 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 86 return tok; 87 } else { 88 tok.type = TOKEN_LESS; 89 return tok; 90 } 91 case '/': if (*(current+1) == '=') { 92 tok.type = TOKEN_NOTEQ; 93 tok.len = 2; 94 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 95 return tok; 96 } else { 97 tok.type = TOKEN_DIV; 98 return tok; 99 } 100 if (*(current+1) == '=') { 101 tok.type = TOKEN_NOTEQ; 102 tok.len = 2; 103 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 104 return tok; 105 } else { 106 tok.type = TOKEN_DIV; 107 return tok; 108 } 109 case '|': tok.type = TOKEN_PIPE; return tok; 110 case ',': tok.type = TOKEN_COMMA; return tok; 111 case ';': tok.type = TOKEN_SEMI; return tok; 112 case '-': if (isdigit(*(current+1))) { 113 current++; 114 tok.len++; 115 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 116 break; 117 } else { 118 tok.type = TOKEN_MINUS; return tok; 119 } 120 case '+': tok.type = TOKEN_PLUS; return tok; 121 case '=': tok.type = TOKEN_EQ; return tok; 122 case '^': tok.type = TOKEN_EXP; return tok; 123 case '*': tok.type = TOKEN_MULT; return tok; 124 case '.': if (isalnum(*(current+1)) || *(current+1) == '_') { 125 current++; 126 while (isalnum(*current) || *current == '_') { 127 tok.len++; 128 current++; 129 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 130 } 131 tok.type = TOKEN_CONSTRUCTOR; 132 return tok; 133 } else break; 134 case '\"': { 135 current++; 136 bool escaped = false; 137 while (*current != '\"' || escaped) { 138 if (*current == '\0') { 139 tok.type = TOKEN_UNRECOGNISED; 140 error_report( 141 error(ERROR_UNTERMINATED_STRING_LITERAL,lexer->source,tok.pos,1) 142 ); 143 return tok; 144 } 145 escaped = false; 146 if (*current == '\\') escaped = true; 147 current++; tok.len++; 148 
lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 149 } 150 current++; tok.len++; 151 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 152 tok.type = TOKEN_STRING_LIT; 153 return tok; 154 } 155 } 156 if (isdigit(*current)) { 157 current++; 158 bool dot_seen = false; 159 bool e_seen = false; 160 tok.type = TOKEN_INT_LIT; 161 while (isdigit(*current) 162 || !dot_seen && *current == '.' 163 || !e_seen && *current == 'E' 164 || !e_seen && *current == 'e') { 165 if (*current == '.') { 166 dot_seen = true; 167 tok.type = TOKEN_FLOAT_LIT; 168 } 169 if (*current == 'e' || *current == 'E') { 170 e_seen = true; 171 tok.type = TOKEN_FLOAT_LIT; 172 } 173 current++; tok.len++; 174 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 175 } 176 177 return tok; 178 } else if (isalpha(*current) || *current == '_') { 179 current++; 180 while (isalnum(*current) || *current == '_') { 181 current++; tok.len++; 182 lexer->cur_pos = source_next_pos(lexer->source, lexer->cur_pos); 183 } 184 if (tok.len == 2 && starts_with_any(tok.start,"if")) 185 tok.type = TOKEN_IF; 186 else if (tok.len == 3 && starts_with_any(tok.start,"and")) 187 tok.type = TOKEN_AND; 188 else if (tok.len == 3 && starts_with_any(tok.start,"mod")) 189 tok.type = TOKEN_MOD; 190 else if (tok.len == 3 && starts_with_any(tok.start,"div")) 191 tok.type = TOKEN_IDIV; 192 else if (tok.len == 2 && starts_with_any(tok.start,"or")) 193 tok.type = TOKEN_OR; 194 else 195 tok.type = TOKEN_IDENT; 196 return tok; 197 } 198 199 tok.type = TOKEN_UNRECOGNISED; 200 error_report( 201 error(ERROR_UNRECOGNISED_TOKEN,lexer->source,tok.pos,1) 202 ); 203 return tok; 204} 205 206 207token_t lexer_peek(lexer_t* lexer) { 208 pos_t pos = lexer->cur_pos; 209 token_t tok = lexer_lex(lexer); 210 printf("%s\n",lexer_token_type_to_string(tok)); 211 lexer->cur_pos = pos; 212 return tok; 213} 214void lexer_dealloc(lexer_t* lexer) { 215 free(lexer); 216} 217 218#define enum_case_str(n) case n: return #n 219const 
char* lexer_token_type_to_string(token_t to_print) { 220 switch (to_print.type) { 221 enum_case_str(TOKEN_LPAREN); 222 enum_case_str(TOKEN_RPAREN); 223 enum_case_str(TOKEN_LBRACE); 224 enum_case_str(TOKEN_RBRACE); 225 enum_case_str(TOKEN_STRING_LIT); 226 enum_case_str(TOKEN_INT_LIT); 227 enum_case_str(TOKEN_FLOAT_LIT); 228 enum_case_str(TOKEN_IDENT); 229 enum_case_str(TOKEN_ASSIGN); 230 enum_case_str(TOKEN_SEMI); 231 enum_case_str(TOKEN_PIPE); 232 enum_case_str(TOKEN_IF); 233 enum_case_str(TOKEN_AND); 234 enum_case_str(TOKEN_OR); 235 enum_case_str(TOKEN_MULT); 236 enum_case_str(TOKEN_PLUS); 237 enum_case_str(TOKEN_EXP); 238 enum_case_str(TOKEN_MINUS); 239 enum_case_str(TOKEN_DIV); 240 enum_case_str(TOKEN_IDIV); 241 enum_case_str(TOKEN_EQ); 242 enum_case_str(TOKEN_NOTEQ); 243 enum_case_str(TOKEN_MOD); 244 enum_case_str(TOKEN_LESSEQ); 245 enum_case_str(TOKEN_GREATEREQ); 246 enum_case_str(TOKEN_LESS); 247 enum_case_str(TOKEN_GREATER); 248 enum_case_str(TOKEN_COMMA); 249 enum_case_str(TOKEN_COLON); 250 enum_case_str(TOKEN_CONSTRUCTOR); 251 enum_case_str(TOKEN_UNRECOGNISED); 252 enum_case_str(TOKEN_EOF); 253 } 254}