/*
 * jcs's openbsd hax
 * openbsd
 * at jcs 662 lines 15 kB view raw
 */
1/* $OpenBSD: lex.c,v 1.35 2025/02/05 20:32:56 millert Exp $ */ 2/**************************************************************** 3Copyright (C) Lucent Technologies 1997 4All Rights Reserved 5 6Permission to use, copy, modify, and distribute this software and 7its documentation for any purpose and without fee is hereby 8granted, provided that the above copyright notice appear in all 9copies and that both that the copyright notice and this 10permission notice and warranty disclaimer appear in supporting 11documentation, and that the name Lucent Technologies or any of 12its entities not be used in advertising or publicity pertaining 13to distribution of the software without specific, written prior 14permission. 15 16LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23THIS SOFTWARE. 
24****************************************************************/ 25 26#include <stdio.h> 27#include <stdlib.h> 28#include <string.h> 29#include <ctype.h> 30#include "awk.h" 31#include "awkgram.tab.h" 32 33extern YYSTYPE yylval; 34extern bool infunc; 35 36int lineno = 1; 37int bracecnt = 0; 38int brackcnt = 0; 39int parencnt = 0; 40 41typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45} Keyword; 46 47const Keyword keywords[] = { /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "gensub", GENSUB, GENSUB }, 68 { "getline", GETLINE, GETLINE }, 69 { "gsub", GSUB, GSUB }, 70 { "if", IF, IF }, 71 { "in", IN, IN }, 72 { "index", INDEX, INDEX }, 73 { "int", FINT, BLTIN }, 74 { "length", FLENGTH, BLTIN }, 75 { "log", FLOG, BLTIN }, 76 { "lshift", FLSHIFT, BLTIN }, 77 { "match", MATCHFCN, MATCHFCN }, 78 { "mktime", FMKTIME, BLTIN }, 79 { "next", NEXT, NEXT }, 80 { "nextfile", NEXTFILE, NEXTFILE }, 81 { "or", FFOR, BLTIN }, 82 { "print", PRINT, PRINT }, 83 { "printf", PRINTF, PRINTF }, 84 { "rand", FRAND, BLTIN }, 85 { "return", RETURN, RETURN }, 86 { "rshift", FRSHIFT, BLTIN }, 87 { "sin", FSIN, BLTIN }, 88 { "split", SPLIT, SPLIT }, 89 { "sprintf", SPRINTF, SPRINTF }, 90 { "sqrt", FSQRT, BLTIN }, 91 { "srand", FSRAND, BLTIN }, 92 { "strftime", FSTRFTIME, BLTIN }, 93 { "sub", SUB, SUB }, 94 { "substr", SUBSTR, SUBSTR }, 95 { "system", FSYSTEM, BLTIN }, 96 { "systime", FSYSTIME, BLTIN }, 97 { "tolower", FTOLOWER, BLTIN }, 98 
{ "toupper", FTOUPPER, BLTIN }, 99 { "while", WHILE, WHILE }, 100 { "xor", FXOR, BLTIN }, 101}; 102 103#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 104 105static int peek(void) 106{ 107 int c = input(); 108 unput(c); 109 return c; 110} 111 112static int gettok(char **pbuf, int *psz) /* get next input token */ 113{ 114 int c, retc; 115 char *buf = *pbuf; 116 int sz = *psz; 117 char *bp = buf; 118 119 c = input(); 120 if (c == 0) 121 return 0; 122 buf[0] = c; 123 buf[1] = 0; 124 if (!isalnum(c) && c != '.' && c != '_') 125 return c; 126 127 *bp++ = c; 128 if (isalpha(c) || c == '_') { /* it's a varname */ 129 for ( ; (c = input()) != 0; ) { 130 if (bp-buf >= sz) 131 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 132 FATAL( "out of space for name %.10s...", buf ); 133 if (isalnum(c) || c == '_') 134 *bp++ = c; 135 else { 136 *bp = 0; 137 unput(c); 138 break; 139 } 140 } 141 *bp = 0; 142 retc = 'a'; /* alphanumeric */ 143 } else { /* maybe it's a number, but could be . */ 144 char *rem; 145 /* read input until can't be a number */ 146 for ( ; (c = input()) != 0; ) { 147 if (bp-buf >= sz) 148 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 149 FATAL( "out of space for number %.10s...", buf ); 150 if (isdigit(c) || c == 'e' || c == 'E' 151 || c == '.' 
|| c == '+' || c == '-') 152 *bp++ = c; 153 else { 154 unput(c); 155 break; 156 } 157 } 158 *bp = 0; 159 strtod(buf, &rem); /* parse the number */ 160 if (rem == buf) { /* it wasn't a valid number at all */ 161 buf[1] = 0; /* return one character as token */ 162 retc = (uschar)buf[0]; /* character is its own type */ 163 unputstr(rem+1); /* put rest back for later */ 164 } else { /* some prefix was a number */ 165 unputstr(rem); /* put rest back for later */ 166 rem[0] = 0; /* truncate buf after number part */ 167 retc = '0'; /* type is number */ 168 } 169 } 170 *pbuf = buf; 171 *psz = sz; 172 return retc; 173} 174 175int word(char *); 176int string(void); 177int regexpr(void); 178bool sc = false; /* true => return a } right now */ 179bool reg = false; /* true => return a REGEXPR now */ 180 181int yylex(void) 182{ 183 int c; 184 static char *buf = NULL; 185 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 186 187 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 188 FATAL( "out of space in yylex" ); 189 if (sc) { 190 sc = false; 191 RET('}'); 192 } 193 if (reg) { 194 reg = false; 195 return regexpr(); 196 } 197 for (;;) { 198 c = gettok(&buf, &bufsize); 199 if (c == 0) 200 return 0; 201 if (isalpha(c) || c == '_') 202 return word(buf); 203 if (isdigit(c)) { 204 char *cp = tostring(buf); 205 double result; 206 207 if (is_number(cp, & result)) 208 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 209 else 210 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 211 free(cp); 212 /* should this also have STR set? 
*/ 213 RET(NUMBER); 214 } 215 216 yylval.i = c; 217 switch (c) { 218 case '\n': /* {EOL} */ 219 lineno++; 220 RET(NL); 221 case '\r': /* assume \n is coming */ 222 case ' ': /* {WS}+ */ 223 case '\t': 224 break; 225 case '#': /* #.* strip comments */ 226 while ((c = input()) != '\n' && c != 0) 227 ; 228 unput(c); 229 break; 230 case ';': 231 RET(';'); 232 case '\\': 233 if (peek() == '\n') { 234 input(); 235 lineno++; 236 } else if (peek() == '\r') { 237 input(); input(); /* \n */ 238 lineno++; 239 } else { 240 RET(c); 241 } 242 break; 243 case '&': 244 if (peek() == '&') { 245 input(); RET(AND); 246 } else 247 RET('&'); 248 case '|': 249 if (peek() == '|') { 250 input(); RET(BOR); 251 } else 252 RET('|'); 253 case '!': 254 if (peek() == '=') { 255 input(); yylval.i = NE; RET(NE); 256 } else if (peek() == '~') { 257 input(); yylval.i = NOTMATCH; RET(MATCHOP); 258 } else 259 RET(NOT); 260 case '~': 261 yylval.i = MATCH; 262 RET(MATCHOP); 263 case '<': 264 if (peek() == '=') { 265 input(); yylval.i = LE; RET(LE); 266 } else { 267 yylval.i = LT; RET(LT); 268 } 269 case '=': 270 if (peek() == '=') { 271 input(); yylval.i = EQ; RET(EQ); 272 } else { 273 yylval.i = ASSIGN; RET(ASGNOP); 274 } 275 case '>': 276 if (peek() == '=') { 277 input(); yylval.i = GE; RET(GE); 278 } else if (peek() == '>') { 279 input(); yylval.i = APPEND; RET(APPEND); 280 } else { 281 yylval.i = GT; RET(GT); 282 } 283 case '+': 284 if (peek() == '+') { 285 input(); yylval.i = INCR; RET(INCR); 286 } else if (peek() == '=') { 287 input(); yylval.i = ADDEQ; RET(ASGNOP); 288 } else 289 RET('+'); 290 case '-': 291 if (peek() == '-') { 292 input(); yylval.i = DECR; RET(DECR); 293 } else if (peek() == '=') { 294 input(); yylval.i = SUBEQ; RET(ASGNOP); 295 } else 296 RET('-'); 297 case '*': 298 if (peek() == '=') { /* *= */ 299 input(); yylval.i = MULTEQ; RET(ASGNOP); 300 } else if (peek() == '*') { /* ** or **= */ 301 input(); /* eat 2nd * */ 302 if (peek() == '=') { 303 input(); yylval.i = POWEQ; 
RET(ASGNOP); 304 } else { 305 RET(POWER); 306 } 307 } else 308 RET('*'); 309 case '/': 310 RET('/'); 311 case '%': 312 if (peek() == '=') { 313 input(); yylval.i = MODEQ; RET(ASGNOP); 314 } else 315 RET('%'); 316 case '^': 317 if (peek() == '=') { 318 input(); yylval.i = POWEQ; RET(ASGNOP); 319 } else 320 RET(POWER); 321 322 case '$': 323 /* BUG: awkward, if not wrong */ 324 c = gettok(&buf, &bufsize); 325 if (isalpha(c)) { 326 if (strcmp(buf, "NF") == 0) { /* very special */ 327 unputstr("(NF)"); 328 RET(INDIRECT); 329 } 330 c = peek(); 331 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 332 unputstr(buf); 333 RET(INDIRECT); 334 } 335 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 336 RET(IVAR); 337 } else if (c == 0) { /* */ 338 SYNTAX( "unexpected end of input after $" ); 339 RET(';'); 340 } else { 341 unputstr(buf); 342 RET(INDIRECT); 343 } 344 345 case '}': 346 if (--bracecnt < 0) 347 SYNTAX( "extra }" ); 348 sc = true; 349 RET(';'); 350 case ']': 351 if (--brackcnt < 0) 352 SYNTAX( "extra ]" ); 353 RET(']'); 354 case ')': 355 if (--parencnt < 0) 356 SYNTAX( "extra )" ); 357 RET(')'); 358 case '{': 359 bracecnt++; 360 RET('{'); 361 case '[': 362 brackcnt++; 363 RET('['); 364 case '(': 365 parencnt++; 366 RET('('); 367 368 case '"': 369 return string(); /* BUG: should be like tran.c ? 
*/ 370 371 default: 372 RET(c); 373 } 374 } 375} 376 377int string(void) 378{ 379 int c, n; 380 char *s, *bp; 381 static char *buf = NULL; 382 static int bufsz = 500; 383 384 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 385 FATAL("out of space for strings"); 386 for (bp = buf; (c = input()) != '"'; ) { 387 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 388 FATAL("out of space for string %.10s...", buf); 389 switch (c) { 390 case '\n': 391 case '\r': 392 case 0: 393 *bp = '\0'; 394 SYNTAX( "non-terminated string %.10s...", buf ); 395 if (c == 0) /* hopeless */ 396 FATAL( "giving up" ); 397 lineno++; 398 break; 399 case '\\': 400 c = input(); 401 switch (c) { 402 case '\n': break; 403 case '"': *bp++ = '"'; break; 404 case 'n': *bp++ = '\n'; break; 405 case 't': *bp++ = '\t'; break; 406 case 'f': *bp++ = '\f'; break; 407 case 'r': *bp++ = '\r'; break; 408 case 'b': *bp++ = '\b'; break; 409 case 'v': *bp++ = '\v'; break; 410 case 'a': *bp++ = '\a'; break; 411 case '\\': *bp++ = '\\'; break; 412 413 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 414 case '3': case '4': case '5': case '6': case '7': 415 n = c - '0'; 416 if ((c = peek()) >= '0' && c < '8') { 417 n = 8 * n + input() - '0'; 418 if ((c = peek()) >= '0' && c < '8') 419 n = 8 * n + input() - '0'; 420 } 421 *bp++ = n; 422 break; 423 424 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 425 { 426 int i; 427 428 if (!isxdigit(peek())) { 429 unput(c); 430 break; 431 } 432 n = 0; 433 for (i = 0; i < 2; i++) { 434 c = input(); 435 if (c == 0) 436 break; 437 if (isxdigit(c)) { 438 c = tolower(c); 439 n *= 16; 440 if (isdigit(c)) 441 n += (c - '0'); 442 else 443 n += 10 + (c - 'a'); 444 } else { 445 unput(c); 446 break; 447 } 448 } 449 if (i) 450 *bp++ = n; 451 break; 452 } 453 454 case 'u': /* utf \u0-9a-fA-F (1..8) */ 455 { 456 int i; 457 458 n = 0; 459 for (i = 0; i < 8; i++) { 460 c = input(); 461 if (!isxdigit(c) || c == 0) 462 break; 463 c = tolower(c); 464 n *= 16; 465 if 
(isdigit(c)) 466 n += (c - '0'); 467 else 468 n += 10 + (c - 'a'); 469 } 470 unput(c); 471 bp += runetochar(bp, n); 472 break; 473 } 474 475 default: 476 *bp++ = c; 477 break; 478 } 479 break; 480 default: 481 *bp++ = c; 482 break; 483 } 484 } 485 *bp = 0; 486 s = tostring(buf); 487 *bp++ = ' '; *bp++ = '\0'; 488 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 489 free(s); 490 RET(STRING); 491} 492 493 494static int binsearch(char *w, const Keyword *kp, int n) 495{ 496 int cond, low, mid, high; 497 498 low = 0; 499 high = n - 1; 500 while (low <= high) { 501 mid = (low + high) / 2; 502 if ((cond = strcmp(w, kp[mid].word)) < 0) 503 high = mid - 1; 504 else if (cond > 0) 505 low = mid + 1; 506 else 507 return mid; 508 } 509 return -1; 510} 511 512int word(char *w) 513{ 514 const Keyword *kp; 515 int c, n; 516 517 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 518 if (n != -1) { /* found in table */ 519 kp = keywords + n; 520 yylval.i = kp->sub; 521 switch (kp->type) { /* special handling */ 522 case BLTIN: 523 if (kp->sub == FSYSTEM && safe) 524 SYNTAX( "system is unsafe" ); 525 RET(kp->type); 526 case FUNC: 527 if (infunc) 528 SYNTAX( "illegal nested function" ); 529 RET(kp->type); 530 case RETURN: 531 if (!infunc) 532 SYNTAX( "return not in function" ); 533 RET(kp->type); 534 case VARNF: 535 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 536 RET(VARNF); 537 default: 538 RET(kp->type); 539 } 540 } 541 c = peek(); /* look for '(' */ 542 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 543 yylval.i = n; 544 RET(ARG); 545 } else { 546 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 547 if (c == '(') { 548 RET(CALL); 549 } else { 550 RET(VAR); 551 } 552 } 553} 554 555void startreg(void) /* next call to yylex will return a regular expression */ 556{ 557 reg = true; 558} 559 560int regexpr(void) 561{ 562 int c, openclass = 0; 563 static char *buf = NULL; 564 static int bufsz = 500; 565 char *bp, *cstart; 566 567 if (buf == 
NULL && (buf = (char *) malloc(bufsz)) == NULL) 568 FATAL("out of space for reg expr"); 569 bp = buf; 570 for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) { 571 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 572 FATAL("out of space for reg expr %.10s...", buf); 573 if (c == '\n') { 574 *bp = '\0'; 575 SYNTAX( "newline in regular expression %.10s...", buf ); 576 unput('\n'); 577 break; 578 } else if (c == '\\') { 579 *bp++ = '\\'; 580 *bp++ = input(); 581 } else { 582 /* 583 * POSIX requires a slash in a regexp to be escaped, 584 * other awks don't require it to be escaped inside 585 * a character class. 586 */ 587 if (!do_posix) { 588 if (c == '[') { 589 int nextc = peek(); 590 if (openclass == 0 || nextc == ':' || 591 nextc == '.' || nextc == '=') { 592 if (++openclass == 1) 593 cstart = bp; 594 } 595 } else if (c == ']' && openclass > 0) { 596 /* 597 * A ']' as the first char in a 598 * class is treated literally. 599 */ 600 if (cstart != bp - 1 && 601 (cstart != bp - 2 || bp[-1] != '^')) 602 openclass--; 603 } 604 } 605 *bp++ = c; 606 } 607 } 608 *bp = 0; 609 if (c == 0) 610 SYNTAX("non-terminated regular expression %.10s...", buf); 611 yylval.s = tostring(buf); 612 unput('/'); 613 RET(REGEXPR); 614} 615 616/* low-level lexical stuff, sort of inherited from lex */ 617 618char ebuf[300]; 619char *ep = ebuf; 620char yysbuf[100]; /* pushback buffer */ 621char *yysptr = yysbuf; 622FILE *yyin = NULL; 623 624int input(void) /* get next lexical input character */ 625{ 626 int c; 627 extern char *lexprog; 628 629 if (yysptr > yysbuf) 630 c = (uschar)*--yysptr; 631 else if (lexprog != NULL) { /* awk '...' */ 632 if ((c = (uschar)*lexprog) != 0) 633 lexprog++; 634 } else /* awk -f ... 
*/ 635 c = pgetc(); 636 if (c == EOF) 637 c = 0; 638 if (ep >= ebuf + sizeof ebuf) 639 ep = ebuf; 640 *ep = c; 641 if (c != 0) { 642 ep++; 643 } 644 return (c); 645} 646 647void unput(int c) /* put lexical character back on input */ 648{ 649 if (yysptr >= yysbuf + sizeof(yysbuf)) 650 FATAL("pushed back too much: %.20s...", yysbuf); 651 *yysptr++ = c; 652 if (--ep < ebuf) 653 ep = ebuf + sizeof(ebuf) - 1; 654} 655 656void unputstr(const char *s) /* put a string back on input */ 657{ 658 int i; 659 660 for (i = strlen(s)-1; i >= 0; i--) 661 unput(s[i]); 662}