usr.bin/awk/lex.c at jcs · jcs.org/openbsd-src

jcs.org / openbsd-src
fork atom
jcs's openbsd hax
openbsd
fork atom
openbsd-src / usr.bin / awk / lex.c
at jcs 662 lines 15 kB view raw
wrap content
millert Update awk to the Jan 14, 2025 version. * Fix incorrect error line number issues. * Fix hex detection in is_valid_number. * Fix indirect field specification with non-numeric string in indirect. * Fixed openfile to not try to read from a directory. 1y ago
4a7e1005
  1/*	$OpenBSD: lex.c,v 1.35 2025/02/05 20:32:56 millert Exp $	*/
  2/****************************************************************
  3Copyright (C) Lucent Technologies 1997
  4All Rights Reserved
  5
  6Permission to use, copy, modify, and distribute this software and
  7its documentation for any purpose and without fee is hereby
  8granted, provided that the above copyright notice appear in all
  9copies and that both that the copyright notice and this
 10permission notice and warranty disclaimer appear in supporting
 11documentation, and that the name Lucent Technologies or any of
 12its entities not be used in advertising or publicity pertaining
 13to distribution of the software without specific, written prior
 14permission.
 15
 16LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 17INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
 18IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
 19SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 20WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
 21IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 22ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 23THIS SOFTWARE.
 24****************************************************************/
 25
 26#include <stdio.h>
 27#include <stdlib.h>
 28#include <string.h>
 29#include <ctype.h>
 30#include "awk.h"
 31#include "awkgram.tab.h"
 32
extern YYSTYPE	yylval;	/* value of the current token; filled in by the lexer routines below */
extern bool	infunc;	/* defined elsewhere; consulted by yylex() and word() */

int	lineno	= 1;	/* current input line; bumped by yylex() and string() */
int	bracecnt = 0;	/* '{' seen so far minus '}' seen so far */
int	brackcnt  = 0;	/* '[' seen so far minus ']' seen so far */
int	parencnt = 0;	/* '(' seen so far minus ')' seen so far */
 40
/*
 * One entry of the reserved-word table.  For a matching name, word()
 * stores `sub' in yylval.i and returns `type' as the token.
 */
typedef struct Keyword {
	const char *word;	/* spelling of the name in awk source */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token code word() returns */
} Keyword;
 46
/*
 * Reserved words and built-in function names, looked up by word()
 * through binsearch().  Entries must stay in strcmp() order, which
 * puts the upper-case names ahead of the lower-case ones.
 */
const Keyword keywords[] = {	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "gensub",	GENSUB,		GENSUB },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "mktime",	FMKTIME,	BLTIN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "strftime",	FSTRFTIME,	BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "systime",	FSYSTIME,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
102
/*
 * Return token x from the enclosing function, printing its name first
 * when dbg is nonzero.  The body is wrapped in do { } while (0) so that
 * "RET(x);" is exactly one statement and remains valid as the unbraced
 * branch of an if/else.  Note that x is evaluated twice when dbg is set.
 */
#define	RET(x)	do { if(dbg)printf("lex %s\n", tokname(x)); return(x); } while (0)
104
/*
 * peek - report the next input character without consuming it.
 * The character is read with input() and immediately pushed back
 * with unput(); a return of 0 means end of input.
 */
static int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
111
112static int gettok(char **pbuf, int *psz)	/* get next input token */
113{
114	int c, retc;
115	char *buf = *pbuf;
116	int sz = *psz;
117	char *bp = buf;
118
119	c = input();
120	if (c == 0)
121		return 0;
122	buf[0] = c;
123	buf[1] = 0;
124	if (!isalnum(c) && c != '.' && c != '_')
125		return c;
126
127	*bp++ = c;
128	if (isalpha(c) || c == '_') {	/* it's a varname */
129		for ( ; (c = input()) != 0; ) {
130			if (bp-buf >= sz)
131				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132					FATAL( "out of space for name %.10s...", buf );
133			if (isalnum(c) || c == '_')
134				*bp++ = c;
135			else {
136				*bp = 0;
137				unput(c);
138				break;
139			}
140		}
141		*bp = 0;
142		retc = 'a';	/* alphanumeric */
143	} else {	/* maybe it's a number, but could be . */
144		char *rem;
145		/* read input until can't be a number */
146		for ( ; (c = input()) != 0; ) {
147			if (bp-buf >= sz)
148				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149					FATAL( "out of space for number %.10s...", buf );
150			if (isdigit(c) || c == 'e' || c == 'E'
151			  || c == '.' || c == '+' || c == '-')
152				*bp++ = c;
153			else {
154				unput(c);
155				break;
156			}
157		}
158		*bp = 0;
159		strtod(buf, &rem);	/* parse the number */
160		if (rem == buf) {	/* it wasn't a valid number at all */
161			buf[1] = 0;	/* return one character as token */
162			retc = (uschar)buf[0];	/* character is its own type */
163			unputstr(rem+1); /* put rest back for later */
164		} else {	/* some prefix was a number */
165			unputstr(rem);	/* put rest back for later */
166			rem[0] = 0;	/* truncate buf after number part */
167			retc = '0';	/* type is number */
168		}
169	}
170	*pbuf = buf;
171	*psz = sz;
172	return retc;
173}
174
int	word(char *);	/* classify a name: keyword, ARG, CALL or VAR */
int	string(void);	/* lex the remainder of a "..." literal */
int	regexpr(void);	/* lex the remainder of a /.../ regular expression */
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
180
/*
 * yylex - return the next token to the parser.
 *
 * Returns 0 at end of input, otherwise a token code; yylval carries the
 * token's value where one applies.  Two one-shot flags are honoured
 * before any input is read:
 *   sc  - set when a '}' was seen; that call returned ';' and this one
 *         returns the '}' itself.
 *   reg - set by startreg(); the next token is lexed by regexpr().
 * Blanks, tabs, carriage returns, comments and backslash-newline are
 * consumed without producing a token.
 */
int yylex(void)
{
	int c;
	static char *buf = NULL;
	static int bufsize = 5; /* BUG: setting this small causes core dump! */

	/* the token buffer is allocated once and grown by gettok() */
	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
		FATAL( "out of space in yylex" );
	if (sc) {
		sc = false;
		RET('}');
	}
	if (reg) {
		reg = false;
		return regexpr();
	}
	for (;;) {
		c = gettok(&buf, &bufsize);
		if (c == 0)
			return 0;
		/* gettok() reports names as 'a' and numbers as '0' */
		if (isalpha(c) || c == '_')
			return word(buf);
		if (isdigit(c)) {
			char *cp = tostring(buf);
			double result;

			if (is_number(cp, & result))
				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
			else
				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
			free(cp);
			/* should this also have STR set? */
			RET(NUMBER);
		}

		/* single-character token; cases below refine it by lookahead */
		yylval.i = c;
		switch (c) {
		case '\n':	/* {EOL} */
			lineno++;
			RET(NL);
		case '\r':	/* assume \n is coming */
		case ' ':	/* {WS}+ */
		case '\t':
			break;
		case '#':	/* #.* strip comments */
			while ((c = input()) != '\n' && c != 0)
				;
			/* leave the newline (or end of input) for the next pass */
			unput(c);
			break;
		case ';':
			RET(';');
		case '\\':
			/* backslash-newline is a line continuation */
			if (peek() == '\n') {
				input();
				lineno++;
			} else if (peek() == '\r') {
				input(); input();	/* \n */
				lineno++;
			} else {
				RET(c);
			}
			break;
		case '&':
			if (peek() == '&') {
				input(); RET(AND);
			} else
				RET('&');
		case '|':
			if (peek() == '|') {
				input(); RET(BOR);
			} else
				RET('|');
		case '!':
			if (peek() == '=') {
				input(); yylval.i = NE; RET(NE);
			} else if (peek() == '~') {
				input(); yylval.i = NOTMATCH; RET(MATCHOP);
			} else
				RET(NOT);
		case '~':
			yylval.i = MATCH;
			RET(MATCHOP);
		case '<':
			if (peek() == '=') {
				input(); yylval.i = LE; RET(LE);
			} else {
				yylval.i = LT; RET(LT);
			}
		case '=':
			if (peek() == '=') {
				input(); yylval.i = EQ; RET(EQ);
			} else {
				yylval.i = ASSIGN; RET(ASGNOP);
			}
		case '>':
			if (peek() == '=') {
				input(); yylval.i = GE; RET(GE);
			} else if (peek() == '>') {
				input(); yylval.i = APPEND; RET(APPEND);
			} else {
				yylval.i = GT; RET(GT);
			}
		case '+':
			if (peek() == '+') {
				input(); yylval.i = INCR; RET(INCR);
			} else if (peek() == '=') {
				input(); yylval.i = ADDEQ; RET(ASGNOP);
			} else
				RET('+');
		case '-':
			if (peek() == '-') {
				input(); yylval.i = DECR; RET(DECR);
			} else if (peek() == '=') {
				input(); yylval.i = SUBEQ; RET(ASGNOP);
			} else
				RET('-');
		case '*':
			if (peek() == '=') {	/* *= */
				input(); yylval.i = MULTEQ; RET(ASGNOP);
			} else if (peek() == '*') {	/* ** or **= */
				input();	/* eat 2nd * */
				if (peek() == '=') {
					input(); yylval.i = POWEQ; RET(ASGNOP);
				} else {
					RET(POWER);
				}
			} else
				RET('*');
		case '/':
			RET('/');
		case '%':
			if (peek() == '=') {
				input(); yylval.i = MODEQ; RET(ASGNOP);
			} else
				RET('%');
		case '^':
			if (peek() == '=') {
				input(); yylval.i = POWEQ; RET(ASGNOP);
			} else
				RET(POWER);

		case '$':
			/* BUG: awkward, if not wrong */
			c = gettok(&buf, &bufsize);
			if (isalpha(c)) {
				/* $NF is re-lexed as $(NF) */
				if (strcmp(buf, "NF") == 0) {	/* very special */
					unputstr("(NF)");
					RET(INDIRECT);
				}
				c = peek();
				/* $name( , $name[ or $arg: push the name back and re-lex it */
				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
					unputstr(buf);
					RET(INDIRECT);
				}
				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
				RET(IVAR);
			} else if (c == 0) {	/*  */
				SYNTAX( "unexpected end of input after $" );
				RET(';');
			} else {
				unputstr(buf);
				RET(INDIRECT);
			}

		case '}':
			if (--bracecnt < 0)
				SYNTAX( "extra }" );
			/* return ';' now; the next call returns the '}' (see sc above) */
			sc = true;
			RET(';');
		case ']':
			if (--brackcnt < 0)
				SYNTAX( "extra ]" );
			RET(']');
		case ')':
			if (--parencnt < 0)
				SYNTAX( "extra )" );
			RET(')');
		case '{':
			bracecnt++;
			RET('{');
		case '[':
			brackcnt++;
			RET('[');
		case '(':
			parencnt++;
			RET('(');

		case '"':
			return string();	/* BUG: should be like tran.c ? */

		default:
			RET(c);
		}
	}
}
376
377int string(void)
378{
379	int c, n;
380	char *s, *bp;
381	static char *buf = NULL;
382	static int bufsz = 500;
383
384	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
385		FATAL("out of space for strings");
386	for (bp = buf; (c = input()) != '"'; ) {
387		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
388			FATAL("out of space for string %.10s...", buf);
389		switch (c) {
390		case '\n':
391		case '\r':
392		case 0:
393			*bp = '\0';
394			SYNTAX( "non-terminated string %.10s...", buf );
395			if (c == 0)	/* hopeless */
396				FATAL( "giving up" );
397			lineno++;
398			break;
399		case '\\':
400			c = input();
401			switch (c) {
402			case '\n': break;
403			case '"': *bp++ = '"'; break;
404			case 'n': *bp++ = '\n'; break;
405			case 't': *bp++ = '\t'; break;
406			case 'f': *bp++ = '\f'; break;
407			case 'r': *bp++ = '\r'; break;
408			case 'b': *bp++ = '\b'; break;
409			case 'v': *bp++ = '\v'; break;
410			case 'a': *bp++ = '\a'; break;
411			case '\\': *bp++ = '\\'; break;
412
413			case '0': case '1': case '2': /* octal: \d \dd \ddd */
414			case '3': case '4': case '5': case '6': case '7':
415				n = c - '0';
416				if ((c = peek()) >= '0' && c < '8') {
417					n = 8 * n + input() - '0';
418					if ((c = peek()) >= '0' && c < '8')
419						n = 8 * n + input() - '0';
420				}
421				*bp++ = n;
422				break;
423
424			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
425			    {
426				int i;
427
428				if (!isxdigit(peek())) {
429					unput(c);
430					break;
431				}
432				n = 0;
433				for (i = 0; i < 2; i++) {
434					c = input();
435					if (c == 0)
436						break;
437					if (isxdigit(c)) {
438						c = tolower(c);
439						n *= 16;
440						if (isdigit(c))
441							n += (c - '0');
442						else
443							n += 10 + (c - 'a');
444					} else {
445						unput(c);
446						break;
447					}
448				}
449				if (i)
450					*bp++ = n;
451				break;
452			    }
453
454			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
455			    {
456				int i;
457
458				n = 0;
459				for (i = 0; i < 8; i++) {
460					c = input();
461					if (!isxdigit(c) || c == 0)
462						break;
463					c = tolower(c);
464					n *= 16;
465					if (isdigit(c))
466						n += (c - '0');
467					else
468						n += 10 + (c - 'a');
469				}
470				unput(c);
471				bp += runetochar(bp, n);
472				break;
473			    }
474
475			default:
476				*bp++ = c;
477				break;
478			}
479			break;
480		default:
481			*bp++ = c;
482			break;
483		}
484	}
485	*bp = 0;
486	s = tostring(buf);
487	*bp++ = ' '; *bp++ = '\0';
488	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
489	free(s);
490	RET(STRING);
491}
492
493
494static int binsearch(char *w, const Keyword *kp, int n)
495{
496	int cond, low, mid, high;
497
498	low = 0;
499	high = n - 1;
500	while (low <= high) {
501		mid = (low + high) / 2;
502		if ((cond = strcmp(w, kp[mid].word)) < 0)
503			high = mid - 1;
504		else if (cond > 0)
505			low = mid + 1;
506		else
507			return mid;
508	}
509	return -1;
510}
511
512int word(char *w)
513{
514	const Keyword *kp;
515	int c, n;
516
517	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
518	if (n != -1) {	/* found in table */
519		kp = keywords + n;
520		yylval.i = kp->sub;
521		switch (kp->type) {	/* special handling */
522		case BLTIN:
523			if (kp->sub == FSYSTEM && safe)
524				SYNTAX( "system is unsafe" );
525			RET(kp->type);
526		case FUNC:
527			if (infunc)
528				SYNTAX( "illegal nested function" );
529			RET(kp->type);
530		case RETURN:
531			if (!infunc)
532				SYNTAX( "return not in function" );
533			RET(kp->type);
534		case VARNF:
535			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
536			RET(VARNF);
537		default:
538			RET(kp->type);
539		}
540	}
541	c = peek();	/* look for '(' */
542	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
543		yylval.i = n;
544		RET(ARG);
545	} else {
546		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
547		if (c == '(') {
548			RET(CALL);
549		} else {
550			RET(VAR);
551		}
552	}
553}
554
555void startreg(void)	/* next call to yylex will return a regular expression */
556{
557	reg = true;
558}
559
/*
 * regexpr - lex the body of a regular expression up to its closing '/'.
 *
 * Called from yylex() when `reg' is set.  Characters are copied
 * verbatim into a growable static buffer; a backslash and the character
 * after it are copied as a pair, so an escaped '/' does not end the
 * expression.  Unless do_posix is set, a '/' inside a bracket
 * expression does not end it either.  A newline or end of input before
 * the closing '/' is reported with SYNTAX().
 *
 * On return yylval.s holds the result of tostring() on the collected
 * text, a '/' has been pushed back onto the input, and the token is
 * REGEXPR.
 */
int regexpr(void)
{
	int c, openclass = 0;	/* openclass: nesting depth of [ ... ] */
	static char *buf = NULL;
	static int bufsz = 500;
	char *bp, *cstart;	/* cstart: where the outermost '[' was stored; set before any read */

	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
		FATAL("out of space for reg expr");
	bp = buf;
	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
		/* +3: up to two bytes stored per pass plus the terminator */
		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
			FATAL("out of space for reg expr %.10s...", buf);
		if (c == '\n') {
			*bp = '\0';
			SYNTAX( "newline in regular expression %.10s...", buf );
			/* give the newline back so yylex() still sees it */
			unput('\n');
			break;
		} else if (c == '\\') {
			*bp++ = '\\';
			*bp++ = input();
		} else {
			/*
			 * POSIX requires a slash in a regexp to be escaped,
			 * other awks don't require it to be escaped inside
			 * a character class.
			 */
			if (!do_posix) {
				if (c == '[') {
					int nextc = peek();
					/* count the outermost '[' and nested [: [. [= openers */
					if (openclass == 0 || nextc == ':' ||
					    nextc == '.' || nextc == '=') {
						if (++openclass == 1)
							cstart = bp;
					}
				} else if (c == ']' && openclass > 0) {
					/*
					 * A ']' as the first char in a
					 * class is treated literally.
					 */
					if (cstart != bp - 1 &&
					    (cstart != bp - 2 || bp[-1] != '^'))
						openclass--;
				}
			}
			*bp++ = c;
		}
	}
	*bp = 0;
	if (c == 0)
		SYNTAX("non-terminated regular expression %.10s...", buf);
	yylval.s = tostring(buf);
	unput('/');
	RET(REGEXPR);
}
615
616/* low-level lexical stuff, sort of inherited from lex */
617
char	ebuf[300];	/* circular record of characters read; written by input() */
char	*ep = ebuf;	/* next slot in ebuf; advanced by input(), backed up by unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;	/* not referenced by the code visible in this file */
623
624int input(void)	/* get next lexical input character */
625{
626	int c;
627	extern char *lexprog;
628
629	if (yysptr > yysbuf)
630		c = (uschar)*--yysptr;
631	else if (lexprog != NULL) {	/* awk '...' */
632		if ((c = (uschar)*lexprog) != 0)
633			lexprog++;
634	} else				/* awk -f ... */
635		c = pgetc();
636	if (c == EOF)
637		c = 0;
638	if (ep >= ebuf + sizeof ebuf)
639		ep = ebuf;
640	*ep = c;
641	if (c != 0) {
642		ep++;
643	}
644	return (c);
645}
646
647void unput(int c)	/* put lexical character back on input */
648{
649	if (yysptr >= yysbuf + sizeof(yysbuf))
650		FATAL("pushed back too much: %.20s...", yysbuf);
651	*yysptr++ = c;
652	if (--ep < ebuf)
653		ep = ebuf + sizeof(ebuf) - 1;
654}
655
/*
 * unputstr - push the whole string s back onto the input.
 * Characters are handed to unput() last-to-first so that input()
 * returns them again in their original order.
 */
void unputstr(const char *s)
{
	int k = strlen(s);

	while (--k >= 0)
		unput(s[k]);
}