]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/one-true-awk/lex.c
Merge llvm-project main llvmorg-14-init-18294-gdb01b123d012
[FreeBSD/FreeBSD.git] / contrib / one-true-awk / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31
/* Parser value slot and "inside a function definition" flag; both are defined outside this file. */
extern YYSTYPE  yylval;
extern bool     infunc;

/*
 * Lexer bookkeeping.  lineno is advanced by yylex() for each newline it
 * consumes (and stepped back by unput() when a newline is pushed back).
 * The three counters track currently open {, [ and ( respectively; yylex()
 * reports a syntax error when a closer drives one of them negative.
 */
int     lineno  = 1;
int     bracecnt = 0;
int     brackcnt  = 0;
int     parencnt = 0;
39
/* One entry of the reserved-word / builtin table searched by word(). */
typedef struct Keyword {
        const char *word;       /* spelling, compared with strcmp() */
        int     sub;            /* value word() stores in yylval.i */
        int     type;           /* token code word() returns to the parser */
} Keyword;
45
/*
 * Reserved words and builtin function names.
 *
 * word() looks names up here with binsearch(), which compares using
 * strcmp(), so entries must stay in strcmp() order (upper-case names
 * therefore precede lower-case ones).  "func" and "function" are two
 * spellings of the same token.
 */
const Keyword keywords[] = {    /* keep sorted: binary searched */
        { "BEGIN",      XBEGIN,         XBEGIN },
        { "END",        XEND,           XEND },
        { "NF",         VARNF,          VARNF },
        { "and",        FAND,           BLTIN },
        { "atan2",      FATAN,          BLTIN },
        { "break",      BREAK,          BREAK },
        { "close",      CLOSE,          CLOSE },
        { "compl",      FCOMPL,         BLTIN },
        { "continue",   CONTINUE,       CONTINUE },
        { "cos",        FCOS,           BLTIN },
        { "delete",     DELETE,         DELETE },
        { "do",         DO,             DO },
        { "else",       ELSE,           ELSE },
        { "exit",       EXIT,           EXIT },
        { "exp",        FEXP,           BLTIN },
        { "fflush",     FFLUSH,         BLTIN },
        { "for",        FOR,            FOR },
        { "func",       FUNC,           FUNC },
        { "function",   FUNC,           FUNC },
        { "gensub",     GENSUB,         GENSUB },
        { "getline",    GETLINE,        GETLINE },
        { "gsub",       GSUB,           GSUB },
        { "if",         IF,             IF },
        { "in",         IN,             IN },
        { "index",      INDEX,          INDEX },
        { "int",        FINT,           BLTIN },
        { "length",     FLENGTH,        BLTIN },
        { "log",        FLOG,           BLTIN },
        { "lshift",     FLSHIFT,        BLTIN },
        { "match",      MATCHFCN,       MATCHFCN },
        { "next",       NEXT,           NEXT },
        { "nextfile",   NEXTFILE,       NEXTFILE },
        { "or",         FFOR,           BLTIN },
        { "print",      PRINT,          PRINT },
        { "printf",     PRINTF,         PRINTF },
        { "rand",       FRAND,          BLTIN },
        { "return",     RETURN,         RETURN },
        { "rshift",     FRSHIFT,        BLTIN },
        { "sin",        FSIN,           BLTIN },
        { "split",      SPLIT,          SPLIT },
        { "sprintf",    SPRINTF,        SPRINTF },
        { "sqrt",       FSQRT,          BLTIN },
        { "srand",      FSRAND,         BLTIN },
        { "strftime",   FSTRFTIME,      BLTIN },
        { "sub",        SUB,            SUB },
        { "substr",     SUBSTR,         SUBSTR },
        { "system",     FSYSTEM,        BLTIN },
        { "systime",    FSYSTIME,       BLTIN },
        { "tolower",    FTOLOWER,       BLTIN },
        { "toupper",    FTOUPPER,       BLTIN },
        { "while",      WHILE,          WHILE },
        { "xor",        FXOR,           BLTIN },
};
100
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name when dbg is set.  Wrapped in do { } while (0) so that "RET(x);"
 * is exactly one statement and stays safe in an unbraced if/else.
 * Note that x is evaluated twice when dbg is set.
 */
#define RET(x)  do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
102
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately pushed back.
 */
static int peek(void)
{
        int next;

        next = input();
        unput(next);
        return next;
}
109
110 static int gettok(char **pbuf, int *psz)        /* get next input token */
111 {
112         int c, retc;
113         char *buf = *pbuf;
114         int sz = *psz;
115         char *bp = buf;
116
117         c = input();
118         if (c == 0)
119                 return 0;
120         buf[0] = c;
121         buf[1] = 0;
122         if (!isalnum(c) && c != '.' && c != '_')
123                 return c;
124
125         *bp++ = c;
126         if (isalpha(c) || c == '_') {   /* it's a varname */
127                 for ( ; (c = input()) != 0; ) {
128                         if (bp-buf >= sz)
129                                 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
130                                         FATAL( "out of space for name %.10s...", buf );
131                         if (isalnum(c) || c == '_')
132                                 *bp++ = c;
133                         else {
134                                 *bp = 0;
135                                 unput(c);
136                                 break;
137                         }
138                 }
139                 *bp = 0;
140                 retc = 'a';     /* alphanumeric */
141         } else {        /* maybe it's a number, but could be . */
142                 char *rem;
143                 /* read input until can't be a number */
144                 for ( ; (c = input()) != 0; ) {
145                         if (bp-buf >= sz)
146                                 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
147                                         FATAL( "out of space for number %.10s...", buf );
148                         if (isdigit(c) || c == 'e' || c == 'E'
149                           || c == '.' || c == '+' || c == '-')
150                                 *bp++ = c;
151                         else {
152                                 unput(c);
153                                 break;
154                         }
155                 }
156                 *bp = 0;
157                 strtod(buf, &rem);      /* parse the number */
158                 if (rem == buf) {       /* it wasn't a valid number at all */
159                         buf[1] = 0;     /* return one character as token */
160                         retc = (uschar)buf[0];  /* character is its own type */
161                         unputstr(rem+1); /* put rest back for later */
162                 } else {        /* some prefix was a number */
163                         unputstr(rem);  /* put rest back for later */
164                         rem[0] = 0;     /* truncate buf after number part */
165                         retc = '0';     /* type is number */
166                 }
167         }
168         *pbuf = buf;
169         *psz = sz;
170         return retc;
171 }
172
/* Helpers defined later in this file. */
int     word(char *);
int     string(void);
int     regexpr(void);

/*
 * One-shot flags consulted at the top of yylex().  sc is set by yylex()
 * itself after it turns a '}' into ';', so the brace is delivered on the
 * following call; reg is set by startreg().  Each is cleared when acted on.
 */
bool    sc      = false;        /* true => return a } right now */
bool    reg     = false;        /* true => return a REGEXPR now */
178
179 int yylex(void)
180 {
181         int c;
182         static char *buf = NULL;
183         static int bufsize = 5; /* BUG: setting this small causes core dump! */
184
185         if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
186                 FATAL( "out of space in yylex" );
187         if (sc) {
188                 sc = false;
189                 RET('}');
190         }
191         if (reg) {
192                 reg = false;
193                 return regexpr();
194         }
195         for (;;) {
196                 c = gettok(&buf, &bufsize);
197                 if (c == 0)
198                         return 0;
199                 if (isalpha(c) || c == '_')
200                         return word(buf);
201                 if (isdigit(c)) {
202                         char *cp = tostring(buf);
203                         double result;
204
205                         if (is_number(cp, & result))
206                                 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
207                         else
208                                 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
209                         free(cp);
210                         /* should this also have STR set? */
211                         RET(NUMBER);
212                 }
213
214                 yylval.i = c;
215                 switch (c) {
216                 case '\n':      /* {EOL} */
217                         lineno++;
218                         RET(NL);
219                 case '\r':      /* assume \n is coming */
220                 case ' ':       /* {WS}+ */
221                 case '\t':
222                         break;
223                 case '#':       /* #.* strip comments */
224                         while ((c = input()) != '\n' && c != 0)
225                                 ;
226                         unput(c);
227                         /*
228                          * Next line is a hack, itcompensates for
229                          * unput's treatment of \n.
230                          */
231                         lineno++;
232                         break;
233                 case ';':
234                         RET(';');
235                 case '\\':
236                         if (peek() == '\n') {
237                                 input();
238                                 lineno++;
239                         } else if (peek() == '\r') {
240                                 input(); input();       /* \n */
241                                 lineno++;
242                         } else {
243                                 RET(c);
244                         }
245                         break;
246                 case '&':
247                         if (peek() == '&') {
248                                 input(); RET(AND);
249                         } else
250                                 RET('&');
251                 case '|':
252                         if (peek() == '|') {
253                                 input(); RET(BOR);
254                         } else
255                                 RET('|');
256                 case '!':
257                         if (peek() == '=') {
258                                 input(); yylval.i = NE; RET(NE);
259                         } else if (peek() == '~') {
260                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
261                         } else
262                                 RET(NOT);
263                 case '~':
264                         yylval.i = MATCH;
265                         RET(MATCHOP);
266                 case '<':
267                         if (peek() == '=') {
268                                 input(); yylval.i = LE; RET(LE);
269                         } else {
270                                 yylval.i = LT; RET(LT);
271                         }
272                 case '=':
273                         if (peek() == '=') {
274                                 input(); yylval.i = EQ; RET(EQ);
275                         } else {
276                                 yylval.i = ASSIGN; RET(ASGNOP);
277                         }
278                 case '>':
279                         if (peek() == '=') {
280                                 input(); yylval.i = GE; RET(GE);
281                         } else if (peek() == '>') {
282                                 input(); yylval.i = APPEND; RET(APPEND);
283                         } else {
284                                 yylval.i = GT; RET(GT);
285                         }
286                 case '+':
287                         if (peek() == '+') {
288                                 input(); yylval.i = INCR; RET(INCR);
289                         } else if (peek() == '=') {
290                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
291                         } else
292                                 RET('+');
293                 case '-':
294                         if (peek() == '-') {
295                                 input(); yylval.i = DECR; RET(DECR);
296                         } else if (peek() == '=') {
297                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
298                         } else
299                                 RET('-');
300                 case '*':
301                         if (peek() == '=') {    /* *= */
302                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
303                         } else if (peek() == '*') {     /* ** or **= */
304                                 input();        /* eat 2nd * */
305                                 if (peek() == '=') {
306                                         input(); yylval.i = POWEQ; RET(ASGNOP);
307                                 } else {
308                                         RET(POWER);
309                                 }
310                         } else
311                                 RET('*');
312                 case '/':
313                         RET('/');
314                 case '%':
315                         if (peek() == '=') {
316                                 input(); yylval.i = MODEQ; RET(ASGNOP);
317                         } else
318                                 RET('%');
319                 case '^':
320                         if (peek() == '=') {
321                                 input(); yylval.i = POWEQ; RET(ASGNOP);
322                         } else
323                                 RET(POWER);
324
325                 case '$':
326                         /* BUG: awkward, if not wrong */
327                         c = gettok(&buf, &bufsize);
328                         if (isalpha(c)) {
329                                 if (strcmp(buf, "NF") == 0) {   /* very special */
330                                         unputstr("(NF)");
331                                         RET(INDIRECT);
332                                 }
333                                 c = peek();
334                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
335                                         unputstr(buf);
336                                         RET(INDIRECT);
337                                 }
338                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
339                                 RET(IVAR);
340                         } else if (c == 0) {    /*  */
341                                 SYNTAX( "unexpected end of input after $" );
342                                 RET(';');
343                         } else {
344                                 unputstr(buf);
345                                 RET(INDIRECT);
346                         }
347
348                 case '}':
349                         if (--bracecnt < 0)
350                                 SYNTAX( "extra }" );
351                         sc = true;
352                         RET(';');
353                 case ']':
354                         if (--brackcnt < 0)
355                                 SYNTAX( "extra ]" );
356                         RET(']');
357                 case ')':
358                         if (--parencnt < 0)
359                                 SYNTAX( "extra )" );
360                         RET(')');
361                 case '{':
362                         bracecnt++;
363                         RET('{');
364                 case '[':
365                         brackcnt++;
366                         RET('[');
367                 case '(':
368                         parencnt++;
369                         RET('(');
370
371                 case '"':
372                         return string();        /* BUG: should be like tran.c ? */
373
374                 default:
375                         RET(c);
376                 }
377         }
378 }
379
380 int string(void)
381 {
382         int c, n;
383         char *s, *bp;
384         static char *buf = NULL;
385         static int bufsz = 500;
386
387         if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
388                 FATAL("out of space for strings");
389         for (bp = buf; (c = input()) != '"'; ) {
390                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
391                         FATAL("out of space for string %.10s...", buf);
392                 switch (c) {
393                 case '\n':
394                 case '\r':
395                 case 0:
396                         *bp = '\0';
397                         SYNTAX( "non-terminated string %.10s...", buf );
398                         if (c == 0)     /* hopeless */
399                                 FATAL( "giving up" );
400                         lineno++;
401                         break;
402                 case '\\':
403                         c = input();
404                         switch (c) {
405                         case '\n': break;
406                         case '"': *bp++ = '"'; break;
407                         case 'n': *bp++ = '\n'; break;
408                         case 't': *bp++ = '\t'; break;
409                         case 'f': *bp++ = '\f'; break;
410                         case 'r': *bp++ = '\r'; break;
411                         case 'b': *bp++ = '\b'; break;
412                         case 'v': *bp++ = '\v'; break;
413                         case 'a': *bp++ = '\a'; break;
414                         case '\\': *bp++ = '\\'; break;
415
416                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
417                         case '3': case '4': case '5': case '6': case '7':
418                                 n = c - '0';
419                                 if ((c = peek()) >= '0' && c < '8') {
420                                         n = 8 * n + input() - '0';
421                                         if ((c = peek()) >= '0' && c < '8')
422                                                 n = 8 * n + input() - '0';
423                                 }
424                                 *bp++ = n;
425                                 break;
426
427                         case 'x':       /* hex  \x0-9a-fA-F + */
428                             {   char xbuf[100], *px;
429                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
430                                         if (isdigit(c)
431                                          || (c >= 'a' && c <= 'f')
432                                          || (c >= 'A' && c <= 'F'))
433                                                 *px++ = c;
434                                         else
435                                                 break;
436                                 }
437                                 *px = 0;
438                                 unput(c);
439                                 sscanf(xbuf, "%x", (unsigned int *) &n);
440                                 *bp++ = n;
441                                 break;
442                             }
443
444                         default:
445                                 *bp++ = c;
446                                 break;
447                         }
448                         break;
449                 default:
450                         *bp++ = c;
451                         break;
452                 }
453         }
454         *bp = 0;
455         s = tostring(buf);
456         *bp++ = ' '; *bp++ = '\0';
457         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
458         free(s);
459         RET(STRING);
460 }
461
462
463 static int binsearch(char *w, const Keyword *kp, int n)
464 {
465         int cond, low, mid, high;
466
467         low = 0;
468         high = n - 1;
469         while (low <= high) {
470                 mid = (low + high) / 2;
471                 if ((cond = strcmp(w, kp[mid].word)) < 0)
472                         high = mid - 1;
473                 else if (cond > 0)
474                         low = mid + 1;
475                 else
476                         return mid;
477         }
478         return -1;
479 }
480
481 int word(char *w)
482 {
483         const Keyword *kp;
484         int c, n;
485
486         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
487         if (n != -1) {  /* found in table */
488                 kp = keywords + n;
489                 yylval.i = kp->sub;
490                 switch (kp->type) {     /* special handling */
491                 case BLTIN:
492                         if (kp->sub == FSYSTEM && safe)
493                                 SYNTAX( "system is unsafe" );
494                         RET(kp->type);
495                 case FUNC:
496                         if (infunc)
497                                 SYNTAX( "illegal nested function" );
498                         RET(kp->type);
499                 case RETURN:
500                         if (!infunc)
501                                 SYNTAX( "return not in function" );
502                         RET(kp->type);
503                 case VARNF:
504                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
505                         RET(VARNF);
506                 default:
507                         RET(kp->type);
508                 }
509         }
510         c = peek();     /* look for '(' */
511         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
512                 yylval.i = n;
513                 RET(ARG);
514         } else {
515                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
516                 if (c == '(') {
517                         RET(CALL);
518                 } else {
519                         RET(VAR);
520                 }
521         }
522 }
523
/*
 * startreg: arm the reg flag.  yylex() checks it on entry, clears it and
 * hands control to regexpr() instead of scanning an ordinary token.
 */
void startreg(void)     /* next call to yylex will return a regular expression */
{
        reg = true;
}
528
529 int regexpr(void)
530 {
531         int c;
532         static char *buf = NULL;
533         static int bufsz = 500;
534         char *bp;
535
536         if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
537                 FATAL("out of space for rex expr");
538         bp = buf;
539         for ( ; (c = input()) != '/' && c != 0; ) {
540                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
541                         FATAL("out of space for reg expr %.10s...", buf);
542                 if (c == '\n') {
543                         *bp = '\0';
544                         SYNTAX( "newline in regular expression %.10s...", buf );
545                         unput('\n');
546                         break;
547                 } else if (c == '\\') {
548                         *bp++ = '\\';
549                         *bp++ = input();
550                 } else {
551                         *bp++ = c;
552                 }
553         }
554         *bp = 0;
555         if (c == 0)
556                 SYNTAX("non-terminated regular expression %.10s...", buf);
557         yylval.s = tostring(buf);
558         unput('/');
559         RET(REGEXPR);
560 }
561
562 /* low-level lexical stuff, sort of inherited from lex */
563
/*
 * ebuf/ep: record of recently read characters.  input() stores each
 * character at ep and wraps to the start when the end is reached;
 * unput() steps ep back.  Nothing in this part of the file reads ebuf.
 */
char    ebuf[300];
char    *ep = ebuf;
/*
 * yysbuf/yysptr: pushback stack.  unput() pushes at yysptr and input()
 * pops from it before looking at any other source.
 */
char    yysbuf[100];    /* pushback buffer */
char    *yysptr = yysbuf;
FILE    *yyin = NULL;   /* not referenced by the code shown here */
569
570 int input(void) /* get next lexical input character */
571 {
572         int c;
573         extern char *lexprog;
574
575         if (yysptr > yysbuf)
576                 c = (uschar)*--yysptr;
577         else if (lexprog != NULL) {     /* awk '...' */
578                 if ((c = (uschar)*lexprog) != 0)
579                         lexprog++;
580         } else                          /* awk -f ... */
581                 c = pgetc();
582         if (c == EOF)
583                 c = 0;
584         if (ep >= ebuf + sizeof ebuf)
585                 ep = ebuf;
586         *ep = c;
587         if (c != 0) {
588                 ep++;
589         }
590         return (c);
591 }
592
593 void unput(int c)       /* put lexical character back on input */
594 {
595         if (c == '\n')  
596                 lineno--;
597         if (yysptr >= yysbuf + sizeof(yysbuf))
598                 FATAL("pushed back too much: %.20s...", yysbuf);
599         *yysptr++ = c;
600         if (--ep < ebuf)
601                 ep = ebuf + sizeof(ebuf) - 1;
602 }
603
/*
 * unputstr: push the whole string s back on the input.  Characters are
 * pushed last-to-first so they are read again in their original order.
 * An empty string pushes nothing.
 */
void unputstr(const char *s)    /* put a string back on input */
{
        size_t i;

        /* count down in size_t; avoids narrowing strlen()-1 into an int */
        for (i = strlen(s); i > 0; i--)
                unput(s[i-1]);
}