]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/one-true-awk/lex.c
MFV r357712: file 5.38.
[FreeBSD/FreeBSD.git] / contrib / one-true-awk / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31
32 extern YYSTYPE  yylval;
33 extern int      infunc;
34
35 int     lineno  = 1;
36 int     bracecnt = 0;
37 int     brackcnt  = 0;
38 int     parencnt = 0;
39
/* One row of the reserved-word table consulted by word(). */
typedef struct Keyword {
        const char *word;       /* spelling matched against the input name */
        int     sub;            /* value word() stores in yylval.i */
        int     type;           /* token code word() returns */
} Keyword;
45
46 Keyword keywords[] ={   /* keep sorted: binary searched */
47         { "BEGIN",      XBEGIN,         XBEGIN },
48         { "END",        XEND,           XEND },
49         { "NF",         VARNF,          VARNF },
50         { "and",        FAND,           BLTIN },
51         { "atan2",      FATAN,          BLTIN },
52         { "break",      BREAK,          BREAK },
53         { "close",      CLOSE,          CLOSE },
54         { "compl",      FCOMPL,         BLTIN },
55         { "continue",   CONTINUE,       CONTINUE },
56         { "cos",        FCOS,           BLTIN },
57         { "delete",     DELETE,         DELETE },
58         { "do",         DO,             DO },
59         { "else",       ELSE,           ELSE },
60         { "exit",       EXIT,           EXIT },
61         { "exp",        FEXP,           BLTIN },
62         { "fflush",     FFLUSH,         BLTIN },
63         { "for",        FOR,            FOR },
64         { "func",       FUNC,           FUNC },
65         { "function",   FUNC,           FUNC },
66         { "getline",    GETLINE,        GETLINE },
67         { "gsub",       GSUB,           GSUB },
68         { "if",         IF,             IF },
69         { "in",         IN,             IN },
70         { "index",      INDEX,          INDEX },
71         { "int",        FINT,           BLTIN },
72         { "length",     FLENGTH,        BLTIN },
73         { "log",        FLOG,           BLTIN },
74         { "lshift",     FLSHIFT,        BLTIN },
75         { "match",      MATCHFCN,       MATCHFCN },
76         { "next",       NEXT,           NEXT },
77         { "nextfile",   NEXTFILE,       NEXTFILE },
78         { "or",         FFOR,           BLTIN },
79         { "print",      PRINT,          PRINT },
80         { "printf",     PRINTF,         PRINTF },
81         { "rand",       FRAND,          BLTIN },
82         { "return",     RETURN,         RETURN },
83         { "rshift",     FRSHIFT,        BLTIN },
84         { "sin",        FSIN,           BLTIN },
85         { "split",      SPLIT,          SPLIT },
86         { "sprintf",    SPRINTF,        SPRINTF },
87         { "sqrt",       FSQRT,          BLTIN },
88         { "srand",      FSRAND,         BLTIN },
89         { "sub",        SUB,            SUB },
90         { "substr",     SUBSTR,         SUBSTR },
91         { "system",     FSYSTEM,        BLTIN },
92         { "tolower",    FTOLOWER,       BLTIN },
93         { "toupper",    FTOUPPER,       BLTIN },
94         { "while",      WHILE,          WHILE },
95         { "xor",        FXOR,           BLTIN },
96 };
97
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name when dbg is set.  Wrapped in do/while(0) so that "RET(x);" is exactly
 * one statement and is safe in an unbraced if/else.  x is evaluated twice
 * when dbg is set, so it must have no side effects.
 */
#define RET(x)  do { \
                if (dbg) \
                        printf("lex %s\n", tokname(x)); \
                return (x); \
        } while (0)
99
/* peek: read the next input character and immediately push it back. */
int peek(void)
{
        const int next = input();

        unput(next);
        return next;
}
106
/*
 * gettok: read the next raw token from the input into the buffer *pbuf.
 *
 * pbuf, psz: address of the caller's buffer pointer and of its size.  The
 *      buffer must already hold at least 2 bytes, because buf[0] and buf[1]
 *      are written before any resize.  When the token needs more room the
 *      buffer is grown through adjbuf() and *pbuf / *psz are updated.
 *
 * Returns:
 *      0       at end of input (buffer untouched);
 *      c       the character itself, for any single character that is not
 *              alphanumeric, '.' or '_' (buffer holds that one character);
 *      'a'     for a name ([A-Za-z_][A-Za-z0-9_]*), spelled out in the buffer;
 *      '0'     for a number, spelled out in the buffer;
 *      '.'     (the first character) when a '.'-led run is not a number.
 */
int gettok(char **pbuf, int *psz)       /* get next input token */
{
        int c, retc;
        char *buf = *pbuf;
        int sz = *psz;
        char *bp = buf;

        c = input();
        if (c == 0)
                return 0;
        buf[0] = c;
        buf[1] = 0;
        if (!isalnum(c) && c != '.' && c != '_')
                return c;

        *bp++ = c;
        if (isalpha(c) || c == '_') {   /* it's a varname */
                for ( ; (c = input()) != 0; ) {
                        /*
                         * Keep room for this character AND the closing NUL;
                         * otherwise a token that fills the buffer exactly and
                         * is followed by end of input makes the final
                         * "*bp = 0" land one byte past the end.
                         */
                        if (bp-buf+2 > sz)
                                if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
                                        FATAL( "out of space for name %.10s...", buf );
                        if (isalnum(c) || c == '_')
                                *bp++ = c;
                        else {
                                unput(c);
                                break;
                        }
                }
                *bp = 0;
                retc = 'a';     /* alphanumeric */
        } else {        /* maybe it's a number, but could be . */
                char *rem;
                /* read input until can't be a number */
                for ( ; (c = input()) != 0; ) {
                        /* same headroom rule as for names */
                        if (bp-buf+2 > sz)
                                if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
                                        FATAL( "out of space for number %.10s...", buf );
                        if (isdigit(c) || c == 'e' || c == 'E'
                          || c == '.' || c == '+' || c == '-')
                                *bp++ = c;
                        else {
                                unput(c);
                                break;
                        }
                }
                *bp = 0;
                strtod(buf, &rem);      /* parse the number */
                if (rem == buf) {       /* it wasn't a valid number at all */
                        /*
                         * Push the tail back BEFORE truncating: once buf[1]
                         * is zeroed, rem+1 is an empty string and the rest
                         * of the run would be silently dropped.
                         */
                        unputstr(rem+1); /* put rest back for later */
                        buf[1] = 0;     /* return one character as token */
                        retc = buf[0];  /* character is its own type */
                } else {        /* some prefix was a number */
                        unputstr(rem);  /* put rest back for later */
                        rem[0] = 0;     /* truncate buf after number part */
                        retc = '0';     /* type is number */
                }
        }
        *pbuf = buf;
        *psz = sz;
        return retc;
}
169
/* Helpers defined later in this file. */
int     word(char *w);
int     string(void);
int     regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int     sc = 0;         /* 1 => return a } right now */
int     reg = 0;        /* 1 => return a REGEXPR now */
175
176 int yylex(void)
177 {
178         int c;
179         static char *buf = NULL;
180         static int bufsize = 5; /* BUG: setting this small causes core dump! */
181
182         if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
183                 FATAL( "out of space in yylex" );
184         if (sc) {
185                 sc = 0;
186                 RET('}');
187         }
188         if (reg) {
189                 reg = 0;
190                 return regexpr();
191         }
192         for (;;) {
193                 c = gettok(&buf, &bufsize);
194                 if (c == 0)
195                         return 0;
196                 if (isalpha(c) || c == '_')
197                         return word(buf);
198                 if (isdigit(c)) {
199                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
200                         /* should this also have STR set? */
201                         RET(NUMBER);
202                 }
203         
204                 yylval.i = c;
205                 switch (c) {
206                 case '\n':      /* {EOL} */
207                         lineno++;
208                         RET(NL);
209                 case '\r':      /* assume \n is coming */
210                 case ' ':       /* {WS}+ */
211                 case '\t':
212                         break;
213                 case '#':       /* #.* strip comments */
214                         while ((c = input()) != '\n' && c != 0)
215                                 ;
216                         unput(c);
217                         break;
218                 case ';':
219                         RET(';');
220                 case '\\':
221                         if (peek() == '\n') {
222                                 input();
223                                 lineno++;
224                         } else if (peek() == '\r') {
225                                 input(); input();       /* \n */
226                                 lineno++;
227                         } else {
228                                 RET(c);
229                         }
230                         break;
231                 case '&':
232                         if (peek() == '&') {
233                                 input(); RET(AND);
234                         } else 
235                                 RET('&');
236                 case '|':
237                         if (peek() == '|') {
238                                 input(); RET(BOR);
239                         } else
240                                 RET('|');
241                 case '!':
242                         if (peek() == '=') {
243                                 input(); yylval.i = NE; RET(NE);
244                         } else if (peek() == '~') {
245                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
246                         } else
247                                 RET(NOT);
248                 case '~':
249                         yylval.i = MATCH;
250                         RET(MATCHOP);
251                 case '<':
252                         if (peek() == '=') {
253                                 input(); yylval.i = LE; RET(LE);
254                         } else {
255                                 yylval.i = LT; RET(LT);
256                         }
257                 case '=':
258                         if (peek() == '=') {
259                                 input(); yylval.i = EQ; RET(EQ);
260                         } else {
261                                 yylval.i = ASSIGN; RET(ASGNOP);
262                         }
263                 case '>':
264                         if (peek() == '=') {
265                                 input(); yylval.i = GE; RET(GE);
266                         } else if (peek() == '>') {
267                                 input(); yylval.i = APPEND; RET(APPEND);
268                         } else {
269                                 yylval.i = GT; RET(GT);
270                         }
271                 case '+':
272                         if (peek() == '+') {
273                                 input(); yylval.i = INCR; RET(INCR);
274                         } else if (peek() == '=') {
275                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
276                         } else
277                                 RET('+');
278                 case '-':
279                         if (peek() == '-') {
280                                 input(); yylval.i = DECR; RET(DECR);
281                         } else if (peek() == '=') {
282                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
283                         } else
284                                 RET('-');
285                 case '*':
286                         if (peek() == '=') {    /* *= */
287                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
288                         } else if (peek() == '*') {     /* ** or **= */
289                                 input();        /* eat 2nd * */
290                                 if (peek() == '=') {
291                                         input(); yylval.i = POWEQ; RET(ASGNOP);
292                                 } else {
293                                         RET(POWER);
294                                 }
295                         } else
296                                 RET('*');
297                 case '/':
298                         RET('/');
299                 case '%':
300                         if (peek() == '=') {
301                                 input(); yylval.i = MODEQ; RET(ASGNOP);
302                         } else
303                                 RET('%');
304                 case '^':
305                         if (peek() == '=') {
306                                 input(); yylval.i = POWEQ; RET(ASGNOP);
307                         } else
308                                 RET(POWER);
309
310                 case '$':
311                         /* BUG: awkward, if not wrong */
312                         c = gettok(&buf, &bufsize);
313                         if (isalpha(c)) {
314                                 if (strcmp(buf, "NF") == 0) {   /* very special */
315                                         unputstr("(NF)");
316                                         RET(INDIRECT);
317                                 }
318                                 c = peek();
319                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
320                                         unputstr(buf);
321                                         RET(INDIRECT);
322                                 }
323                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
324                                 RET(IVAR);
325                         } else if (c == 0) {    /*  */
326                                 SYNTAX( "unexpected end of input after $" );
327                                 RET(';');
328                         } else {
329                                 unputstr(buf);
330                                 RET(INDIRECT);
331                         }
332         
333                 case '}':
334                         if (--bracecnt < 0)
335                                 SYNTAX( "extra }" );
336                         sc = 1;
337                         RET(';');
338                 case ']':
339                         if (--brackcnt < 0)
340                                 SYNTAX( "extra ]" );
341                         RET(']');
342                 case ')':
343                         if (--parencnt < 0)
344                                 SYNTAX( "extra )" );
345                         RET(')');
346                 case '{':
347                         bracecnt++;
348                         RET('{');
349                 case '[':
350                         brackcnt++;
351                         RET('[');
352                 case '(':
353                         parencnt++;
354                         RET('(');
355         
356                 case '"':
357                         return string();        /* BUG: should be like tran.c ? */
358         
359                 default:
360                         RET(c);
361                 }
362         }
363 }
364
365 int string(void)
366 {
367         int c, n;
368         char *s, *bp;
369         static char *buf = NULL;
370         static int bufsz = 500;
371
372         if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
373                 FATAL("out of space for strings");
374         for (bp = buf; (c = input()) != '"'; ) {
375                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
376                         FATAL("out of space for string %.10s...", buf);
377                 switch (c) {
378                 case '\n':
379                 case '\r':
380                 case 0:
381                         *bp = '\0';
382                         SYNTAX( "non-terminated string %.10s...", buf );
383                         if (c == 0)     /* hopeless */
384                                 FATAL( "giving up" );
385                         lineno++;
386                         break;
387                 case '\\':
388                         c = input();
389                         switch (c) {
390                         case '"': *bp++ = '"'; break;
391                         case 'n': *bp++ = '\n'; break;  
392                         case 't': *bp++ = '\t'; break;
393                         case 'f': *bp++ = '\f'; break;
394                         case 'r': *bp++ = '\r'; break;
395                         case 'b': *bp++ = '\b'; break;
396                         case 'v': *bp++ = '\v'; break;
397                         case 'a': *bp++ = '\007'; break;
398                         case '\\': *bp++ = '\\'; break;
399
400                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
401                         case '3': case '4': case '5': case '6': case '7':
402                                 n = c - '0';
403                                 if ((c = peek()) >= '0' && c < '8') {
404                                         n = 8 * n + input() - '0';
405                                         if ((c = peek()) >= '0' && c < '8')
406                                                 n = 8 * n + input() - '0';
407                                 }
408                                 *bp++ = n;
409                                 break;
410
411                         case 'x':       /* hex  \x0-9a-fA-F + */
412                             {   char xbuf[100], *px;
413                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
414                                         if (isdigit(c)
415                                          || (c >= 'a' && c <= 'f')
416                                          || (c >= 'A' && c <= 'F'))
417                                                 *px++ = c;
418                                         else
419                                                 break;
420                                 }
421                                 *px = 0;
422                                 unput(c);
423                                 sscanf(xbuf, "%x", (unsigned int *) &n);
424                                 *bp++ = n;
425                                 break;
426                             }
427
428                         default: 
429                                 *bp++ = c;
430                                 break;
431                         }
432                         break;
433                 default:
434                         *bp++ = c;
435                         break;
436                 }
437         }
438         *bp = 0; 
439         s = tostring(buf);
440         *bp++ = ' '; *bp++ = 0;
441         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
442         RET(STRING);
443 }
444
445
446 int binsearch(char *w, Keyword *kp, int n)
447 {
448         int cond, low, mid, high;
449
450         low = 0;
451         high = n - 1;
452         while (low <= high) {
453                 mid = (low + high) / 2;
454                 if ((cond = strcmp(w, kp[mid].word)) < 0)
455                         high = mid - 1;
456                 else if (cond > 0)
457                         low = mid + 1;
458                 else
459                         return mid;
460         }
461         return -1;
462 }
463
464 int word(char *w) 
465 {
466         Keyword *kp;
467         int c, n;
468
469         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
470 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
471         kp = keywords + n;
472         if (n != -1) {  /* found in table */
473                 yylval.i = kp->sub;
474                 switch (kp->type) {     /* special handling */
475                 case BLTIN:
476                         if (kp->sub == FSYSTEM && safe)
477                                 SYNTAX( "system is unsafe" );
478                         RET(kp->type);
479                 case FUNC:
480                         if (infunc)
481                                 SYNTAX( "illegal nested function" );
482                         RET(kp->type);
483                 case RETURN:
484                         if (!infunc)
485                                 SYNTAX( "return not in function" );
486                         RET(kp->type);
487                 case VARNF:
488                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
489                         RET(VARNF);
490                 default:
491                         RET(kp->type);
492                 }
493         }
494         c = peek();     /* look for '(' */
495         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
496                 yylval.i = n;
497                 RET(ARG);
498         } else {
499                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
500                 if (c == '(') {
501                         RET(CALL);
502                 } else {
503                         RET(VAR);
504                 }
505         }
506 }
507
508 void startreg(void)     /* next call to yylex will return a regular expression */
509 {
510         reg = 1;
511 }
512
513 int regexpr(void)
514 {
515         int c;
516         static char *buf = NULL;
517         static int bufsz = 500;
518         char *bp;
519
520         if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
521                 FATAL("out of space for rex expr");
522         bp = buf;
523         for ( ; (c = input()) != '/' && c != 0; ) {
524                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
525                         FATAL("out of space for reg expr %.10s...", buf);
526                 if (c == '\n') {
527                         *bp = '\0';
528                         SYNTAX( "newline in regular expression %.10s...", buf ); 
529                         unput('\n');
530                         break;
531                 } else if (c == '\\') {
532                         *bp++ = '\\'; 
533                         *bp++ = input();
534                 } else {
535                         *bp++ = c;
536                 }
537         }
538         *bp = 0;
539         if (c == 0)
540                 SYNTAX("non-terminated regular expression %.10s...", buf);
541         yylval.s = tostring(buf);
542         unput('/');
543         RET(REGEXPR);
544 }
545
546 /* low-level lexical stuff, sort of inherited from lex */
547
char    ebuf[300];              /* circular record of characters read by input() */
char    *ep = ebuf;             /* next position in ebuf that input() writes */
char    yysbuf[100];            /* pushback buffer */
char    *yysptr = yysbuf;       /* one past the most recently pushed-back character */
FILE    *yyin = NULL;           /* not referenced by the code visible in this file */
553
554 int input(void) /* get next lexical input character */
555 {
556         int c;
557         extern char *lexprog;
558
559         if (yysptr > yysbuf)
560                 c = (uschar)*--yysptr;
561         else if (lexprog != NULL) {     /* awk '...' */
562                 if ((c = (uschar)*lexprog) != 0)
563                         lexprog++;
564         } else                          /* awk -f ... */
565                 c = pgetc();
566         if (c == EOF)
567                 c = 0;
568         if (ep >= ebuf + sizeof ebuf)
569                 ep = ebuf;
570         *ep = c;
571         if (c != 0) {
572                 ep++;
573         }
574         return (c);
575 }
576
577 void unput(int c)       /* put lexical character back on input */
578 {
579         if (yysptr >= yysbuf + sizeof(yysbuf))
580                 FATAL("pushed back too much: %.20s...", yysbuf);
581         *yysptr++ = c;
582         if (--ep < ebuf)
583                 ep = ebuf + sizeof(ebuf) - 1;
584 }
585
/*
 * unputstr: put a string back on input.  The characters are pushed last to
 * first, so subsequent input() calls return them in their original order.
 */
void unputstr(const char *s)
{
        int idx = strlen(s);

        while (--idx >= 0)
                unput(s[idx]);
}
592 }