lex.c - scc - simple c99 compiler
(HTM) git clone git://git.simple-cc.org/scc
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Submodules
(DIR) README
(DIR) LICENSE
---
lex.c (16273B)
---
1 #include <assert.h>
2 #include <ctype.h>
3 #include <errno.h>
4 #include <limits.h>
5 #include <setjmp.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9
10 #include <scc/cstd.h>
11 #include <scc/scc.h>
12 #include "cc1.h"
13
14 int yytoken;
15 struct yystype yylval;
16 char yytext[STRINGSIZ+3];
17 unsigned short yylen;
18 int lexmode = CCMODE;
19 unsigned lineno;
20 char filenam[FILENAME_MAX];
21
22 int namespace = NS_IDEN;
23 static int safe;
24 Input *input;
25
26 void
27 setloc(char *fname, unsigned line)
28 {
29 size_t len;
30
31 if (fname) {
32 if ((len = strlen(fname)) >= FILENAME_MAX)
33 die("cc1: %s: file name too long", fname);
34 memmove(filenam, fname, len);
35 filenam[len] = '\0';
36
37 /*
38 * There are cases where we want to call setloc()
39 * with the data in input, and then we have t be
40 * careful about freeing input->filenam
41 */
42 if (fname != input->filenam) {
43 free(input->filenam);
44 input->filenam = xstrdup(fname);
45 }
46 }
47
48 lineno = input->lineno = line;
49 }
50
51 int
52 addinput(int type, void *arg, int fail)
53 {
54 FILE *fp;
55 char *extp, *fname, *buffer, *infile;
56 int infileln;
57 Macro *mp;
58 Symbol *sym;
59 Input *newip, *curip = input;
60
61 if (curip)
62 curip->lineno = lineno;
63
64 switch (type) {
65 case IMACRO:
66 fp = NULL;
67 mp = arg;
68 sym = mp->sym;
69 fname = mp->fname;
70 buffer = mp->buffer;
71 DBG("INPUT: expanding macro %s", sym->name);
72 break;
73 case IPARAM:
74 fp = NULL;
75 mp = NULL;
76 buffer = arg;
77 fname = filenam;
78 DBG("INPUT: macro parameter '%s'", buffer);
79 break;
80 case IFILE:
81 fname = arg;
82 mp = NULL;
83 buffer = NULL;
84
85 if ((fp = fopen(fname, "r")) == NULL) {
86 if (!fail)
87 return 0;
88 die("cc1: %s: %s", fname, strerror(errno));
89 }
90 if (curip && onlyheader) {
91 infile = curip->filenam;
92 infileln = strlen(infile);
93 if (extp = strrchr(infile, '.'))
94 infileln -= strlen(extp);
95 printf("%.*s.o: %s %s\n",
96 infileln, infile, infile, fname);
97 }
98 lineno = 0;
99 DBG("INPUT: file input '%s'", fname);
100 break;
101 case ISTDIN:
102 fp = stdin;
103 mp = NULL;
104 fname = "<stdin>";
105 buffer = NULL;
106 lineno = 0;
107 DBG("INPUT: file input 'stdin'");
108 break;
109 default:
110 abort();
111 }
112
113 if (!buffer) {
114 buffer = xmalloc(INPUTSIZ);
115 buffer[0] = '\0';
116 } else {
117 buffer = xstrdup(buffer);
118 }
119
120 newip = xmalloc(sizeof(*newip));
121 newip->next = curip;
122 newip->macro = mp;
123 newip->p = newip->begin = newip->line = buffer;
124 newip->filenam = NULL;
125 newip->lineno = 0;
126 newip->fp = fp;
127 newip->flags = type;
128 input = newip;
129
130 setloc(fname, lineno);
131 return 1;
132 }
133
134 void
135 delinput(void)
136 {
137 Input *ip = input;
138
139 switch (ip->flags & ITYPE) {
140 case IFILE:
141 DBG("INPUT: file finished '%s'", ip->filenam);
142 if (fclose(ip->fp))
143 die("cc1: %s: %s", ip->filenam, strerror(errno));
144 break;
145 case IMACRO:
146 DBG("INPUT: macro %s finished", ip->macro->sym->name);
147 delmacro(ip->macro);
148 break;
149 case IPARAM:
150 DBG("INPUT: macro param finished");
151 break;
152 case ISTDIN:
153 DBG("INPUT: stdin finished");
154 break;
155 default:
156 abort();
157 }
158
159 input = ip->next;
160 free(ip->filenam);
161 free(ip->line);
162 free(ip);
163 if (input)
164 setloc(input->filenam, input->lineno);
165 }
166
167 static void
168 newline(void)
169 {
170 if (++lineno == 0)
171 die("cc1: %s: file too long", filenam);
172 }
173
174 /*
175 * Read the next character from the input file, counting number of lines
176 * and joining lines escaped with \
177 */
178 static int
179 readchar(void)
180 {
181 FILE *fp = input->fp;
182 int c;
183
184 repeat:
185 switch (c = getc(fp)) {
186 case '\\':
187 if ((c = getc(fp)) == '\n') {
188 newline();
189 goto repeat;
190 }
191 ungetc(c, fp);
192 c = '\\';
193 break;
194 case '\n':
195 newline();
196 break;
197 case EOF:
198 break;
199 }
200
201 return c;
202 }
203
/*
 * Discard a C comment. This function is only called from readline
 * because it is impossible to have a comment in a macro, because
 * comments are always discarded before processing any cpp directive.
 *
 * type is the character that closes the comment: '\n' for a line
 * comment and '*' (followed by '/') for a block comment. The opening
 * sequence was already consumed by the caller. Reaching the end of
 * the file before the terminator is reported as an error.
 */
static void
comment(int type)
{
	int c, star = 0;

	for (;;) {
		c = readchar();
		if (c == EOF) {
			errorp("unterminated comment");
			return;
		}

		if (type != '*') {
			if (c == type)
				return;
			continue;
		}

		/*
		 * Remember whether the previous character was a '*'
		 * instead of consuming the character after it, so a
		 * run of stars like in "**" + "/" still closes the
		 * comment.
		 */
		if (star && c == '/')
			return;
		star = (c == '*');
	}
}
226
227 /*
228 * readline is used to read a full logic line from a file.
229 * It discards comments and check that the line fits in
230 * the input buffer
231 */
232 static int
233 readline(void)
234 {
235 char *bp, *lim;
236 int c, peekc = 0, delim = 0;
237
238 if (feof(input->fp)) {
239 input->flags |= IEOF;
240 *input->p = '\0';
241 return 0;
242 }
243
244 *input->line = '\0';
245 lim = &input->line[INPUTSIZ-1];
246 for (bp = input->line; bp < lim-1; *bp++ = c) {
247 c = (peekc) ? peekc : readchar();
248 peekc = 0;
249 if (c == '\n' || c == EOF)
250 break;
251 if (c == '\\') {
252 peekc = readchar();
253 if (peekc == '\n' || peekc == EOF)
254 continue;
255 if (bp == lim-2)
256 break;
257 *bp++ = c;
258 c = peekc;
259 peekc = 0;
260 continue;
261 }
262
263 if (delim && c == delim)
264 delim = 0;
265 else if (!delim && (c == '"' || c == '\''))
266 delim = c;
267 if (c != '/' || delim)
268 continue;
269
270 /* check for /* or // */
271 peekc = readchar();
272 if (peekc != '*' && peekc != '/')
273 continue;
274
275 if (peekc == '/') {
276 comment('\n');
277 break;
278 } else {
279 comment('*');
280 c = ' ';
281 }
282 peekc = 0;
283 }
284
285 input->begin = input->p = input->line;
286 if (bp == lim-1) {
287 errorp("line too long");
288 --bp;
289 }
290 *bp++ = '\n';
291 *bp = '\0';
292
293 return 1;
294 }
295
296 /*
297 * moreinput gets more bytes to be passed to the lexer.
298 * It can take more bytes from macro expansions or
299 * directly reading from files. When a cpp directive
300 * is processed the line is discarded because it must not
301 * be passed to the lexer
302 */
303 static int
304 moreinput(void)
305 {
306 int wasexpand = 0;
307
308 repeat:
309 if (!input)
310 return 0;
311
312 if (*input->p == '\0') {
313 int t = input->flags & ITYPE;
314 if (t == IPARAM) {
315 input->flags |= IEOF;
316 return 0;
317 }
318 if (t == IMACRO) {
319 wasexpand = 1;
320 input->flags |= IEOF;
321 }
322 if (input->flags & IEOF) {
323 delinput();
324 goto repeat;
325 }
326 if (!readline()) {
327 *input->p = '\0';
328 goto repeat;
329 }
330 if (cpp())
331 goto repeat;
332 }
333
334 if (onlycpp && !wasexpand)
335 ppragmaln();
336 return 1;
337 }
338
339 static void
340 tok2str(void)
341 {
342 if ((yylen = input->p - input->begin) > INTIDENTSIZ)
343 error("token too big");
344 memcpy(yytext, input->begin, yylen);
345 yytext[yylen] = '\0';
346 input->begin = input->p;
347 }
348
349 static Symbol *
350 readint(char *s, int base, int sign, Symbol *sym)
351 {
352 Type *tp = sym->type;
353 struct limits *lim;
354 TUINT u, val, max;
355 int c;
356
357 lim = getlimits(tp);
358 max = lim->max.i;
359 if (*s == '0')
360 ++s;
361 if (toupper(*s) == 'X')
362 ++s;
363
364 for (u = 0; isxdigit(c = *s++); u = u*base + val) {
365 static char letters[] = "0123456789ABCDEF";
366 val = strchr(letters, toupper(c)) - letters;
367 repeat:
368 if (u <= max/base && u*base <= max - val)
369 continue;
370 if (tp->prop & TSIGNED) {
371 if (tp == inttype)
372 tp = (base==10) ? longtype : uinttype;
373 else if (tp == longtype)
374 tp = (base==10) ? llongtype : ulongtype;
375 else
376 goto overflow;
377 } else {
378 if (tp == uinttype)
379 tp = (sign==UNSIGNED) ? ulongtype : longtype;
380 else if (tp == ulongtype)
381 tp = (sign==UNSIGNED) ? ullongtype : llongtype;
382 else
383 goto overflow;
384 }
385 sym->type = tp;
386 lim = getlimits(tp);
387 max = lim->max.i;
388 goto repeat;
389 }
390
391 if (tp->prop & TSIGNED)
392 sym->u.i = u;
393 else
394 sym->u.u = u;
395
396 return sym;
397
398 overflow:
399 errorp("overflow in integer constant");
400 return sym;
401 }
402
403 static int
404 integer(char *s, int base)
405 {
406 Type *tp;
407 Symbol *sym;
408 unsigned size, sign;
409
410 for (size = sign = 0; ; ++input->p) {
411 switch (toupper(*input->p)) {
412 case 'L':
413 if (size == LLONG)
414 goto wrong_type;
415 size = (size == LONG) ? LLONG : LONG;
416 continue;
417 case 'U':
418 if (sign == UNSIGNED)
419 goto wrong_type;
420 sign = UNSIGNED;
421 continue;
422 default:
423 goto convert;
424 wrong_type:
425 error("invalid suffix in integer constant");
426 }
427 }
428
429 convert:
430 tok2str();
431 tp = ctype(INT, sign, size);
432 sym = newsym(NS_IDEN, NULL);
433 sym->type = tp;
434 sym->flags |= SCONSTANT;
435 yylval.sym = readint(s, base, sign, sym);
436 return CONSTANT;
437 }
438
439 static char *
440 digits(int base)
441 {
442 char *p;
443 int c;
444
445 for (p = input->p; c = *p; ++p) {
446 switch (base) {
447 case 8:
448 if (!strchr("01234567", c))
449 goto end;
450 break;
451 case 10:
452 if (!isdigit(c))
453 goto end;
454 break;
455 case 16:
456 if (!isxdigit(c))
457 goto end;
458 break;
459 }
460 }
461 end:
462 input->p = p;
463 return yytext;
464 }
465
466 static int
467 number(void)
468 {
469 int base;
470
471 if (*input->p != '0') {
472 base = 10;
473 } else {
474 if (toupper(*++input->p) == 'X') {
475 ++input->p;
476 base = 16;
477 } else {
478 base = 8;
479 }
480 }
481
482 return integer(digits(base), base);
483 }
484
485 static int
486 escape(void)
487 {
488 int c, d, i, cnt, base;
489
490 switch (*++input->p) {
491 case 'a':
492 return '\a';
493 case 'b':
494 return '\b';
495 case 'f':
496 return '\f';
497 case 'n':
498 return '\n';
499 case 'r':
500 return '\r';
501 case 't':
502 return '\t';
503 case 'v':
504 return '\v';
505 case '"':
506 return '"';
507 case '\'':
508 return '\'';
509 case '\\':
510 return '\\';
511 case '\?':
512 return '\?';
513 case 'u':
514 /*
515 * FIXME: universal constants are not correctly handled
516 */
517 if (!isdigit(*++input->p))
518 warn("incorrect digit for numerical character constant");
519 base = 10;
520 break;
521 case 'x':
522 if (!isxdigit(*++input->p))
523 warn("\\x used with no following hex digits");
524 cnt = 2;
525 base = 16;
526 break;
527 case '0':
528 case '1':
529 case '2':
530 case '3':
531 case '4':
532 case '5':
533 case '6':
534 case '7':
535 cnt = 3;
536 base = 8;
537 break;
538 default:
539 warn("unknown escape sequence");
540 return ' ';
541 }
542
543 for (c = i = 0; i < cnt; ++i) {
544 static char digits[] = "0123456789ABCDEF";
545 char *p = strchr(digits, toupper(*input->p));
546
547 if (!p || (d = p - digits) > base)
548 break;
549 c *= base;
550 c += d;
551 ++input->p;
552 }
553 --input->p;
554
555 return c;
556 }
557
558 static Rune
559 utf8rune(void)
560 {
561 Rune wc;
562 unsigned c;
563 size_t i, len;
564
565 c = *input->p;
566 for (len = 0; c & 0x80; len++)
567 c <<= 1;
568 if (len == 0)
569 return c;
570 if (len == 1 || len == 8)
571 goto invalid;
572
573 wc = (c & 0xFF) >> len;
574 for (i = 0; i < len-1; i++) {
575 c = input->p[1];
576 if ((c & 0xC0) != 0x80)
577 goto invalid;
578 input->p++;
579 wc <<= 6;
580 wc |= c & 0x3F;
581 }
582 return wc;
583
584 invalid:
585 errorp("invalid multibyte sequence");
586 return 0xFFFD;
587 }
588
589 static Rune
590 decode(int multi)
591 {
592 Rune r;
593
594 if (*input->p == '\\') {
595 r = escape();
596 return r;
597 }
598
599 return multi ? utf8rune() : *input->p;
600 }
601
602 static int
603 character(void)
604 {
605 int i, multi = 0;
606 Rune r, d;
607 Type *tp = inttype;
608 Symbol *sym;
609
610 if (*input->p == 'L') {
611 multi = 1;
612 tp = wchartype;
613 input->p++;
614 }
615
616 d = 0;
617 input->p++;
618 for (i = 0; *input->p != '\''; i++) {
619 r = decode(multi);
620 if (r > getlimits(tp)->max.i)
621 warn("character too large for enclosing character literal type");
622 d |= r;
623 input->p++;
624 }
625 input->p++;
626
627 if (i == 0)
628 errorp("empty character constant");
629 if (i > 1)
630 warn("multi-character character constant");
631
632 sym = newsym(NS_IDEN, NULL);
633 sym->u.i = d;
634 sym->type = tp;
635 yylval.sym = sym;
636 tok2str();
637 return CONSTANT;
638 }
639
640 /*
641 * string() parses a constant string, and convert all the
642 * escape sequences into single characters. This behaviour
643 * is correct except when we parse a #define, where we want
644 * to preserve the literal content of the string. In that
645 * case cpp.c:/^define( sets the variable disescape to
646 * disable converting escape sequences into characters.
647 */
648 static int
649 string(void)
650 {
651 char *bp = yytext;
652 int c, esc;
653
654 *bp++ = '"';
655 esc = 0;
656 for (++input->p; ; ++input->p) {
657 c = *input->p;
658
659 if (c == '"' && !esc)
660 break;
661
662 if (c == '\0') {
663 errorp("missing terminating '\"' character");
664 break;
665 }
666
667 esc = (c == '\\' && !esc && disescape);
668
669 if (c == '\\' && !esc)
670 c = escape();
671
672 if (bp == &yytext[STRINGSIZ+1]) {
673 /* too long, ignore everything until next quote */
674 for (++input->p; *input->p != '"'; ++input->p) {
675 if (*input->p == '\\')
676 ++input->p;
677 if (*input->p == '\0')
678 break;
679 }
680 --bp;
681 errorp("string too long");
682 break;
683 }
684 *bp++ = c;
685 }
686
687 input->begin = ++input->p;
688 *bp = '\0';
689
690 yylen = bp - yytext + 1;
691 yylval.sym = newstring(yytext+1, yylen-1);
692 *bp++ = '"';
693 *bp = '\0';
694 return STRING;
695 }
696
697 static int
698 iden(void)
699 {
700 Symbol *sym;
701 char *p, *begin;
702
703 if (input->p[0] == 'L' && input->p[1] == '\'')
704 return character();
705
706 begin = input->p;
707 for (p = begin; isalnum(*p) || *p == '_'; ++p)
708 ;
709 input->p = p;
710 tok2str();
711 if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) {
712 if (expand(sym))
713 return next();
714 }
715 sym = lookup(namespace, yytext, ALLOC);
716 yylval.sym = sym;
717 if (sym->flags & SCONSTANT)
718 return CONSTANT;
719 if (sym->token != IDEN)
720 yylval.token = sym->u.token;
721 return sym->token;
722 }
723
724 static int
725 follow(int expect, int ifyes, int ifno)
726 {
727 if (*input->p++ == expect)
728 return ifyes;
729 --input->p;
730 return ifno;
731 }
732
733 static int
734 minus(void)
735 {
736 switch (*input->p++) {
737 case '-':
738 return DEC;
739 case '>':
740 return INDIR;
741 case '=':
742 return SUB_EQ;
743 default:
744 --input->p;
745 return '-';
746 }
747 }
748
749 static int
750 plus(void)
751 {
752 switch (*input->p++) {
753 case '+':
754 return INC;
755 case '=':
756 return ADD_EQ;
757 default:
758 --input->p;
759 return '+';
760 }
761 }
762
763 static int
764 relational(int op, int equal, int shift, int assig)
765 {
766 int c;
767
768 if ((c = *input->p++) == '=')
769 return equal;
770 if (c == op)
771 return follow('=', assig, shift);
772 --input->p;
773 return op;
774 }
775
776 static int
777 logic(int op, int equal, int logic)
778 {
779 int c;
780
781 if ((c = *input->p++) == '=')
782 return equal;
783 if (c == op)
784 return logic;
785 --input->p;
786 return op;
787 }
788
789 static int
790 dot(void)
791 {
792 int c;
793
794 if ((c = *input->p) != '.')
795 return '.';
796 if ((c = *++input->p) != '.')
797 error("incorrect token '..'");
798 ++input->p;
799 return ELLIPSIS;
800 }
801
802 static int
803 operator(void)
804 {
805 int t;
806
807 switch (t = *input->p++) {
808 case '<':
809 t = relational('<', LE, SHL, SHL_EQ);
810 break;
811 case '>':
812 t = relational('>', GE, SHR, SHR_EQ);
813 break;
814 case '&':
815 t = logic('&', AND_EQ, AND);
816 break;
817 case '|':
818 t = logic('|', OR_EQ, OR);
819 break;
820 case '=':
821 t = follow('=', EQ, '=');
822 break;
823 case '^':
824 t = follow('=', XOR_EQ, '^');
825 break;
826 case '*':
827 t = follow('=', MUL_EQ, '*');
828 break;
829 case '/':
830 t = follow('=', DIV_EQ, '/');
831 break;
832 case '%':
833 t = follow('=', MOD_EQ, '%');
834 break;
835 case '!':
836 t = follow('=', NE, '!');
837 break;
838 case '#':
839 t = follow('#', CONCAT, STRINGIZE);
840 break;
841 case '-':
842 t = minus();
843 break;
844 case '+':
845 t = plus();
846 break;
847 case '.':
848 t = dot();
849 break;
850 }
851 tok2str();
852 return t;
853 }
854
855 /* TODO: Ensure that namespace is NS_IDEN after a recovery */
856
857 /*
858 * skip all the spaces until the next token. When we are in
859 * CPPMODE \n is not considered a whitespace
860 */
861 static int
862 skipspaces(void)
863 {
864 int c;
865
866 if (!input)
867 return EOF;
868
869 for (;;) {
870 switch (c = *input->p) {
871 case '\n':
872 if (lexmode == CPPMODE)
873 goto return_byte;
874 ++input->p;
875 case '\0':
876 if (!moreinput())
877 return EOF;
878 break;
879 case ' ':
880 case '\t':
881 case '\v':
882 case '\r':
883 case '\f':
884 ++input->p;
885 break;
886 default:
887 goto return_byte;
888 }
889 }
890
891 return_byte:
892 input->begin = input->p;
893 return c;
894 }
895
896 int
897 next(void)
898 {
899 int c;
900
901 if ((c = skipspaces()) == EOF)
902 yytoken = EOFTOK;
903 else if (isalpha(c) || c == '_')
904 yytoken = iden();
905 else if (isdigit(c))
906 yytoken = number();
907 else if (c == '"')
908 yytoken = string();
909 else if (c == '\'')
910 yytoken = character();
911 else
912 yytoken = operator();
913
914 if (yytoken == EOFTOK) {
915 strcpy(yytext, "<EOF>");
916 if (cppctx && !input)
917 errorp("#endif expected");
918 }
919
920 DBG("TOKEN %s", yytext);
921 return yytoken;
922 }
923
924 void
925 expect(int tok)
926 {
927 if (yytoken != tok) {
928 if (isgraph(tok))
929 errorp("expected '%c' before '%s'", tok, yytext);
930 else
931 errorp("unexpected '%s'", yytext);
932 } else {
933 next();
934 }
935 }
936
937 int
938 ahead(void)
939 {
940 skipspaces();
941 return *input->begin;
942 }
943
944 void
945 setsafe(int type)
946 {
947 safe = type;
948 }
949
950 void
951 discard(void)
952 {
953 extern jmp_buf recover;
954 int c;
955
956 for (c = yytoken; ; c = *input->p++) {
957 switch (safe) {
958 case END_COMP:
959 if (c == '}')
960 goto jump;
961 goto semicolon;
962 case END_COND:
963 if (c == ')')
964 goto jump;
965 break;
966 case END_LDECL:
967 if (c == ',')
968 goto jump;
969 case END_DECL:
970 semicolon:
971 if (c == ';')
972 goto jump;
973 break;
974 }
975 if ((c == '\0' || c == EOFTOK) && !moreinput())
976 exit(EXIT_FAILURE);
977 }
978 jump:
979 input->begin = input->p;
980 yytoken = c;
981 yytext[0] = c;
982 yytext[1] = '\0';
983 exit(EXIT_FAILURE);
984
985 /*
986 * FIXME: We don't have a proper recover mechanism at this moment
987 * and we don't set the recover point ever, so executing this
988 * longjmp will generate surely a segmentation fault, so it does
989 * not make sense to do it. We just exit until we can find time
990 * to solve this problem.
991 */
992 longjmp(recover, 1);
993 }