/*++ /* NAME /* tok_io 3 /* SUMMARY /* token I/O /* PACKAGE /* unproto /* SYNOPSIS /* #include "token.h" /* /* struct token *tok_get() /* /* void tok_flush(t) /* struct token *t; /* /* void tok_show(t) /* struct token *t; /* /* void tok_show_ch(t) /* struct token *t; /* /* void put_str(s) /* char *s; /* /* void put_ch(c) /* int c; /* /* void put_nl() /* /* char *in_path; /* int in_line; /* DESCRIPTION /* These functions read from stdin and write to stdout. The /* tokenizer keeps track of where the token appeared in the input /* stream; on output, this information is used to preserve correct /* line number information (even after lots of token lookahead or /* after function-header rewriting) so that diagnostics from the /* next compiler stage make sense. /* /* tok_get() reads the next token from standard input. It returns /* a null pointer when the end of input is reached. /* /* tok_show() displays the contents of a (possibly composite) token /* on the standard output. /* /* tok_show_ch() displays the contents of a single-character token /* on the standard output. The character should not be a newline. /* /* tok_flush() displays the contents of a (possibly composite) token /* on the standard output and makes it available for re-use. /* /* put_str() writes a null-terminated string to standard output. /* There should be no newline characters in the string argument. /* /* put_ch() writes one character to standard output. The character /* should not be a newline. /* /* put_nl() outputs a newline character and adjusts the program's idea of /* the current output line. /* /* The in_path and in_line variables contain the file name and /* line number of the most recently read token. /* BUGS /* The tokenizer is just good enough for the unproto filter. /* As a benefit, it is quite fast. /* AUTHOR(S) /* Wietse Venema /* Eindhoven University of Technology /* Department of Mathematics and Computer Science /* Den Dolech 2, P.O. Box 513, 5600 MB Eindhoven, The Netherlands /* LAST MODIFICATION /* 92/01/15 21:52:59 /* VERSION/RELEASE /* 1.3 /*--*/ static char io_sccsid[] = "@(#) tok_io.c 1.3 92/01/15 21:52:59"; /* C library */ #include #include extern char *strchr(); extern char *malloc(); extern char *realloc(); extern char *strcpy(); /* Application-specific stuff */ #include "token.h" #include "vstring.h" #include "error.h" extern char *strsave(); /* XXX need include file */ /* Stuff to keep track of original source file name and position */ static char def_path[] = ""; /* default path name */ char *in_path = def_path; /* current input file name */ int in_line = 1; /* current input line number */ static char *out_path = def_path; /* last name in output line control */ static int out_line = 1; /* current output line number */ int last_ch; /* type of last output */ /* Forward declarations */ static int read_quoted(); static void read_comment(); static int backslash_newline(); static char *read_hex(); static char *read_octal(); static void fix_line_control(); /* * Character input with one level of pushback. The INPUT() macro recursively * strips backslash-newline pairs from the input stream. The UNPUT() macro * should be used only for characters obtained through the INPUT() macro. * * After skipping a backslash-newline pair, the input line counter is not * updated, and we continue with the same logical source line. We just * update a counter with the number of backslash-newline sequences that must * be accounted for (backslash_newline() updates the counter). At the end of * the logical source line, an appropriate number of newline characters is * pushed back (in tok_get()). I do not know how GCC handles this, but it * seems to produce te same output. * * Because backslash_newline() recursively calls itself (through the INPUT() * macro), we will run out of stack space, given a sufficiently long * sequence of backslash-newline pairs. */ static char in_char = 0; /* push-back storage */ static int in_flag = 0; /* pushback available */ static int nl_compensate = 0; /* line continuation kluge */ #define INPUT(c) (in_flag ? (in_flag = 0, c = in_char) : \ (c = getchar()) != '\\' ? c : \ (c = getchar()) != '\n' ? (ungetc(c, stdin), c = '\\') : \ (c = backslash_newline())) #define UNPUT(c) (in_flag = 1, in_char = c) /* Directives that should be ignored. */ #ifdef IGNORE_DIRECTIVES static char *ignore_directives[] = { IGNORE_DIRECTIVES, 0, }; #endif /* Modified string and ctype stuff. */ #define STREQUAL(x,y) (*(x) == *(y) && strcmp((x),(y)) == 0) #define ISALNUM(c) (isalnum(c) || (c) == '_') #define ISALPHA(c) (isalpha(c) || (c) == '_') #define ISSPACE(c) (isspace(c) && c != '\n') #define ISDOT(c) (c == '.') #define ISHEX(c) (isdigit(c) || strchr("abcdefABCDEF", c) != 0) #define ISOCTAL(c) (isdigit(c) && (c) != '8' && (c) != '9') /* Collect all characters that satisfy one condition */ #define COLLECT(v,c,cond) { \ register struct vstring *vs = v; \ register char *cp = vs->str; \ *cp++ = c; \ while (INPUT(c) != EOF) { \ if (cond) { \ if (VS_ADDCH(vs, cp, c) == 0) \ fatal("out of memory"); \ } else { \ UNPUT(c); \ break; \ } \ } \ *cp = 0; \ } /* Ensure that output line information is correct */ #define CHECK_LINE_CONTROL(p,l) { if (out_path != (p) || out_line != (l)) \ fix_line_control((p),(l)); } /* do_control - parse control line */ static int do_control() { struct token *t; int line; char *path; /* Make sure that the directive shows up in the right place. */ CHECK_LINE_CONTROL(in_path, in_line); while (t = tok_get()) { switch (t->tokno) { case TOK_WSPACE: /* Ignore blanks after "#" token. */ tok_free(t); break; case TOK_NUMBER: /* * Line control is of the form: number pathname junk. Since we * have no idea what junk the preprocessor may generate, we copy * all line control tokens to stdout. */ put_str("# "); line = atoi(t->vstr->str); /* extract line number */ tok_flush(t); while ((t = tok_get()) && t->tokno == TOK_WSPACE) tok_flush(t); /* copy white space */ if (t) { /* extract path name */ path = (t->tokno == '"') ? strsave(t->vstr->str) : in_path; do { tok_flush(t); /* copy until newline */ } while (t->tokno != '\n' && (t = tok_get())); } out_line = in_line = line; /* synchronize */ out_path = in_path = path; /* synchronize */ return; #ifdef IGNORE_DIRECTIVES case TOK_WORD: /* * Optionally ignore other #directives. This is only a partial * solution, because the preprocessor will still see them. */ { char **cpp; char *cp = t->vstr->str; for (cpp = ignore_directives; *cpp; cpp++) { if (STREQUAL(cp, *cpp)) { do { tok_free(t); } while (t->tokno != '\n' && (t = tok_get())); return; } } } /* FALLTHROUGH */ #endif default: /* Pass through. */ put_ch('#'); do { tok_flush(t); } while (t->tokno != '\n' && (t = tok_get())); return; case 0: /* Hit EOF, punt. */ put_ch('#'); return; } } } /* backslash_newline - fix up things after reading a backslash-newline pair */ static int backslash_newline() { register int c; nl_compensate++; return (INPUT(c)); } /* tok_get - get next token */ static int last_tokno = '\n'; struct token *tok_get() { register struct token *t; register int c; int d; /* * Get one from the pool and fill it in. The loop is here in case we hit * a preprocessor control line, which happens in a minority of all cases. * We update the token input path and line info *after* backslash-newline * processing or the newline compensation would go wrong. */ t = tok_alloc(); for (;;) { if ((INPUT(c)) == EOF) { tok_free(t); return (0); } else if ((t->line = in_line, t->path = in_path), !isascii(c)) { t->vstr->str[0] = c; t->vstr->str[1] = 0; t->tokno = TOK_OTHER; break; } else if (ISSPACE(c)) { COLLECT(t->vstr, c, ISSPACE(c)); t->tokno = TOK_WSPACE; break; } else if (ISALPHA(c)) { COLLECT(t->vstr, c, ISALNUM(c)); t->tokno = TOK_WORD; break; } else if (isdigit(c)) { COLLECT(t->vstr, c, isdigit(c)); t->tokno = TOK_NUMBER; break; } else if (c == '"' || c == '\'') { t->tokno = read_quoted(t->vstr, c); /* detect missing end quote */ break; } else if (ISDOT(c)) { COLLECT(t->vstr, c, ISDOT(c)); t->tokno = TOK_OTHER; break; } else if (c == '#' && last_tokno == '\n') { do_control(); continue; } else { t->vstr->str[0] = c; if (c == '\n') { in_line++; if (nl_compensate > 0) { /* compensation for bs-nl */ UNPUT('\n'); nl_compensate--; } } else if (c == '/') { if ((INPUT(d)) == '*') { t->vstr->str[1] = d; /* comment */ read_comment(t->vstr); t->tokno = TOK_WSPACE; break; } else { if (d != EOF) UNPUT(d); } } else if (c == '\\') { t->vstr->str[1] = (INPUT(c) == EOF ? 0 : c); t->vstr->str[2] = 0; t->tokno = TOK_OTHER; break; } t->vstr->str[1] = 0; t->tokno = c; break; } } last_tokno = t->tokno; t->end_line = in_line; return (t); } /* read_quoted - read string or character literal, canonicalize escapes */ static int read_quoted(vs, ch) register struct vstring *vs; int ch; { register char *cp = vs->str; register int c; int ret = TOK_OTHER; *cp++ = ch; /* * Clobber the token type in case of a premature newline or EOF. This * prevents us from attempting to concatenate string constants with * broken ones that have no closing quote. */ while (INPUT(c) != EOF) { if (c == '\n') { /* newline in string */ UNPUT(c); break; } if (VS_ADDCH(vs, cp, c) == 0) /* store character */ fatal("out of memory"); if (c == ch) { /* closing quote */ ret = c; break; } if (c == '\\') { /* parse escape sequence */ if ((INPUT(c)) == EOF) { /* EOF, punt */ break; } else if (c == 'a') { /* \a -> audible bell */ #ifdef BELL if ((cp = vs_strcpy(vs, cp, BELL)) == 0) #else if ((cp = vs_strcpy(vs, cp, "\007")) == 0) #endif fatal("out of memory"); } else if (c == 'x') { /* \xhh -> \nnn */ cp = read_hex(vs, cp); } else if (ISOCTAL(c) && ch != '\'') { cp = read_octal(vs, cp, c); /* canonicalize \octal */ } else { if (VS_ADDCH(vs, cp, c) == 0) /* \other: leave alone */ fatal("out of memory"); } } } *cp = 0; return (ret); } /* read_comment - stuff a whole comment into one huge token */ static void read_comment(vs) register struct vstring *vs; { register char *cp = vs->str + 2; /* skip slash star */ register int c; register int d; while (INPUT(c) != EOF) { if (VS_ADDCH(vs, cp, c) == 0) fatal("out of memory"); if (c == '*') { if ((INPUT(d)) == '/') { if (VS_ADDCH(vs, cp, d) == 0) fatal("out of memory"); break; } else { if (d != EOF) UNPUT(d); } } else if (c == '\n') { in_line++; } else if (c == '\\') { if ((INPUT(d)) != EOF && VS_ADDCH(vs, cp, d) == 0) fatal("out of memory"); } } *cp = 0; } /* read_hex - rewrite hex escape to three-digit octal escape */ static char *read_hex(vs, cp) struct vstring *vs; register char *cp; { register int c; register int i; char buf[BUFSIZ]; int len; unsigned val; /* * Eat up all subsequent hex digits. Complain later when there are too * many. */ for (i = 0; i < sizeof(buf) && (INPUT(c) != EOF) && ISHEX(c); i++) buf[i] = c; buf[i] = 0; if (i < sizeof(buf) && c) UNPUT(c); /* * Convert hex form to three-digit octal form. The three-digit form is * used so that strings can be concatenated without problems. Complain * about malformed input; truncate the result to at most three octal * digits. */ if (i == 0) { error("\\x escape sequence without hexadecimal digits"); if (VS_ADDCH(vs, cp, 'x') == 0) fatal("out of memory"); } else { (void) sscanf(buf, "%x", &val); sprintf(buf, "%03o", val); if ((len = strlen(buf)) > 3) error("\\x escape sequence yields non-character value"); if ((cp = vs_strcpy(vs, cp, buf + len - 3)) == 0) fatal("out of memory"); } return (cp); } /* read_octal - convert octal escape to three-digit format */ static char obuf[] = "00123"; static char *read_octal(vs, cp, c) register struct vstring *vs; register char *cp; register int c; { register int i; #define buf_input (obuf + 2) /* Eat up at most three octal digits. */ buf_input[0] = c; for (i = 1; i < 3 && (INPUT(c) != EOF) && ISOCTAL(c); i++) buf_input[i] = c; buf_input[i] = 0; if (i < 3 && c) UNPUT(c); /* * Leave three-digit octal escapes alone. Convert one-digit and two-digit * octal escapes to three-digit form by prefixing them with a suitable * number of '0' characters. This is done so that strings can be * concatenated without problems. */ if ((cp = vs_strcpy(vs, cp, buf_input + i - 3)) == 0) fatal("out of memory"); return (cp); } /* put_nl - emit newline and adjust output line count */ void put_nl() { put_ch('\n'); out_line++; } /* fix_line_control - to adjust path and/or line count info in output */ static void fix_line_control(path, line) register char *path; register int line; { /* * This function is called sporadically, so it should not be a problem * that we repeat some of the tests that preceded this function call. * * Emit a newline if we are not at the start of a line. * * If we switch files, or if we jump backwards, emit line control. If we * jump forward, emit the proper number of newlines to compensate. */ if (last_ch != '\n') /* terminate open line */ put_nl(); if (path != out_path || line < out_line) { /* file switch or back jump */ printf("# %d %s\n", out_line = line, out_path = path); last_ch = '\n'; } else { /* forward jump */ while (line > out_line) put_nl(); } } /* tok_show_ch - output single-character token (not newline) */ void tok_show_ch(t) register struct token *t; { CHECK_LINE_CONTROL(t->path, t->line); put_ch(t->tokno); /* show token contents */ } /* tok_show - output (possibly composite) token */ void tok_show(t) register struct token *t; { register struct token *p; if (t->tokno == TOK_LIST) { register struct token *s; /* * This branch is completely in terms of tok_xxx() primitives, so * there is no need to check the line control information. */ for (s = t->head; s; s = s->next) { tok_show_ch(s); /* '(' or ',' or ')' */ for (p = s->head; p; p = p->next) tok_show(p); /* show list element */ } } else { register char *cp = t->vstr->str; /* * Measurements show that it pays off to give special treatment to * single-character tokens. Note that both types of token may cause a * change of output line number. */ CHECK_LINE_CONTROL(t->path, t->line); if (cp[1] == 0) { put_ch(*cp); /* single-character token */ } else { put_str(cp); /* multi_character token */ } out_line = t->end_line; /* may span multiple lines */ for (p = t->head; p; p = p->next) tok_show(p); /* trailing blanks */ } }