1 files changed, 612 insertions, 0 deletions
diff --git a/unproto/tok_io.c b/unproto/tok_io.c
new file mode 100644
index 0000000..74ae6bc
--- /dev/null
+++ b/unproto/tok_io.c
@@ -0,0 +1,612 @@
+/*++
+/* NAME
+/*	tok_io 3
+/* SUMMARY
+/*	token I/O
+/* PACKAGE
+/*	unproto
+/* SYNOPSIS
+/*	#include "token.h"
+/*
+/*	struct token *tok_get()
+/*
+/*	void tok_flush(t)
+/*	struct token *t;
+/*
+/*	void tok_show(t)
+/*	struct token *t;
+/*
+/*	void tok_show_ch(t)
+/*	struct token *t;
+/*
+/*	void put_str(s)
+/*	char *s;
+/*
+/*	void put_ch(c)
+/*	int c;
+/*
+/*	void put_nl()
+/*
+/*	char *in_path;
+/*	int in_line;
+/* DESCRIPTION
+/*	These functions read from stdin and write to stdout. The
+/*	tokenizer keeps track of where the token appeared in the input
+/*	stream; on output, this information is used to preserve correct
+/*	line number information (even after lots of token lookahead or
+/*	after function-header rewriting) so that diagnostics from the
+/*	next compiler stage make sense.
+/*
+/*	tok_get() reads the next token from standard input. It returns
+/*	a null pointer when the end of input is reached.
+/*
+/*	tok_show() displays the contents of a (possibly composite) token
+/*	on the standard output.
+/*
+/*	tok_show_ch() displays the contents of a single-character token
+/*	on the standard output. The character should not be a newline.
+/*
+/*	tok_flush() displays the contents of a (possibly composite) token
+/*	on the standard output and makes it available for re-use.
+/*
+/*	put_str() writes a null-terminated string to standard output.
+/*	There should be no newline characters in the string argument.
+/*
+/*	put_ch() writes one character to standard output. The character
+/*	should not be a newline.
+/*
+/*	put_nl() outputs a newline character and adjusts the program's idea of
+/*	the current output line.
+/*
+/*	The in_path and in_line variables contain the file name and
+/*	line number of the most recently read token.
+/* BUGS
+/*	The tokenizer is just good enough for the unproto filter.
+/*	As a benefit, it is quite fast.
+/* AUTHOR(S)
+/*	Wietse Venema
+/*	Eindhoven University of Technology
+/*	Department of Mathematics and Computer Science
+/*	Den Dolech 2, P.O. Box 513, 5600 MB Eindhoven, The Netherlands
+/* LAST MODIFICATION
+/*	92/01/15 21:52:59
+/* VERSION/RELEASE
+/*	1.3
+/*--*/
+
+static char io_sccsid[] = "@(#) tok_io.c 1.3 92/01/15 21:52:59";
+
+/* C library */
+
+#include <stdio.h>
+#include <ctype.h>
+
+extern char *strchr();
+extern char *malloc();
+extern char *realloc();
+extern char *strcpy();
+
+/* Application-specific stuff */
+
+#include "token.h"
+#include "vstring.h"
+#include "error.h"
+
+extern char *strsave();			/* XXX need include file */
+
+/* Stuff to keep track of original source file name and position */
+
+static char def_path[] = "";		/* placeholder until a path is known */
+
+char   *in_path = def_path;		/* current input file name */
+int     in_line = 1;			/* current input line number */
+
+static char *out_path = def_path;	/* last name in output line control */
+static int out_line = 1;		/* current output line number */
+int     last_ch;			/* last output; '\n' = at start of line */
+
+/* Forward declarations */
+
+static int read_quoted();		/* string/character literal reader */
+static void read_comment();		/* comment body reader */
+static int backslash_newline();		/* backslash-newline bookkeeping */
+static char *read_hex();		/* \x escape rewriter */
+static char *read_octal();		/* octal escape rewriter */
+static void fix_line_control();		/* output path/line resync */
+
+ /*
+  * Character input with one level of pushback. The INPUT() macro recursively
+  * strips backslash-newline pairs from the input stream. The UNPUT() macro
+  * should be used only for characters obtained through the INPUT() macro.
+  * 
+  * After skipping a backslash-newline pair, the input line counter is not
+  * updated, and we continue with the same logical source line. We just
+  * update a counter with the number of backslash-newline sequences that must
+  * be accounted for (backslash_newline() updates the counter). At the end of
+  * the logical source line, an appropriate number of newline characters is
+  * pushed back (in tok_get()). I do not know how GCC handles this, but it
+  * seems to produce the same output.
+  * 
+  * Because backslash_newline() recursively calls itself (through the INPUT()
+  * macro), we will run out of stack space, given a sufficiently long
+  * sequence of backslash-newline pairs.
+  */
+
+static int in_char = 0;			/* push-back storage, as read (int) */
+static int in_flag = 0;			/* pushback available */
+static int nl_compensate = 0;		/* line continuation kluge */
+/* in_char is an int: in a signed char, pushed-back byte 0xff would read as EOF */
+#define INPUT(c) (in_flag ? (in_flag = 0, (c) = in_char) : \
+		    ((c) = getchar()) != '\\' ? (c) : \
+		    ((c) = getchar()) != '\n' ? (ungetc((c), stdin), (c) = '\\') : \
+		    ((c) = backslash_newline()))
+#define	UNPUT(c) (in_flag = 1, in_char = (c))
+
+/* Directives that should be ignored (only when IGNORE_DIRECTIVES is defined). */
+
+#ifdef IGNORE_DIRECTIVES
+
+static char *ignore_directives[] = {
+    IGNORE_DIRECTIVES,			/* list of directive names */
+    0,					/* terminator for table scans */
+};
+
+#endif
+
+/* Modified string and ctype stuff. Macro arguments are parenthesized. ISHEX */
+/* uses isxdigit(): strchr(s, 0) finds the terminator, so NUL looked like hex. */
+#define	STREQUAL(x,y)	(*(x) == *(y) && strcmp((x),(y)) == 0)
+
+#define	ISALNUM(c)	(isalnum(c) || (c) == '_')
+#define	ISALPHA(c)	(isalpha(c) || (c) == '_')
+#define	ISSPACE(c)	(isspace(c) && (c) != '\n')
+#define	ISDOT(c)	((c) == '.')
+#define	ISHEX(c)	(isxdigit(c) != 0)
+#define	ISOCTAL(c)	(isdigit(c) && (c) != '8' && (c) != '9')
+
+/* Collect all characters that satisfy one condition */
+
+#define	COLLECT(v,c,cond) { \
+				register struct vstring *vs = v; \
+				register char *cp = vs->str; \
+				*cp++ = c; \
+				while (INPUT(c) != EOF) { \
+				    if (cond) { \
+					if (VS_ADDCH(vs, cp, c) == 0) \
+					    fatal("out of memory"); \
+				    } else { \
+					UNPUT(c); \
+					break; \
+				    } \
+				} \
+				*cp = 0; \
+			    }
+
+/* Ensure that output line information is correct */
+
+#define	CHECK_LINE_CONTROL(p,l) { if (out_path != (p) || out_line != (l)) \
+					fix_line_control((p),(l)); }
+
+/* do_control - parse control line; always returns 0 (the value is unused) */
+static int do_control()
+{
+    struct token *t;
+    int     tokno;			/* saved before a token is released */
+    int     line;
+    char   *path = in_path;		/* unchanged unless a name is given */
+
+    /* Make sure that the directive shows up in the right place. */
+    CHECK_LINE_CONTROL(in_path, in_line);
+    while (t = tok_get()) {
+	switch (t->tokno) {
+	case TOK_WSPACE:
+	    /* Ignore blanks after "#" token. */
+	    tok_free(t);
+	    break;
+
+	case TOK_NUMBER:
+	    /*
+	     * Line control is of the form: number pathname junk. Since we
+	     * have no idea what junk the preprocessor may generate, we copy
+	     * all line control tokens to stdout.
+	     */
+	    put_str("# ");
+	    line = atoi(t->vstr->str);		/* extract line number */
+	    tok_flush(t);
+	    while ((t = tok_get()) && t->tokno == TOK_WSPACE)
+		tok_flush(t);			/* copy white space */
+	    if (t) {				/* extract path name */
+		if (t->tokno == '"')
+		    path = strsave(t->vstr->str);
+		do {
+		    tokno = t->tokno;		/* t is gone after the flush */
+		    tok_flush(t);		/* copy until newline */
+		} while (tokno != '\n' && (t = tok_get()));
+	    }
+	    out_line = in_line = line;		/* synchronize */
+	    out_path = in_path = path;		/* synchronize */
+	    return (0);
+
+#ifdef IGNORE_DIRECTIVES
+	case TOK_WORD:
+	    /*
+	     * Optionally ignore other #directives. This is only a partial
+	     * solution, because the preprocessor will still see them.
+	     */
+	    {
+		char  **cpp;
+		char   *cp = t->vstr->str;
+
+		for (cpp = ignore_directives; *cpp; cpp++) {
+		    if (STREQUAL(cp, *cpp)) {
+			do {
+			    tokno = t->tokno;	/* t is gone after the free */
+			    tok_free(t);
+			} while (tokno != '\n' && (t = tok_get()));
+			return (0);
+		    }
+		}
+	    }
+	    /* FALLTHROUGH */
+#endif
+	default:
+	    /* Pass through. */
+	    put_ch('#');
+	    do {
+		tokno = t->tokno;		/* t is gone after the flush */
+		tok_flush(t);
+	    } while (tokno != '\n' && (t = tok_get()));
+	    return (0);
+	case 0:
+	    /* Token type 0, e.g. a NUL byte in the input: punt. */
+	    tok_free(t);
+	    put_ch('#');
+	    return (0);
+	}
+    }
+    /* Hit EOF before the directive was identified (tok_get() gave null). */
+    put_ch('#');
+    return (0);
+}
+
+/* backslash_newline - count a skipped backslash-newline, fetch next char */
+
+static int backslash_newline()
+{
+    register int next;			/* character after the skipped pair */
+    nl_compensate += 1;			/* compensated for in tok_get() */
+    INPUT(next);			/* may recurse on another pair */
+    return (next);
+}
+
+/* tok_get - read the next token; a null pointer signals end of input */
+
+static int last_tokno = '\n';		/* type of the most recent token */
+
+struct token *tok_get()
+{
+    register struct token *t;
+    register int c;
+    int     d;
+
+    /*
+     * Get one from the pool and fill it in. The loop is here in case we hit
+     * a preprocessor control line, which happens in a minority of all cases.
+     * We update the token input path and line info *after* backslash-newline
+     * processing or the newline compensation would go wrong.
+     */
+
+    t = tok_alloc();
+
+    for (;;) {
+	if ((INPUT(c)) == EOF) {		/* end of input: no token */
+	    tok_free(t);
+	    return (0);
+	} else if ((t->line = in_line, t->path = in_path), !isascii(c)) {
+	    t->vstr->str[0] = c;		/* non-ASCII: one-character token */
+	    t->vstr->str[1] = 0;
+	    t->tokno = TOK_OTHER;
+	    break;
+	} else if (ISSPACE(c)) {		/* run of blanks, newline excluded */
+	    COLLECT(t->vstr, c, ISSPACE(c));
+	    t->tokno = TOK_WSPACE;
+	    break;
+	} else if (ISALPHA(c)) {		/* letters, digits, underscores */
+	    COLLECT(t->vstr, c, ISALNUM(c));
+	    t->tokno = TOK_WORD;
+	    break;
+	} else if (isdigit(c)) {		/* run of decimal digits */
+	    COLLECT(t->vstr, c, isdigit(c));
+	    t->tokno = TOK_NUMBER;
+	    break;
+	} else if (c == '"' || c == '\'') {
+	    t->tokno = read_quoted(t->vstr, c);	/* detect missing end quote */
+	    break;
+	} else if (ISDOT(c)) {			/* run of dots */
+	    COLLECT(t->vstr, c, ISDOT(c));
+	    t->tokno = TOK_OTHER;
+	    break;
+	} else if (c == '#' && last_tokno == '\n') {	/* '#' right after newline */
+	    do_control();
+	    continue;
+	} else {
+	    t->vstr->str[0] = c;
+	    if (c == '\n') {
+		in_line++;
+		if (nl_compensate > 0) {	/* compensation for bs-nl */
+		    UNPUT('\n');
+		    nl_compensate--;
+		}
+	    } else if (c == '/') {
+		if ((INPUT(d)) == '*') {
+		    t->vstr->str[1] = d;	/* comment */
+		    read_comment(t->vstr);
+		    t->tokno = TOK_WSPACE;
+		    break;
+		} else {
+		    if (d != EOF)
+			UNPUT(d);
+		}
+	    } else if (c == '\\') {		/* backslash plus next character */
+		t->vstr->str[1] = (INPUT(c) == EOF ? 0 : c);
+		t->vstr->str[2] = 0;
+		t->tokno = TOK_OTHER;
+		break;
+	    }
+	    t->vstr->str[1] = 0;		/* other: the character is the type */
+	    t->tokno = c;
+	    break;
+	}
+    }
+    last_tokno = t->tokno;
+    t->end_line = in_line;			/* in_line may have advanced */
+    return (t);
+}
+
+/* read_quoted - read string or character literal, canonicalize escapes */
+static int read_quoted(vs, ch)
+register struct vstring *vs;
+int     ch;
+{
+    register char *dst = vs->str;
+    register int c;
+
+    /*
+     * The literal is stored with its quotes. A properly closed literal yields
+     * its quote character; a premature newline or EOF yields TOK_OTHER, so
+     * that broken literals are never concatenated with string constants.
+     */
+    *dst++ = ch;
+    while (INPUT(c) != EOF) {
+	if (c == '\n') {			/* newline in literal */
+	    UNPUT(c);
+	    break;
+	}
+	if (VS_ADDCH(vs, dst, c) == 0)		/* store character */
+	    fatal("out of memory");
+	if (c == ch) {				/* closing quote: done */
+	    *dst = 0;
+	    return (c);
+	}
+	if (c != '\\')				/* ordinary character */
+	    continue;
+	if ((INPUT(c)) == EOF)			/* EOF after backslash, punt */
+	    break;
+	switch (c) {
+	case 'a':				/* \a -> audible bell */
+	    if ((dst = vs_strcpy(vs, dst, BELL)) == 0)
+		fatal("out of memory");
+	    break;
+	case 'x':				/* \xhh -> \nnn */
+	    dst = read_hex(vs, dst);
+	    break;
+	default:
+	    if (ISOCTAL(c) && ch != '\'') {	/* canonicalize \octal */
+		dst = read_octal(vs, dst, c);
+	    } else if (VS_ADDCH(vs, dst, c) == 0) {
+		fatal("out of memory");		/* \other: left alone */
+	    }
+	}
+    }
+    *dst = 0;
+    return (TOK_OTHER);
+}
+
+/* read_comment - stuff a whole comment into one huge token */
+static void read_comment(vs)
+register struct vstring *vs;
+{
+    register char *cp = vs->str + 2;	/* skip slash star */
+    register int c;
+    register int d;
+
+    /*
+     * Copy up to and including the closing star-slash, or up to EOF. A
+     * backslash is an ordinary character here: it does not hide the end of
+     * the comment, and every newline that is stored is counted once.
+     */
+    while (INPUT(c) != EOF) {
+	if (VS_ADDCH(vs, cp, c) == 0)
+	    fatal("out of memory");
+	if (c == '\n') {
+	    in_line++;
+	} else if (c == '*') {
+	    if ((INPUT(d)) == '/') {
+		if (VS_ADDCH(vs, cp, d) == 0)
+		    fatal("out of memory");
+		break;
+	    }
+	    if (d != EOF)
+		UNPUT(d);			/* examine it on the next pass */
+	}
+    }
+    *cp = 0;
+}
+
+/* read_hex - rewrite hex escape to three-digit octal escape */
+
+static char *read_hex(vs, cp)
+struct vstring *vs;
+register char *cp;
+{
+    register int c;
+    register int i;
+    char    buf[BUFSIZ];
+    int     len;
+    unsigned val = 0;
+
+    /*
+     * Eat up all subsequent hex digits, keeping one byte for the terminator.
+     * Complain later when there are too many. NUL is never taken as a digit.
+     */
+
+    for (i = 0; i < sizeof(buf) - 1 && INPUT(c) != EOF && c != 0 && ISHEX(c); i++)
+	buf[i] = c;
+    buf[i] = 0;
+
+    if (i < sizeof(buf) - 1 && c != EOF)	/* stopped at a non-digit */
+	UNPUT(c);
+
+    /*
+     * Convert hex form to three-digit octal form. The three-digit form is
+     * used so that strings can be concatenated without problems. Complain
+     * about malformed input; truncate the result to at most three octal
+     * digits.
+     */
+
+    if (i == 0) {
+	error("\\x escape sequence without hexadecimal digits");
+	if (VS_ADDCH(vs, cp, 'x') == 0)
+	    fatal("out of memory");
+    } else {
+	(void) sscanf(buf, "%x", &val);
+	sprintf(buf, "%03o", val);
+	if ((len = strlen(buf)) > 3)
+	    error("\\x escape sequence yields non-character value");
+	if ((cp = vs_strcpy(vs, cp, buf + len - 3)) == 0)
+	    fatal("out of memory");
+    }
+    return (cp);
+}
+
+/* read_octal - convert octal escape to three-digit format */
+
+static char obuf[] = "00123";		/* "00" padding, then up to 3 digits */
+
+static char *read_octal(vs, cp, c)
+register struct vstring *vs;
+register char *cp;
+register int c;
+{
+    register int i;
+
+#define	buf_input (obuf + 2)
+
+    /* Eat up at most three octal digits; c holds the first one. */
+
+    buf_input[0] = c;
+    for (i = 1; i < 3 && (INPUT(c) != EOF) && ISOCTAL(c); i++)
+	buf_input[i] = c;
+    buf_input[i] = 0;
+
+    if (i < 3 && c != EOF)			/* stopped at a non-digit */
+	UNPUT(c);
+
+    /*
+     * Leave three-digit octal escapes alone. Convert one-digit and two-digit
+     * octal escapes to three-digit form by prefixing them with a suitable
+     * number of '0' characters. This is done so that strings can be
+     * concatenated without problems.
+     */
+
+    if ((cp = vs_strcpy(vs, cp, buf_input + i - 3)) == 0)
+	fatal("out of memory");
+    return (cp);
+}
+
+/* put_nl - write a newline and count it as one more output line */
+
+void    put_nl()
+{
+    put_ch('\n');
+    out_line += 1;
+}
+
+/* fix_line_control - to adjust path and/or line count info in output */
+
+static void fix_line_control(path, line)
+register char *path;
+register int line;
+{
+    /*
+     * This runs only now and then, so repeating some of the tests that the
+     * caller already made costs little.
+     * 
+     * First close any partially written output line. Then either emit a
+     * "# line path" control (different file, or target line already
+     * passed), or pad with newlines until the target line is reached.
+     */
+
+    if (last_ch != '\n')			/* terminate open line */
+	put_nl();
+    if (path == out_path && line >= out_line) {	/* forward jump */
+	while (out_line < line)
+	    put_nl();
+	return;
+    }
+    out_line = line;				/* file switch or back jump */
+    out_path = path;
+    printf("# %d %s\n", out_line, out_path);
+    last_ch = '\n';
+}
+
+/* tok_show_ch - write a one-character token (never a newline) to stdout */
+
+void    tok_show_ch(t)
+register struct token *t;
+{
+    if (out_path != t->path || out_line != t->line)	/* position drifted */
+	fix_line_control(t->path, t->line);
+    put_ch(t->tokno);				/* show token contents */
+}
+
+/* tok_show - output (possibly composite) token */
+
+void    tok_show(t)
+register struct token *t;
+{
+    register struct token *sub;
+    register struct token *elem;
+    register char *text;
+
+    if (t->tokno != TOK_LIST) {
+	/*
+	 * Plain token: bring the output position in line with the token's
+	 * origin, write its text (a single character is written with
+	 * put_ch(), anything longer with put_str()), note where the token
+	 * ended, and finish with the tokens hanging off it.
+	 */
+
+	text = t->vstr->str;
+	CHECK_LINE_CONTROL(t->path, t->line);
+	if (text[1] != 0)
+	    put_str(text);			/* multi_character token */
+	else
+	    put_ch(*text);			/* single-character token */
+	out_line = t->end_line;			/* may span multiple lines */
+	for (sub = t->head; sub != 0; sub = sub->next)
+	    tok_show(sub);			/* trailing blanks */
+	return;
+    }
+
+    /*
+     * List token: each member is a '(' or ',' or ')' token followed by the
+     * tokens of one list element. Everything is written through tok_xxx()
+     * primitives, which take care of the line control information.
+     */
+    for (sub = t->head; sub != 0; sub = sub->next) {
+	tok_show_ch(sub);			/* '(' or ',' or ')' */
+	for (elem = sub->head; elem != 0; elem = elem->next)
+	    tok_show(elem);			/* show list element */
+    }
+}