utils/unlit/unlit.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402

/* unlit.c                                   Wed Dec  5 17:16:24 GMT 1990
 *
 * Literate script filter.  In contrast with the format used by most
 * programming languages, a literate script is a program in which
 * comments are given the leading role, whilst program text must be
 * explicitly flagged as such by placing a `>' character in the first
 * column on each line.  It is hoped that this style of programming will
 * encourage the writing of accurate and clearly documented programs
 * in which the writer may include motivating arguments, examples
 * and explanations.
 *
 * Unlit is a filter that can be used to strip all of the comment lines
 * out of a literate script file.  The command format for unlit is:
 *              unlit [-n] [-q] ifile ofile
 * where ifile and ofile are the names of the input (literate script) and
 * output (raw program) files respectively.  Either of these names may
 * be `-' representing the standard input or the standard output resp.
 * A number of rules are used in an attempt to guard against the most
 * common errors that are made when writing literate scripts:
 * 1) Empty script files are not permitted.  A file in which no lines
 *    begin with `>' usually indicates a file in which the programmer
 *    has forgotten about the literate script convention.
 * 2) A line containing part of program definition (i.e. preceded by `>')
 *    cannot be used immediately before or after a comment line unless
 *    the comment line is blank.  This error usually indicates that
 *    the `>' character has been omitted from a line in a section of
 *    program spread over a number of lines.
 * Using the -q (quiet) flag suppresses the signalling of these error
 * conditions.  The default behaviour can be selected explicitly using
 * the -n (noisy) option so that any potential errors in the script file
 * are reported.
 *
 * The original idea for the use of literate scripts is due to Richard
 * Bird of the Programming Research Group, Oxford and was initially
 * adopted for use in the implementation of the functional programming
 * language Orwell used for teaching in Oxford.  This idea has subsequently
 * been borrowed in a number of other language implementations.
 *
 * Modified to understand \begin{code} ... \end{code} used in Glasgow.  -- LA
 * And \begin{pseudocode} ... \end{pseudocode}.  -- LA
 */

#include "fs.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

/* Null string pointer constant. */
#define NULLSTR        ((char *)0)
/* Character that flags a program line when it appears in the first column. */
#define DEFNCHAR       '>'
/* Diagnostics handed to complain(), which appends the newline itself. */
#define MISSINGBLANK   "unlit: Program line next to comment"
#define EMPTYSCRIPT    "unlit: No definitions in file (perhaps you forgot the '>'s?)"
/* Messages used directly as fprintf() format strings; CANNOTOPEN and
 * CANNOTWRITE expect the file name as their single %s argument. */
#define USAGE          "usage: unlit [-q] [-n] [-c] [-#] [-P] [-h label] file1 file2\n"
#define CANNOTOPEN     "unlit: cannot open \"%s\"\n"
#define CANNOTWRITE    "unlit: error writing \"%s\"\n"
#define CANNOTWRITESTDOUT "unlit: error writing standard output\n"
#define DISTINCTNAMES  "unlit: input and output filenames must differ\n"
/* Also handed to complain(); these already end in a newline, so the report
 * is followed by an empty line. */
#define MISSINGENDCODE "unlit: missing \\end{code}\n"
#define SPURIOUSENDCODE "unlit: spurious \\end{code}\n"

/* Delimiters of a \begin{code} ... \end{code} block, together with their
 * lengths in characters (not counting the terminating NUL). */
#define BEGINCODE "\\begin{code}"
#define LENBEGINCODE 12
#define ENDCODE "\\end{code}"
#define LENENDCODE 10
#if defined(PSEUDOCODE)
/* According to Will Partain, the inventor of pseudocode, this is gone now.
 * The definitions below are only compiled when PSEUDOCODE is defined. */
#define MISSINGENDPSEUDOCODE "unlit: missing \\end{pseudocode}\n"
#define BEGINPSEUDOCODE "\\begin{pseudocode}"
#define LENBEGINPSEUDOCODE 18
#define ENDPSEUDOCODE "\\end{pseudocode}"
#define LENENDPSEUDOCODE 16
#endif

/* Classification of an input line as produced by readline().  START is the
 * state before any line has been read. */
typedef enum { START, BLANK, TEXT, DEFN, BEGIN, END, /*PSEUDO,*/ ENDFILE, HASH, SHEBANG } line;

/* Character tests used while scanning a line.  The argument is
 * parenthesized so that any expression (e.g. a conditional) is classified
 * as a whole.  It is still evaluated more than once, so do not pass an
 * expression with side effects. */
#define isWhitespace(c)  ((c)==' '  || (c)=='\t' || (c)=='\r')
#define isLineTerm(c)    ((c)=='\n' || (c)==EOF)

/* Options and state shared by the whole filter. */
static int noisy  = 1;   /* 0 => keep quiet about errors, 1 => report errors (-q / -n) */
static int errors = 0;   /* number of errors reported by complain(); decides the exit status */
static int crunchnl = 0; /* 1 => don't print \n for removed lines (-c) */
static int leavecpp = 1; /* 1 => copy lines starting with '#' to the output */
static int ignore_shebang = 1; /* 1 => leave out shebang (#!) lines; cleared by -# */
static int no_line_pragma = 0; /* 1 => leave out the initial #line pragma (-P) */

static char* prefix_str = NULL; /* file name placed in the initial #line pragma (-h label) */

static char *ofilename = NULL; /* output file name as given on the command line; used by writeerror() */

/* complain(file,line,what)
 *
 * Report the problem `what', found at line `lin' of `file', on stderr and
 * add one to the global error count.  The file name is omitted when `file'
 * is a null pointer.  When noisy is not set nothing is printed and nothing
 * is counted.
 */

static void complain(char *file, int lin, char *what)
{
    if (!noisy)
        return;

    if (file != NULL)
        fprintf(stderr, "%s ", file);
    fprintf(stderr, "line %d: %s\n", lin, what);
    errors += 1;
}

/* Report a failed write to the output on stderr and terminate the program
 * with exit status 1.  The output is identified by ofilename, where "-"
 * stands for the standard output.
 */
static void writeerror(void)
{
    const int to_stdout = (strcmp(ofilename, "-") == 0);

    if (to_stdout)
        fprintf(stderr, CANNOTWRITESTDOUT);
    else
        fprintf(stderr, CANNOTWRITE, ofilename);

    exit(1);
}

/* Write the character `c' to `ostream'.  A failed write is fatal: it is
 * reported through writeerror(), which does not return.
 */
static void myputc(char c, FILE *ostream)
{
    int rc = putc(c, ostream);

    if (rc == EOF)
        writeerror();
}

#define TABPOS 8

/* Like getc(), except that every TAB is replaced by the number of spaces
 * needed to reach the next multiple of TABPOS columns.  The current column
 * and the spaces still owed for a TAB live in static variables, so the
 * state is shared by all callers regardless of the stream passed in.
 * A newline or form feed resets the column to zero.
 */
static int egetc(FILE *istream)
{
    static int pending = 0;  /* spaces still to hand out for the last TAB */
    static int column = 0;   /* characters returned so far on this line   */
    int ch;

    /* Finish a TAB expansion in progress before touching the stream. */
    if (pending > 0) {
        --pending;
        ++column;
        return ' ';
    }

    ch = getc(istream);
    switch (ch) {
    case EOF:
        return ch;
    case '\n':
    case '\f':
        column = 0;
        return ch;
    case '\t':
        /* One space is returned right away, the rest are left pending. */
        pending = TABPOS - column % TABPOS - 1;
        ++column;
        return ' ';
    default:
        ++column;
        return ch;
    }
}

/* readline(istream, ostream)
 *
 * Read a line from the input stream `istream' (through egetc, so TABs
 * arrive already expanded) and return a value indicating whether that
 * line was:
 *     BLANK   (whitespace only),
 *     DEFN    (first character is DEFNCHAR),
 *     TEXT    (a line of text),
 *     BEGIN   (a \begin{code} line),
 *     PSEUDO  (a \begin{pseudocode} line; only if PSEUDOCODE is defined),
 *     HASH    (a preprocessor line, i.e. `#' in the first column),
 *     SHEBANG (a line starting with `#!', while ignore_shebang is set),
 *     END     (a (spurious) \end{code} line)
 * or  ENDFILE (indicating an EOF).
 *
 * Output written to `ostream':
 *  - DEFN lines are copied with the leading DEFNCHAR replaced by a space;
 *  - HASH lines are copied unchanged;
 *  - SHEBANG lines produce nothing here (the caller emits the newline);
 *  - every other line is replaced by an empty line, unless crunchnl is
 *    set, so that error messages referring to line numbers in the output
 *    file can also be used to locate the corresponding line in the input
 *    stream.
 */

static line readline(FILE *istream, FILE *ostream) {
    int c, c1;
    char buf[100];
    size_t i;
    int overflow = 0;   /* non-blank characters did not fit into buf */

    c = egetc(istream);

    if (c==EOF)
        return ENDFILE;

    if ( c == '#' ) {
      if ( ignore_shebang ) {
         c1 = egetc(istream);
         if ( c1 == '!' ) {
           /* Discard the rest of the shebang line. */
           while (c=egetc(istream), !isLineTerm(c)) ;
           return SHEBANG;
         }
         myputc(c, ostream);
         c=c1;
      }
      if ( leavecpp ) {
        /* `c' is either the '#' itself or the character following it.  In
         * the latter case it may already be the line terminator (a line
         * consisting of a lone '#'), so test it before writing it and
         * before reading any further; otherwise the following line would
         * be swallowed into this one. */
        while (!isLineTerm(c)) {
            myputc(c, ostream);
            c = egetc(istream);
        }
        myputc('\n',ostream);
        return HASH;
      }
    }

    if (c==DEFNCHAR) {
        /* Keep the columns aligned: a space stands in for the DEFNCHAR. */
        myputc(' ',ostream);
        while (c=egetc(istream), !isLineTerm(c))
            myputc(c,ostream);
        myputc('\n',ostream);
        return DEFN;
    }

    if (!crunchnl)
        myputc('\n',ostream);

    while (isWhitespace(c))
        c=egetc(istream);
    if (isLineTerm(c))
        return BLANK;

    /* Collect the line, starting at its first non-blank character, so it
     * can be compared with the code block delimiters.  Characters beyond
     * the buffer are dropped; remember whether any of them mattered. */
    i = 0;
    buf[i++] = (char)c;
    while (c=egetc(istream), !isLineTerm(c)) {
        if (i < sizeof buf - 1)
            buf[i++] = (char)c;
        else if (!isspace(c))
            overflow = 1;
    }
    /* Strip trailing white space.  The cast keeps isspace() well defined
     * for characters with the top bit set when char is signed. */
    while(i > 0 && isspace((unsigned char)buf[i-1]))
        i--;
    buf[i] = 0;

    /* A line with text past the end of the buffer cannot be one of the
     * (short) delimiters, even if its truncated prefix looks like one. */
    if (overflow)
        return TEXT;

    if (strcmp(buf, BEGINCODE) == 0)
        return BEGIN;
    if (strcmp(buf, ENDCODE) == 0)
        return END;
#if defined(PSEUDOCODE)
    else if (strcmp(buf, BEGINPSEUDOCODE) == 0)
        return PSEUDO;
#endif
    else
        return TEXT;
}


/* unlit(file,istream,ostream)
 *
 * Copy the file named `file', accessed using the input stream `istream'
 * to the output stream `ostream', removing any comments and checking
 * for bad use of literate script features:
 *  - there should be at least one BLANK line between a DEFN and TEXT
 *  - there should be at least one DEFN line (or \begin{code} block) in
 *    a script
 *  - an \end{code} without a matching \begin{code} is reported.
 * Problems are reported through complain().  A \begin{code} block that
 * is still open at end of input is reported and then terminates the
 * program with exit status 1.  Write errors are fatal (see writeerror).
 */

static void unlit(char *file, FILE *istream, FILE *ostream)
{
    line prev, cur = START;
    int  linesread = 0;
    int  defnsread = 0;

    do {
        prev = cur;
        cur = readline(istream, ostream);
        linesread++;
        if (cur == DEFN)
            defnsread++;
        if (prev == DEFN && cur == TEXT)
            complain(file, linesread-1, MISSINGBLANK);
        if (prev == TEXT && cur == DEFN)
            complain(file, linesread, MISSINGBLANK);
        if (cur == END)
            complain(file, linesread, SPURIOUSENDCODE);
        if (cur == BEGIN) {
            /* start of code, copy to end */
            char lineb[1000];
            int  at_bol = 1;  /* next chunk starts a new input line */
            for(;;) {
                size_t len;

                if (fgets(lineb, sizeof lineb, istream) == NULL) {
                    complain(file, linesread, MISSINGENDCODE);
                    exit(1);
                }
                /* fgets hands over long lines in several chunks.  Only the
                 * first chunk counts as a line and only it may be the
                 * closing delimiter. */
                if (at_bol) {
                    linesread++;
                    if (strncmp(lineb,ENDCODE,LENENDCODE) == 0) {
                        /* The delimiter line becomes an empty line. */
                        myputc('\n', ostream);
                        break;
                    }
                }
                /* Treat a failed write like myputc does. */
                if (fputs(lineb, ostream) == EOF)
                    writeerror();
                len = strlen(lineb);
                at_bol = (len > 0 && lineb[len-1] == '\n');
            }
            /* A code block counts as a definition for the empty-script check. */
            defnsread++;
        }
#if defined(PSEUDOCODE)
        if (cur == PSEUDO) {
            /* Pseudocode is dropped: one empty line per line read. */
            char lineb[1000];
            for(;;) {
                if (fgets(lineb, sizeof lineb, istream) == NULL) {
                    complain(file, linesread, MISSINGENDPSEUDOCODE);
                    exit(1);
                }
                linesread++;
                myputc('\n', ostream);
                if (strncmp(lineb,ENDPSEUDOCODE,LENENDPSEUDOCODE) == 0) {
                    break;
                }
            }
        }
#endif
        if (cur == SHEBANG) {
            /* readline() wrote nothing for this line; keep the line count. */
            myputc('\n', ostream);
        }
    } while(cur != ENDFILE);

    if (defnsread==0)
        complain(file,linesread,EMPTYSCRIPT);
}

/* main(argc, argv)
 *
 * Main program.  Processes command line arguments, looking for leading:
 *  -q        quiet mode - do not complain about bad literate script files
 *  -n        noisy mode - complain about bad literate script files.
 *  -c        do not output an empty line for each removed line.
 *  -P        don't output the initial #line pragma.
 *  -h label  name to put into the initial #line pragma; without this
 *            option no pragma is written.  The last -h given wins.
 *  -#        do not treat lines starting with `#!' specially.
 * Expects two additional arguments, a file name for the input and a file
 * name for the output file.  These two names must normally be distinct.
 * An exception is made for the special name "-" which can be used in either
 * position to specify the standard input or the standard output respectively.
 *
 * Exit status is 0 when no errors were reported and 1 otherwise (including
 * usage errors, files that cannot be opened and write errors).
 */

int main(int argc,char **argv)
{
    FILE *istream, *ostream;
    char *file;

    for (argc--, argv++; argc > 0; argc--, argv++) {
        if (strcmp(*argv,"-n")==0)
            noisy = 1;
        else if (strcmp(*argv,"-q")==0)
            noisy = 0;
        else if (strcmp(*argv,"-c")==0)
            crunchnl = 1;
        else if (strcmp(*argv,"-P")==0)
            no_line_pragma = 1;
        else if (strcmp(*argv,"-h")==0) {
            if (argc > 1) {
                argc--; argv++;
                /* The argument strings stay valid until the program ends,
                 * so point at the label directly instead of copying it;
                 * this removes the allocation that could fail silently. */
                prefix_str = *argv;
            }
        } else if (strcmp(*argv,"-#")==0)
            ignore_shebang = 0;
        else
            break;
    }

    if (argc!=2) {
        fprintf(stderr, USAGE);
        exit(1);
    }

    if (strcmp(argv[0],argv[1])==0 && strcmp(argv[0],"-")!=0) {
        fprintf(stderr, DISTINCTNAMES);
        exit(1);
    }

    file = argv[0];
    if (strcmp(argv[0], "-")==0) {
        istream = stdin;
        file    = "stdin";
    }
    else
        if ((istream=__hs_fopen(argv[0], "r")) == NULL) {
            fprintf(stderr, CANNOTOPEN, argv[0]);
            exit(1);
        }

    /* Set before the first write so writeerror() can name the output. */
    ofilename=argv[1];
    if (strcmp(argv[1], "-")==0)
        ostream = stdout;
    else
        if ((ostream=__hs_fopen(argv[1], "w")) == NULL)  {
            fprintf(stderr, CANNOTOPEN, argv[1]);
            exit(1);
        }

    /* Prefix the output with line pragmas */
    if (!no_line_pragma && prefix_str) {
      /* Both GHC and CPP understand the #line pragma.
       * We used to throw in both a #line and a {-# LINE #-} pragma
       * here, but CPP doesn't understand {-# LINE #-} so it thought
       * the line numbers were off by one.  We could put the {-# LINE
       * #-} before the #line, but there's no point since GHC
       * understands #line anyhow.  --SDM 8/2003
       */
      if (fprintf(ostream, "#line 1 \"%s\"\n", prefix_str) < 0)
          writeerror();
    }

    unlit(file, istream, ostream);

    if (istream != stdin) fclose(istream);

    /* Buffered output may only fail when it is flushed, so check the final
     * flush for standard output just like the close of a regular file. */
    if (ostream == stdout) {
        if (fflush(ostream) == EOF) {
            writeerror();
        }
    } else {
        if (fclose(ostream) == EOF) {
            writeerror();
        }
    }

    exit(errors==0 ? 0 : 1);
}