summaryrefslogtreecommitdiff
path: root/src/ptx.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/ptx.c')
-rw-r--r--src/ptx.c2224
1 files changed, 2224 insertions, 0 deletions
diff --git a/src/ptx.c b/src/ptx.c
new file mode 100644
index 0000000..9c35596
--- /dev/null
+++ b/src/ptx.c
@@ -0,0 +1,2224 @@
+/* Permuted index for GNU, with keywords in their context.
+ Copyright (C) 1990, 1991, 1993, 1998-2006 Free Software Foundation, Inc.
+ François Pinard <pinard@iro.umontreal.ca>, 1988.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+ François Pinard <pinard@iro.umontreal.ca> */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include "system.h"
+#include "argmatch.h"
+#include "diacrit.h"
+#include "error.h"
+#include "quote.h"
+#include "quotearg.h"
+#include "regex.h"
+#include "xstrtol.h"
+
+/* The official name of this program (e.g., no `g' prefix). */
+#define PROGRAM_NAME "ptx"
+
+/* Note to translator: Please translate "F. Pinard" to "François
+ Pinard" if "ç" (c-with-cedilla) is available in the
+ translation's character set and encoding. */
+#define AUTHORS _("F. Pinard")
+
+/* Number of possible characters in a byte. */
+#define CHAR_SET_SIZE 256
+
+#define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
+#define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
+ : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
+#define OCTTOBIN(C) ((C) - '0')
+
+/* Debugging the memory allocator. */
+
+#if WITH_DMALLOC
+# define MALLOC_FUNC_CHECK 1
+# include <dmalloc.h>
+#endif
+
+/* Global definitions. */
+
+/* FIXME: There are many unchecked integer overflows in this file,
+ that will cause this command to misbehave given large inputs or
+ options. Many of the "int" values below should be "size_t" or
+ something else like that. */
+
+/* Reallocation step when swallowing non regular files. The value is not
+ the actual reallocation step, but its base two logarithm. */
+#define SWALLOW_REALLOC_LOG 12
+
+/* Imported from "regex.c". */
+#define Sword 1
+
+/* The name this program was run with. */
+char *program_name;
+
+/* Program options. */
+
+enum Format
+{
+ UNKNOWN_FORMAT, /* output format still unknown */
+ DUMB_FORMAT, /* output for a dumb terminal */
+ ROFF_FORMAT, /* output for `troff' or `nroff' */
+ TEX_FORMAT /* output for `TeX' or `LaTeX' */
+};
+
+static bool gnu_extensions = true; /* trigger all GNU extensions */
+static bool auto_reference = false; /* refs are `file_name:line_number:' */
+static bool input_reference = false; /* refs at beginning of input lines */
+static bool right_reference = false; /* output refs after right context */
+static int line_width = 72; /* output line width in characters */
+static int gap_size = 3; /* number of spaces between output fields */
+static const char *truncation_string = "/";
+ /* string used to mark line truncations */
+static const char *macro_name = "xx"; /* macro name for roff or TeX output */
+static enum Format output_format = UNKNOWN_FORMAT;
+ /* output format */
+
+static bool ignore_case = false; /* fold lower to upper for sorting */
+static const char *break_file = NULL; /* name of the `Break characters' file */
+static const char *only_file = NULL; /* name of the `Only words' file */
+static const char *ignore_file = NULL; /* name of the `Ignore words' file */
+
+/* Options that use regular expressions. */
+struct regex_data
+{
+ /* The original regular expression, as a string. */
+ char const *string;
+
+ /* The compiled regular expression, and its fastmap. */
+ struct re_pattern_buffer pattern;
+ char fastmap[UCHAR_MAX + 1];
+};
+
+static struct regex_data context_regex; /* end of context */
+static struct regex_data word_regex; /* keyword */
+
+/* A BLOCK delimit a region in memory of arbitrary size, like the copy of a
+ whole file. A WORD is something smaller, its length should fit in a
+ short integer. A WORD_TABLE may contain several WORDs. */
+
+typedef struct
+ {
+ char *start; /* pointer to beginning of region */
+ char *end; /* pointer to end + 1 of region */
+ }
+BLOCK;
+
+typedef struct
+ {
+ char *start; /* pointer to beginning of region */
+ short int size; /* length of the region */
+ }
+WORD;
+
+typedef struct
+ {
+ WORD *start; /* array of WORDs */
+ size_t alloc; /* allocated length */
+ size_t length; /* number of used entries */
+ }
+WORD_TABLE;
+
+/* Pattern description tables. */
+
+/* For each character, provide its folded equivalent. */
+static unsigned char folded_chars[CHAR_SET_SIZE];
+
+/* End of context pattern register indices. */
+static struct re_registers context_regs;
+
+/* Keyword pattern register indices. */
+static struct re_registers word_regs;
+
+/* A word characters fastmap is used only when no word regexp has been
+ provided. A word is then made up of a sequence of one or more characters
+ allowed by the fastmap. Contains !0 if character allowed in word. Not
+ only this is faster in most cases, but it simplifies the implementation
+ of the Break files. */
+static char word_fastmap[CHAR_SET_SIZE];
+
+/* Maximum length of any word read. */
+static int maximum_word_length;
+
+/* Maximum width of any reference used. */
+static int reference_max_width;
+
+/* Ignore and Only word tables. */
+
+static WORD_TABLE ignore_table; /* table of words to ignore */
+static WORD_TABLE only_table; /* table of words to select */
+
+/* Source text table, and scanning macros. */
+
+static int number_input_files; /* number of text input files */
+static int total_line_count; /* total number of lines seen so far */
+static const char **input_file_name; /* array of text input file names */
+static int *file_line_count; /* array of `total_line_count' values at end */
+
+static BLOCK text_buffer; /* file to study */
+
+/* SKIP_NON_WHITE used only for getting or skipping the reference. */
+
+#define SKIP_NON_WHITE(cursor, limit) \
+ while (cursor < limit && ! isspace (to_uchar (*cursor))) \
+ cursor++
+
+#define SKIP_WHITE(cursor, limit) \
+ while (cursor < limit && isspace (to_uchar (*cursor))) \
+ cursor++
+
+#define SKIP_WHITE_BACKWARDS(cursor, start) \
+ while (cursor > start && isspace (to_uchar (cursor[-1]))) \
+ cursor--
+
+#define SKIP_SOMETHING(cursor, limit) \
+ if (word_regex.string) \
+ { \
+ regoff_t count; \
+ count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
+ if (count == -2) \
+ matcher_error (); \
+ cursor += count == -1 ? 1 : count; \
+ } \
+ else if (word_fastmap[to_uchar (*cursor)]) \
+ while (cursor < limit && word_fastmap[to_uchar (*cursor)]) \
+ cursor++; \
+ else \
+ cursor++
+
+/* Occurrences table.
+
+ The `keyword' pointer provides the central word, which is surrounded
+ by a left context and a right context. The `keyword' and `length'
+ field allow full 8-bit characters keys, even including NULs. At other
+ places in this program, the name `keyafter' refers to the keyword
+ followed by its right context.
+
+ The left context does not extend, towards the beginning of the file,
+ further than a distance given by the `left' value. This value is
+ relative to the keyword beginning, it is usually negative. This
+ insures that, except for white space, we will never have to backward
+ scan the source text, when it is time to generate the final output
+ lines.
+
+ The right context, indirectly attainable through the keyword end, does
+ not extend, towards the end of the file, further than a distance given
+ by the `right' value. This value is relative to the keyword
+ beginning, it is usually positive.
+
+ When automatic references are used, the `reference' value is the
+ overall line number in all input files read so far, in this case, it
+ is of type (int). When input references are used, the `reference'
+ value indicates the distance between the keyword beginning and the
+ start of the reference field, it is of type (DELTA) and usually
+ negative. */
+
+typedef short int DELTA; /* to hold displacement within one context */
+
+typedef struct
+ {
+ WORD key; /* description of the keyword */
+ DELTA left; /* distance to left context start */
+ DELTA right; /* distance to right context end */
+ int reference; /* reference descriptor */
+ }
+OCCURS;
+
+/* The various OCCURS tables are indexed by the language. But the time
+ being, there is no such multiple language support. */
+
+static OCCURS *occurs_table[1]; /* all words retained from the read text */
+static size_t occurs_alloc[1]; /* allocated size of occurs_table */
+static size_t number_of_occurs[1]; /* number of used slots in occurs_table */
+
+
+/* Communication among output routines. */
+
+/* Indicate if special output processing is requested for each character. */
+static char edited_flag[CHAR_SET_SIZE];
+
+static int half_line_width; /* half of line width, reference excluded */
+static int before_max_width; /* maximum width of before field */
+static int keyafter_max_width; /* maximum width of keyword-and-after field */
+static int truncation_string_length;/* length of string used to flag truncation */
+
+/* When context is limited by lines, wraparound may happen on final output:
+ the `head' pointer gives access to some supplementary left context which
+ will be seen at the end of the output line, the `tail' pointer gives
+ access to some supplementary right context which will be seen at the
+ beginning of the output line. */
+
+static BLOCK tail; /* tail field */
+static int tail_truncation; /* flag truncation after the tail field */
+
+static BLOCK before; /* before field */
+static int before_truncation; /* flag truncation before the before field */
+
+static BLOCK keyafter; /* keyword-and-after field */
+static int keyafter_truncation; /* flag truncation after the keyafter field */
+
+static BLOCK head; /* head field */
+static int head_truncation; /* flag truncation before the head field */
+
+static BLOCK reference; /* reference field for input reference mode */
+
+/* Miscellaneous routines. */
+
+/* Diagnose an error in the regular expression matcher. Then exit. */
+
+static void ATTRIBUTE_NORETURN
+matcher_error (void)
+{
+ error (0, errno, _("error in regular expression matcher"));
+ exit (EXIT_FAILURE);
+}
+
+/*------------------------------------------------------.
+| Duplicate string STRING, while evaluating \-escapes. |
+`------------------------------------------------------*/
+
+/* Loosely adapted from GNU sh-utils printf.c code. */
+
+static char *
+copy_unescaped_string (const char *string)
+{
+ char *result; /* allocated result */
+ char *cursor; /* cursor in result */
+ int value; /* value of \nnn escape */
+ int length; /* length of \nnn escape */
+
+ result = xmalloc (strlen (string) + 1);
+ cursor = result;
+
+ while (*string)
+ if (*string == '\\')
+ {
+ string++;
+ switch (*string)
+ {
+ case 'x': /* \xhhh escape, 3 chars maximum */
+ value = 0;
+ for (length = 0, string++;
+ length < 3 && isxdigit (to_uchar (*string));
+ length++, string++)
+ value = value * 16 + HEXTOBIN (*string);
+ if (length == 0)
+ {
+ *cursor++ = '\\';
+ *cursor++ = 'x';
+ }
+ else
+ *cursor++ = value;
+ break;
+
+ case '0': /* \0ooo escape, 3 chars maximum */
+ value = 0;
+ for (length = 0, string++;
+ length < 3 && ISODIGIT (*string);
+ length++, string++)
+ value = value * 8 + OCTTOBIN (*string);
+ *cursor++ = value;
+ break;
+
+ case 'a': /* alert */
+#if __STDC__
+ *cursor++ = '\a';
+#else
+ *cursor++ = 7;
+#endif
+ string++;
+ break;
+
+ case 'b': /* backspace */
+ *cursor++ = '\b';
+ string++;
+ break;
+
+ case 'c': /* cancel the rest of the output */
+ while (*string)
+ string++;
+ break;
+
+ case 'f': /* form feed */
+ *cursor++ = '\f';
+ string++;
+ break;
+
+ case 'n': /* new line */
+ *cursor++ = '\n';
+ string++;
+ break;
+
+ case 'r': /* carriage return */
+ *cursor++ = '\r';
+ string++;
+ break;
+
+ case 't': /* horizontal tab */
+ *cursor++ = '\t';
+ string++;
+ break;
+
+ case 'v': /* vertical tab */
+#if __STDC__
+ *cursor++ = '\v';
+#else
+ *cursor++ = 11;
+#endif
+ string++;
+ break;
+
+ default:
+ *cursor++ = '\\';
+ *cursor++ = *string++;
+ break;
+ }
+ }
+ else
+ *cursor++ = *string++;
+
+ *cursor = '\0';
+ return result;
+}
+
+/*--------------------------------------------------------------------------.
+| Compile the regex represented by REGEX, diagnose and abort if any error. |
+`--------------------------------------------------------------------------*/
+
+static void
+compile_regex (struct regex_data *regex)
+{
+ struct re_pattern_buffer *pattern = &regex->pattern;
+ char const *string = regex->string;
+ char const *message;
+
+ pattern->buffer = NULL;
+ pattern->allocated = 0;
+ pattern->fastmap = regex->fastmap;
+ pattern->translate = ignore_case ? folded_chars : NULL;
+
+ message = re_compile_pattern (string, strlen (string), pattern);
+ if (message)
+ error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
+
+ /* The fastmap should be compiled before `re_match'. The following
+ call is not mandatory, because `re_search' is always called sooner,
+ and it compiles the fastmap if this has not been done yet. */
+
+ re_compile_fastmap (pattern);
+}
+
+/*------------------------------------------------------------------------.
+| This will initialize various tables for pattern match and compiles some |
+| regexps. |
+`------------------------------------------------------------------------*/
+
+static void
+initialize_regex (void)
+{
+ int character; /* character value */
+
+ /* Initialize the case folding table. */
+
+ if (ignore_case)
+ for (character = 0; character < CHAR_SET_SIZE; character++)
+ folded_chars[character] = toupper (character);
+
+ /* Unless the user already provided a description of the end of line or
+ end of sentence sequence, select an end of line sequence to compile.
+ If the user provided an empty definition, thus disabling end of line
+ or sentence feature, make it NULL to speed up tests. If GNU
+ extensions are enabled, use end of sentence like in GNU emacs. If
+ disabled, use end of lines. */
+
+ if (context_regex.string)
+ {
+ if (!*context_regex.string)
+ context_regex.string = NULL;
+ }
+ else if (gnu_extensions & !input_reference)
+ context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
+ else
+ context_regex.string = "\n";
+
+ if (context_regex.string)
+ compile_regex (&context_regex);
+
+ /* If the user has already provided a non-empty regexp to describe
+ words, compile it. Else, unless this has already been done through
+ a user provided Break character file, construct a fastmap of
+ characters that may appear in a word. If GNU extensions enabled,
+ include only letters of the underlying character set. If disabled,
+ include almost everything, even punctuations; stop only on white
+ space. */
+
+ if (word_regex.string)
+ compile_regex (&word_regex);
+ else if (!break_file)
+ {
+ if (gnu_extensions)
+ {
+
+ /* Simulate \w+. */
+
+ for (character = 0; character < CHAR_SET_SIZE; character++)
+ word_fastmap[character] = !! isalpha (character);
+ }
+ else
+ {
+
+ /* Simulate [^ \t\n]+. */
+
+ memset (word_fastmap, 1, CHAR_SET_SIZE);
+ word_fastmap[' '] = 0;
+ word_fastmap['\t'] = 0;
+ word_fastmap['\n'] = 0;
+ }
+ }
+}
+
+/*------------------------------------------------------------------------.
+| This routine will attempt to swallow a whole file name FILE_NAME into a |
+| contiguous region of memory and return a description of it into BLOCK. |
+| Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
+| |
+| Previously, in some cases, white space compression was attempted while |
+| inputting text. This was defeating some regexps like default end of |
+| sentence, which checks for two consecutive spaces. If white space |
+| compression is ever reinstated, it should be in output routines. |
+`------------------------------------------------------------------------*/
+
+static void
+swallow_file_in_memory (const char *file_name, BLOCK *block)
+{
+ int file_handle; /* file descriptor number */
+ struct stat stat_block; /* stat block for file */
+ size_t allocated_length; /* allocated length of memory buffer */
+ size_t used_length; /* used length in memory buffer */
+ int read_length; /* number of character gotten on last read */
+
+ /* As special cases, a file name which is NULL or "-" indicates standard
+ input, which is already opened. In all other cases, open the file from
+ its name. */
+ bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
+ if (using_stdin)
+ file_handle = STDIN_FILENO;
+ else
+ if ((file_handle = open (file_name, O_RDONLY)) < 0)
+ error (EXIT_FAILURE, errno, "%s", file_name);
+
+ /* If the file is a plain, regular file, allocate the memory buffer all at
+ once and swallow the file in one blow. In other cases, read the file
+ repeatedly in smaller chunks until we have it all, reallocating memory
+ once in a while, as we go. */
+
+ if (fstat (file_handle, &stat_block) < 0)
+ error (EXIT_FAILURE, errno, "%s", file_name);
+
+ if (S_ISREG (stat_block.st_mode))
+ {
+ size_t in_memory_size;
+
+ block->start = xmalloc ((size_t) stat_block.st_size);
+
+ if ((in_memory_size = read (file_handle,
+ block->start, (size_t) stat_block.st_size))
+ != stat_block.st_size)
+ {
+#if MSDOS
+ /* On MSDOS, in memory size may be smaller than the file
+ size, because of end of line conversions. But it can
+ never be smaller than half the file size, because the
+ minimum is when all lines are empty and terminated by
+ CR+LF. */
+ if (in_memory_size != (size_t)-1
+ && in_memory_size >= stat_block.st_size / 2)
+ block->start = xrealloc (block->start, in_memory_size);
+ else
+#endif /* not MSDOS */
+
+ error (EXIT_FAILURE, errno, "%s", file_name);
+ }
+ block->end = block->start + in_memory_size;
+ }
+ else
+ {
+ block->start = xmalloc ((size_t) 1 << SWALLOW_REALLOC_LOG);
+ used_length = 0;
+ allocated_length = (1 << SWALLOW_REALLOC_LOG);
+
+ while (read_length = read (file_handle,
+ block->start + used_length,
+ allocated_length - used_length),
+ read_length > 0)
+ {
+ used_length += read_length;
+ if (used_length == allocated_length)
+ {
+ allocated_length += (1 << SWALLOW_REALLOC_LOG);
+ block->start
+ = xrealloc (block->start, allocated_length);
+ }
+ }
+
+ if (read_length < 0)
+ error (EXIT_FAILURE, errno, "%s", file_name);
+
+ block->end = block->start + used_length;
+ }
+
+ /* Close the file, but only if it was not the standard input. */
+
+ if (! using_stdin && close (file_handle) != 0)
+ error (EXIT_FAILURE, errno, "%s", file_name);
+}
+
+/* Sort and search routines. */
+
+/*--------------------------------------------------------------------------.
+| Compare two words, FIRST and SECOND, and return 0 if they are identical. |
+| Return less than 0 if the first word goes before the second; return |
+| greater than 0 if the first word goes after the second. |
+| |
+| If a word is indeed a prefix of the other, the shorter should go first. |
+`--------------------------------------------------------------------------*/
+
+static int
+compare_words (const void *void_first, const void *void_second)
+{
+#define first ((const WORD *) void_first)
+#define second ((const WORD *) void_second)
+ int length; /* minimum of two lengths */
+ int counter; /* cursor in words */
+ int value; /* value of comparison */
+
+ length = first->size < second->size ? first->size : second->size;
+
+ if (ignore_case)
+ {
+ for (counter = 0; counter < length; counter++)
+ {
+ value = (folded_chars [to_uchar (first->start[counter])]
+ - folded_chars [to_uchar (second->start[counter])]);
+ if (value != 0)
+ return value;
+ }
+ }
+ else
+ {
+ for (counter = 0; counter < length; counter++)
+ {
+ value = (to_uchar (first->start[counter])
+ - to_uchar (second->start[counter]));
+ if (value != 0)
+ return value;
+ }
+ }
+
+ return first->size - second->size;
+#undef first
+#undef second
+}
+
+/*-----------------------------------------------------------------------.
+| Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
+| go first. In case of a tie, preserve the original order through a |
+| pointer comparison. |
+`-----------------------------------------------------------------------*/
+
+static int
+compare_occurs (const void *void_first, const void *void_second)
+{
+#define first ((const OCCURS *) void_first)
+#define second ((const OCCURS *) void_second)
+ int value;
+
+ value = compare_words (&first->key, &second->key);
+ return value == 0 ? first->key.start - second->key.start : value;
+#undef first
+#undef second
+}
+
+/*------------------------------------------------------------.
+| Return !0 if WORD appears in TABLE. Uses a binary search. |
+`------------------------------------------------------------*/
+
+static int
+search_table (WORD *word, WORD_TABLE *table)
+{
+ int lowest; /* current lowest possible index */
+ int highest; /* current highest possible index */
+ int middle; /* current middle index */
+ int value; /* value from last comparison */
+
+ lowest = 0;
+ highest = table->length - 1;
+ while (lowest <= highest)
+ {
+ middle = (lowest + highest) / 2;
+ value = compare_words (word, table->start + middle);
+ if (value < 0)
+ highest = middle - 1;
+ else if (value > 0)
+ lowest = middle + 1;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+/*---------------------------------------------------------------------.
+| Sort the whole occurs table in memory. Presumably, `qsort' does not |
+| take intermediate copies or table elements, so the sort will be |
+| stabilized throughout the comparison routine. |
+`---------------------------------------------------------------------*/
+
+static void
+sort_found_occurs (void)
+{
+
+ /* Only one language for the time being. */
+
+ qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
+ compare_occurs);
+}
+
+/* Parameter files reading routines. */
+
+/*----------------------------------------------------------------------.
+| Read a file named FILE_NAME, containing a set of break characters. |
+| Build a content to the array word_fastmap in which all characters are |
+| allowed except those found in the file. Characters may be repeated. |
+`----------------------------------------------------------------------*/
+
+static void
+digest_break_file (const char *file_name)
+{
+ BLOCK file_contents; /* to receive a copy of the file */
+ char *cursor; /* cursor in file copy */
+
+ swallow_file_in_memory (file_name, &file_contents);
+
+ /* Make the fastmap and record the file contents in it. */
+
+ memset (word_fastmap, 1, CHAR_SET_SIZE);
+ for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
+ word_fastmap[to_uchar (*cursor)] = 0;
+
+ if (!gnu_extensions)
+ {
+
+ /* If GNU extensions are enabled, the only way to avoid newline as
+ a break character is to write all the break characters in the
+ file with no newline at all, not even at the end of the file.
+ If disabled, spaces, tabs and newlines are always considered as
+ break characters even if not included in the break file. */
+
+ word_fastmap[' '] = 0;
+ word_fastmap['\t'] = 0;
+ word_fastmap['\n'] = 0;
+ }
+
+ /* Return the space of the file, which is no more required. */
+
+ free (file_contents.start);
+}
+
+/*-----------------------------------------------------------------------.
+| Read a file named FILE_NAME, containing one word per line, then |
+| construct in TABLE a table of WORD descriptors for them. The routine |
+| swallows the whole file in memory; this is at the expense of space |
+| needed for newlines, which are useless; however, the reading is fast. |
+`-----------------------------------------------------------------------*/
+
+static void
+digest_word_file (const char *file_name, WORD_TABLE *table)
+{
+ BLOCK file_contents; /* to receive a copy of the file */
+ char *cursor; /* cursor in file copy */
+ char *word_start; /* start of the current word */
+
+ swallow_file_in_memory (file_name, &file_contents);
+
+ table->start = NULL;
+ table->alloc = 0;
+ table->length = 0;
+
+ /* Read the whole file. */
+
+ cursor = file_contents.start;
+ while (cursor < file_contents.end)
+ {
+
+ /* Read one line, and save the word in contains. */
+
+ word_start = cursor;
+ while (cursor < file_contents.end && *cursor != '\n')
+ cursor++;
+
+ /* Record the word in table if it is not empty. */
+
+ if (cursor > word_start)
+ {
+ if (table->length == table->alloc)
+ {
+ if ((SIZE_MAX / sizeof *table->start - 1) / 2 < table->alloc)
+ xalloc_die ();
+ table->alloc = table->alloc * 2 + 1;
+ table->start = xrealloc (table->start,
+ table->alloc * sizeof *table->start);
+ }
+
+ table->start[table->length].start = word_start;
+ table->start[table->length].size = cursor - word_start;
+ table->length++;
+ }
+
+ /* This test allows for an incomplete line at end of file. */
+
+ if (cursor < file_contents.end)
+ cursor++;
+ }
+
+ /* Finally, sort all the words read. */
+
+ qsort (table->start, table->length, sizeof table->start[0], compare_words);
+}
+
+/* Keyword recognition and selection. */
+
+/*----------------------------------------------------------------------.
+| For each keyword in the source text, constructs an OCCURS structure. |
+`----------------------------------------------------------------------*/
+
+static void
+find_occurs_in_text (void)
+{
+ char *cursor; /* for scanning the source text */
+ char *scan; /* for scanning the source text also */
+ char *line_start; /* start of the current input line */
+ char *line_scan; /* newlines scanned until this point */
+ int reference_length; /* length of reference in input mode */
+ WORD possible_key; /* possible key, to ease searches */
+ OCCURS *occurs_cursor; /* current OCCURS under construction */
+
+ char *context_start; /* start of left context */
+ char *context_end; /* end of right context */
+ char *word_start; /* start of word */
+ char *word_end; /* end of word */
+ char *next_context_start; /* next start of left context */
+
+ /* reference_length is always used within `if (input_reference)'.
+ However, GNU C diagnoses that it may be used uninitialized. The
+ following assignment is merely to shut it up. */
+
+ reference_length = 0;
+
+ /* Tracking where lines start is helpful for reference processing. In
+ auto reference mode, this allows counting lines. In input reference
+ mode, this permits finding the beginning of the references.
+
+ The first line begins with the file, skip immediately this very first
+ reference in input reference mode, to help further rejection any word
+ found inside it. Also, unconditionally assigning these variable has
+ the happy effect of shutting up lint. */
+
+ line_start = text_buffer.start;
+ line_scan = line_start;
+ if (input_reference)
+ {
+ SKIP_NON_WHITE (line_scan, text_buffer.end);
+ reference_length = line_scan - line_start;
+ SKIP_WHITE (line_scan, text_buffer.end);
+ }
+
+ /* Process the whole buffer, one line or one sentence at a time. */
+
+ for (cursor = text_buffer.start;
+ cursor < text_buffer.end;
+ cursor = next_context_start)
+ {
+
+ /* `context_start' gets initialized before the processing of each
+ line, or once for the whole buffer if no end of line or sentence
+ sequence separator. */
+
+ context_start = cursor;
+
+ /* If a end of line or end of sentence sequence is defined and
+ non-empty, `next_context_start' will be recomputed to be the end of
+ each line or sentence, before each one is processed. If no such
+ sequence, then `next_context_start' is set at the end of the whole
+ buffer, which is then considered to be a single line or sentence.
+ This test also accounts for the case of an incomplete line or
+ sentence at the end of the buffer. */
+
+ next_context_start = text_buffer.end;
+ if (context_regex.string)
+ switch (re_search (&context_regex.pattern, cursor,
+ text_buffer.end - cursor,
+ 0, text_buffer.end - cursor, &context_regs))
+ {
+ case -2:
+ matcher_error ();
+
+ case -1:
+ break;
+
+ default:
+ next_context_start = cursor + context_regs.end[0];
+ break;
+ }
+
+ /* Include the separator into the right context, but not any suffix
+ white space in this separator; this insures it will be seen in
+ output and will not take more space than necessary. */
+
+ context_end = next_context_start;
+ SKIP_WHITE_BACKWARDS (context_end, context_start);
+
+ /* Read and process a single input line or sentence, one word at a
+ time. */
+
+ while (1)
+ {
+ if (word_regex.string)
+
+ /* If a word regexp has been compiled, use it to skip at the
+ beginning of the next word. If there is no such word, exit
+ the loop. */
+
+ {
+ regoff_t r = re_search (&word_regex.pattern, cursor,
+ context_end - cursor,
+ 0, context_end - cursor, &word_regs);
+ if (r == -2)
+ matcher_error ();
+ if (r == -1)
+ break;
+ word_start = cursor + word_regs.start[0];
+ word_end = cursor + word_regs.end[0];
+ }
+ else
+
+ /* Avoid re_search and use the fastmap to skip to the
+ beginning of the next word. If there is no more word in
+ the buffer, exit the loop. */
+
+ {
+ scan = cursor;
+ while (scan < context_end
+ && !word_fastmap[to_uchar (*scan)])
+ scan++;
+
+ if (scan == context_end)
+ break;
+
+ word_start = scan;
+
+ while (scan < context_end
+ && word_fastmap[to_uchar (*scan)])
+ scan++;
+
+ word_end = scan;
+ }
+
+ /* Skip right to the beginning of the found word. */
+
+ cursor = word_start;
+
+ /* Skip any zero length word. Just advance a single position,
+ then go fetch the next word. */
+
+ if (word_end == word_start)
+ {
+ cursor++;
+ continue;
+ }
+
+ /* This is a genuine, non empty word, so save it as a possible
+ key. Then skip over it. Also, maintain the maximum length of
+ all words read so far. It is mandatory to take the maximum
+ length of all words in the file, without considering if they
+ are actually kept or rejected, because backward jumps at output
+ generation time may fall in *any* word. */
+
+ possible_key.start = cursor;
+ possible_key.size = word_end - word_start;
+ cursor += possible_key.size;
+
+ if (possible_key.size > maximum_word_length)
+ maximum_word_length = possible_key.size;
+
+ /* In input reference mode, update `line_start' from its previous
+ value. Count the lines just in case auto reference mode is
+ also selected. If it happens that the word just matched is
+ indeed part of a reference; just ignore it. */
+
+ if (input_reference)
+ {
+ while (line_scan < possible_key.start)
+ if (*line_scan == '\n')
+ {
+ total_line_count++;
+ line_scan++;
+ line_start = line_scan;
+ SKIP_NON_WHITE (line_scan, text_buffer.end);
+ reference_length = line_scan - line_start;
+ }
+ else
+ line_scan++;
+ if (line_scan > possible_key.start)
+ continue;
+ }
+
+ /* Ignore the word if an `Ignore words' table exists and if it is
+ part of it. Also ignore the word if an `Only words' table and
+ if it is *not* part of it.
+
+ It is allowed that both tables be used at once, even if this
+ may look strange for now. Just ignore a word that would appear
+ in both. If regexps are eventually implemented for these
+ tables, the Ignore table could then reject words that would
+ have been previously accepted by the Only table. */
+
+ if (ignore_file && search_table (&possible_key, &ignore_table))
+ continue;
+ if (only_file && !search_table (&possible_key, &only_table))
+ continue;
+
+ /* A non-empty word has been found. First of all, insure
+ proper allocation of the next OCCURS, and make a pointer to
+ where it will be constructed. */
+
+ if (number_of_occurs[0] == occurs_alloc[0])
+ {
+ if ((SIZE_MAX / sizeof *occurs_table[0] - 1) / 2
+ < occurs_alloc[0])
+ xalloc_die ();
+ occurs_alloc[0] = occurs_alloc[0] * 2 + 1;
+ occurs_table[0] = xrealloc (occurs_table[0],
+ occurs_alloc[0] * sizeof *occurs_table[0]);
+ }
+
+ occurs_cursor = occurs_table[0] + number_of_occurs[0];
+
+ /* Define the refence field, if any. */
+
+ if (auto_reference)
+ {
+
+ /* While auto referencing, update `line_start' from its
+ previous value, counting lines as we go. If input
+ referencing at the same time, `line_start' has been
+ advanced earlier, and the following loop is never really
+ executed. */
+
+ while (line_scan < possible_key.start)
+ if (*line_scan == '\n')
+ {
+ total_line_count++;
+ line_scan++;
+ line_start = line_scan;
+ SKIP_NON_WHITE (line_scan, text_buffer.end);
+ }
+ else
+ line_scan++;
+
+ occurs_cursor->reference = total_line_count;
+ }
+ else if (input_reference)
+ {
+
+ /* If only input referencing, `line_start' has been computed
+ earlier to detect the case the word matched would be part
+ of the reference. The reference position is simply the
+ value of `line_start'. */
+
+ occurs_cursor->reference
+ = (DELTA) (line_start - possible_key.start);
+ if (reference_length > reference_max_width)
+ reference_max_width = reference_length;
+ }
+
+ /* Exclude the reference from the context in simple cases. */
+
+ if (input_reference && line_start == context_start)
+ {
+ SKIP_NON_WHITE (context_start, context_end);
+ SKIP_WHITE (context_start, context_end);
+ }
+
+ /* Completes the OCCURS structure. */
+
+ occurs_cursor->key = possible_key;
+ occurs_cursor->left = context_start - possible_key.start;
+ occurs_cursor->right = context_end - possible_key.start;
+
+ number_of_occurs[0]++;
+ }
+ }
+}
+
+/* Formatting and actual output - service routines. */
+
+/*-----------------------------------------.
+| Prints some NUMBER of spaces on stdout. |
+`-----------------------------------------*/
+
+static void
+print_spaces (int number)
+{
+ int counter;
+
+ for (counter = number; counter > 0; counter--)
+ putchar (' ');
+}
+
+/*-------------------------------------.
+| Prints the field provided by FIELD. |
+`-------------------------------------*/
+
+static void
+print_field (BLOCK field)
+{
+ char *cursor; /* Cursor in field to print */
+ int base; /* Base character, without diacritic */
+ int diacritic; /* Diacritic code for the character */
+
+ /* Whitespace is not really compressed. Instead, each white space
+ character (tab, vt, ht etc.) is printed as one single space. */
+
+ for (cursor = field.start; cursor < field.end; cursor++)
+ {
+ unsigned char character = *cursor;
+ if (edited_flag[character])
+ {
+
+ /* First check if this is a diacriticized character.
+
+ This works only for TeX. I do not know how diacriticized
+ letters work with `roff'. Please someone explain it to me! */
+
+ diacritic = todiac (character);
+ if (diacritic != 0 && output_format == TEX_FORMAT)
+ {
+ base = tobase (character);
+ switch (diacritic)
+ {
+
+ case 1: /* Latin diphthongs */
+ switch (base)
+ {
+ case 'o':
+ fputs ("\\oe{}", stdout);
+ break;
+
+ case 'O':
+ fputs ("\\OE{}", stdout);
+ break;
+
+ case 'a':
+ fputs ("\\ae{}", stdout);
+ break;
+
+ case 'A':
+ fputs ("\\AE{}", stdout);
+ break;
+
+ default:
+ putchar (' ');
+ }
+ break;
+
+ case 2: /* Acute accent */
+ printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
+ break;
+
+ case 3: /* Grave accent */
+ printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
+ break;
+
+ case 4: /* Circumflex accent */
+ printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
+ break;
+
+ case 5: /* Diaeresis */
+ printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
+ break;
+
+ case 6: /* Tilde accent */
+ printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
+ break;
+
+ case 7: /* Cedilla */
+ printf ("\\c{%c}", base);
+ break;
+
+ case 8: /* Small circle beneath */
+ switch (base)
+ {
+ case 'a':
+ fputs ("\\aa{}", stdout);
+ break;
+
+ case 'A':
+ fputs ("\\AA{}", stdout);
+ break;
+
+ default:
+ putchar (' ');
+ }
+ break;
+
+ case 9: /* Strike through */
+ switch (base)
+ {
+ case 'o':
+ fputs ("\\o{}", stdout);
+ break;
+
+ case 'O':
+ fputs ("\\O{}", stdout);
+ break;
+
+ default:
+ putchar (' ');
+ }
+ break;
+ }
+ }
+ else
+
+ /* This is not a diacritic character, so handle cases which are
+ really specific to `roff' or TeX. All white space processing
+ is done as the default case of this switch. */
+
+ switch (character)
+ {
+ case '"':
+ /* In roff output format, double any quote. */
+ putchar ('"');
+ putchar ('"');
+ break;
+
+ case '$':
+ case '%':
+ case '&':
+ case '#':
+ case '_':
+ /* In TeX output format, precede these with a backslash. */
+ putchar ('\\');
+ putchar (character);
+ break;
+
+ case '{':
+ case '}':
+ /* In TeX output format, precede these with a backslash and
+ force mathematical mode. */
+ printf ("$\\%c$", character);
+ break;
+
+ case '\\':
+ /* In TeX output mode, request production of a backslash. */
+ fputs ("\\backslash{}", stdout);
+ break;
+
+ default:
+ /* Any other flagged character produces a single space. */
+ putchar (' ');
+ }
+ }
+ else
+ putchar (*cursor);
+ }
+}
+
+/* Formatting and actual output - planning routines. */
+
+/*--------------------------------------------------------------------.
+| From information collected from command line options and input file |
+| readings, compute and fix some output parameter values. |
+`--------------------------------------------------------------------*/
+
+static void
+fix_output_parameters (void)
+{
+ int file_index; /* index in text input file arrays */
+ int line_ordinal; /* line ordinal value for reference */
+ char ordinal_string[12]; /* edited line ordinal for reference */
+ int reference_width; /* width for the whole reference */
+ int character; /* character ordinal */
+ const char *cursor; /* cursor in some constant strings */
+
+ /* In auto reference mode, the maximum width of this field is
+ precomputed and subtracted from the overall line width. Add one for
+ the column which separate the file name from the line number. */
+
+ if (auto_reference)
+ {
+ reference_max_width = 0;
+ for (file_index = 0; file_index < number_input_files; file_index++)
+ {
+ line_ordinal = file_line_count[file_index] + 1;
+ if (file_index > 0)
+ line_ordinal -= file_line_count[file_index - 1];
+ sprintf (ordinal_string, "%d", line_ordinal);
+ reference_width = strlen (ordinal_string);
+ if (input_file_name[file_index])
+ reference_width += strlen (input_file_name[file_index]);
+ if (reference_width > reference_max_width)
+ reference_max_width = reference_width;
+ }
+ reference_max_width++;
+ reference.start = xmalloc ((size_t) reference_max_width + 1);
+ }
+
+ /* If the reference appears to the left of the output line, reserve some
+ space for it right away, including one gap size. */
+
+ if ((auto_reference | input_reference) & !right_reference)
+ line_width -= reference_max_width + gap_size;
+
+ /* The output lines, minimally, will contain from left to right a left
+ context, a gap, and a keyword followed by the right context with no
+ special intervening gap. Half of the line width is dedicated to the
+ left context and the gap, the other half is dedicated to the keyword
+ and the right context; these values are computed once and for all here.
+ There also are tail and head wrap around fields, used when the keyword
+ is near the beginning or the end of the line, or when some long word
+ cannot fit in, but leave place from wrapped around shorter words. The
+ maximum width of these fields are recomputed separately for each line,
+ on a case by case basis. It is worth noting that it cannot happen that
+ both the tail and head fields are used at once. */
+
+ half_line_width = line_width / 2;
+ before_max_width = half_line_width - gap_size;
+ keyafter_max_width = half_line_width;
+
+ /* If truncation_string is the empty string, make it NULL to speed up
+ tests. In this case, truncation_string_length will never get used, so
+ there is no need to set it. */
+
+ if (truncation_string && *truncation_string)
+ truncation_string_length = strlen (truncation_string);
+ else
+ truncation_string = NULL;
+
+ if (gnu_extensions)
+ {
+
+ /* When flagging truncation at the left of the keyword, the
+ truncation mark goes at the beginning of the before field,
+ unless there is a head field, in which case the mark goes at the
+ left of the head field. When flagging truncation at the right
+ of the keyword, the mark goes at the end of the keyafter field,
+ unless there is a tail field, in which case the mark goes at the
+ end of the tail field. Only eight combination cases could arise
+ for truncation marks:
+
+ . None.
+ . One beginning the before field.
+ . One beginning the head field.
+ . One ending the keyafter field.
+ . One ending the tail field.
+ . One beginning the before field, another ending the keyafter field.
+ . One ending the tail field, another beginning the before field.
+ . One ending the keyafter field, another beginning the head field.
+
+ So, there is at most two truncation marks, which could appear both
+ on the left side of the center of the output line, both on the
+ right side, or one on either side. */
+
+ before_max_width -= 2 * truncation_string_length;
+ keyafter_max_width -= 2 * truncation_string_length;
+ }
+ else
+ {
+
+ /* I never figured out exactly how UNIX' ptx plans the output width
+ of its various fields. If GNU extensions are disabled, do not
+ try computing the field widths correctly; instead, use the
+ following formula, which does not completely imitate UNIX' ptx,
+ but almost. */
+
+ keyafter_max_width -= 2 * truncation_string_length + 1;
+ }
+
+ /* Compute which characters need special output processing. Initialize
+ by flagging any white space character. Some systems do not consider
+ form feed as a space character, but we do. */
+
+ for (character = 0; character < CHAR_SET_SIZE; character++)
+ edited_flag[character] = !! isspace (character);
+ edited_flag['\f'] = 1;
+
+ /* Complete the special character flagging according to selected output
+ format. */
+
+ switch (output_format)
+ {
+ case UNKNOWN_FORMAT:
+ /* Should never happen. */
+
+ case DUMB_FORMAT:
+ break;
+
+ case ROFF_FORMAT:
+
+ /* `Quote' characters should be doubled. */
+
+ edited_flag['"'] = 1;
+ break;
+
+ case TEX_FORMAT:
+
+ /* Various characters need special processing. */
+
+ for (cursor = "$%&#_{}\\"; *cursor; cursor++)
+ edited_flag[to_uchar (*cursor)] = 1;
+
+ /* Any character with 8th bit set will print to a single space, unless
+ it is diacriticized. */
+
+ for (character = 0200; character < CHAR_SET_SIZE; character++)
+ edited_flag[character] = todiac (character) != 0;
+ break;
+ }
+}
+
+/*------------------------------------------------------------------.
+| Compute the position and length of all the output fields, given a |
+| pointer to some OCCURS. |
+`------------------------------------------------------------------*/
+
+static void
+define_all_fields (OCCURS *occurs)
+{
+ int tail_max_width; /* allowable width of tail field */
+ int head_max_width; /* allowable width of head field */
+ char *cursor; /* running cursor in source text */
+ char *left_context_start; /* start of left context */
+ char *right_context_end; /* end of right context */
+ char *left_field_start; /* conservative start for `head'/`before' */
+ int file_index; /* index in text input file arrays */
+ const char *file_name; /* file name for reference */
+ int line_ordinal; /* line ordinal for reference */
+
+ /* Define `keyafter', start of left context and end of right context.
+ `keyafter' starts at the saved position for keyword and extend to the
+ right from the end of the keyword, eating separators or full words, but
+ not beyond maximum allowed width for `keyafter' field or limit for the
+ right context. Suffix spaces will be removed afterwards. */
+
+ keyafter.start = occurs->key.start;
+ keyafter.end = keyafter.start + occurs->key.size;
+ left_context_start = keyafter.start + occurs->left;
+ right_context_end = keyafter.start + occurs->right;
+
+ cursor = keyafter.end;
+ while (cursor < right_context_end
+ && cursor <= keyafter.start + keyafter_max_width)
+ {
+ keyafter.end = cursor;
+ SKIP_SOMETHING (cursor, right_context_end);
+ }
+ if (cursor <= keyafter.start + keyafter_max_width)
+ keyafter.end = cursor;
+
+ keyafter_truncation = truncation_string && keyafter.end < right_context_end;
+
+ SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
+
+ /* When the left context is wide, it might take some time to catch up from
+ the left context boundary to the beginning of the `head' or `before'
+ fields. So, in this case, to speed the catchup, we jump back from the
+ keyword, using some secure distance, possibly falling in the middle of
+ a word. A secure backward jump would be at least half the maximum
+ width of a line, plus the size of the longest word met in the whole
+ input. We conclude this backward jump by a skip forward of at least
+ one word. In this manner, we should not inadvertently accept only part
+ of a word. From the reached point, when it will be time to fix the
+ beginning of `head' or `before' fields, we will skip forward words or
+ delimiters until we get sufficiently near. */
+
+ if (-occurs->left > half_line_width + maximum_word_length)
+ {
+ left_field_start
+ = keyafter.start - (half_line_width + maximum_word_length);
+ SKIP_SOMETHING (left_field_start, keyafter.start);
+ }
+ else
+ left_field_start = keyafter.start + occurs->left;
+
+ /* `before' certainly ends at the keyword, but not including separating
+ spaces. It starts after than the saved value for the left context, by
+ advancing it until it falls inside the maximum allowed width for the
+ before field. There will be no prefix spaces either. `before' only
+ advances by skipping single separators or whole words. */
+
+ before.start = left_field_start;
+ before.end = keyafter.start;
+ SKIP_WHITE_BACKWARDS (before.end, before.start);
+
+ while (before.start + before_max_width < before.end)
+ SKIP_SOMETHING (before.start, before.end);
+
+ if (truncation_string)
+ {
+ cursor = before.start;
+ SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
+ before_truncation = cursor > left_context_start;
+ }
+ else
+ before_truncation = 0;
+
+ SKIP_WHITE (before.start, text_buffer.end);
+
+ /* The tail could not take more columns than what has been left in the
+ left context field, and a gap is mandatory. It starts after the
+ right context, and does not contain prefixed spaces. It ends at
+ the end of line, the end of buffer or when the tail field is full,
+ whichever comes first. It cannot contain only part of a word, and
+ has no suffixed spaces. */
+
+ tail_max_width
+ = before_max_width - (before.end - before.start) - gap_size;
+
+ if (tail_max_width > 0)
+ {
+ tail.start = keyafter.end;
+ SKIP_WHITE (tail.start, text_buffer.end);
+
+ tail.end = tail.start;
+ cursor = tail.end;
+ while (cursor < right_context_end
+ && cursor < tail.start + tail_max_width)
+ {
+ tail.end = cursor;
+ SKIP_SOMETHING (cursor, right_context_end);
+ }
+
+ if (cursor < tail.start + tail_max_width)
+ tail.end = cursor;
+
+ if (tail.end > tail.start)
+ {
+ keyafter_truncation = 0;
+ tail_truncation = truncation_string && tail.end < right_context_end;
+ }
+ else
+ tail_truncation = 0;
+
+ SKIP_WHITE_BACKWARDS (tail.end, tail.start);
+ }
+ else
+ {
+
+ /* No place left for a tail field. */
+
+ tail.start = NULL;
+ tail.end = NULL;
+ tail_truncation = 0;
+ }
+
+ /* `head' could not take more columns than what has been left in the right
+ context field, and a gap is mandatory. It ends before the left
+ context, and does not contain suffixed spaces. Its pointer is advanced
+ until the head field has shrunk to its allowed width. It cannot
+ contain only part of a word, and has no suffixed spaces. */
+
+ head_max_width
+ = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
+
+ if (head_max_width > 0)
+ {
+ head.end = before.start;
+ SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
+
+ head.start = left_field_start;
+ while (head.start + head_max_width < head.end)
+ SKIP_SOMETHING (head.start, head.end);
+
+ if (head.end > head.start)
+ {
+ before_truncation = 0;
+ head_truncation = (truncation_string
+ && head.start > left_context_start);
+ }
+ else
+ head_truncation = 0;
+
+ SKIP_WHITE (head.start, head.end);
+ }
+ else
+ {
+
+ /* No place left for a head field. */
+
+ head.start = NULL;
+ head.end = NULL;
+ head_truncation = 0;
+ }
+
+ if (auto_reference)
+ {
+
+ /* Construct the reference text in preallocated space from the file
+ name and the line number. Find out in which file the reference
+ occurred. Standard input yields an empty file name. Insure line
+ numbers are one based, even if they are computed zero based. */
+
+ file_index = 0;
+ while (file_line_count[file_index] < occurs->reference)
+ file_index++;
+
+ file_name = input_file_name[file_index];
+ if (!file_name)
+ file_name = "";
+
+ line_ordinal = occurs->reference + 1;
+ if (file_index > 0)
+ line_ordinal -= file_line_count[file_index - 1];
+
+ sprintf (reference.start, "%s:%d", file_name, line_ordinal);
+ reference.end = reference.start + strlen (reference.start);
+ }
+ else if (input_reference)
+ {
+
+ /* Reference starts at saved position for reference and extends right
+ until some white space is met. */
+
+ reference.start = keyafter.start + (DELTA) occurs->reference;
+ reference.end = reference.start;
+ SKIP_NON_WHITE (reference.end, right_context_end);
+ }
+}
+
+/* Formatting and actual output - control routines. */
+
+/*----------------------------------------------------------------------.
+| Output the current output fields as one line for `troff' or `nroff'. |
+`----------------------------------------------------------------------*/
+
+static void
+output_one_roff_line (void)
+{
+ /* Output the `tail' field. */
+
+ printf (".%s \"", macro_name);
+ print_field (tail);
+ if (tail_truncation)
+ fputs (truncation_string, stdout);
+ putchar ('"');
+
+ /* Output the `before' field. */
+
+ fputs (" \"", stdout);
+ if (before_truncation)
+ fputs (truncation_string, stdout);
+ print_field (before);
+ putchar ('"');
+
+ /* Output the `keyafter' field. */
+
+ fputs (" \"", stdout);
+ print_field (keyafter);
+ if (keyafter_truncation)
+ fputs (truncation_string, stdout);
+ putchar ('"');
+
+ /* Output the `head' field. */
+
+ fputs (" \"", stdout);
+ if (head_truncation)
+ fputs (truncation_string, stdout);
+ print_field (head);
+ putchar ('"');
+
+ /* Conditionally output the `reference' field. */
+
+ if (auto_reference | input_reference)
+ {
+ fputs (" \"", stdout);
+ print_field (reference);
+ putchar ('"');
+ }
+
+ putchar ('\n');
+}
+
+/*---------------------------------------------------------.
+| Output the current output fields as one line for `TeX'. |
+`---------------------------------------------------------*/
+
+static void
+output_one_tex_line (void)
+{
+ BLOCK key; /* key field, isolated */
+ BLOCK after; /* after field, isolated */
+ char *cursor; /* running cursor in source text */
+
+ printf ("\\%s ", macro_name);
+ putchar ('{');
+ print_field (tail);
+ fputs ("}{", stdout);
+ print_field (before);
+ fputs ("}{", stdout);
+ key.start = keyafter.start;
+ after.end = keyafter.end;
+ cursor = keyafter.start;
+ SKIP_SOMETHING (cursor, keyafter.end);
+ key.end = cursor;
+ after.start = cursor;
+ print_field (key);
+ fputs ("}{", stdout);
+ print_field (after);
+ fputs ("}{", stdout);
+ print_field (head);
+ putchar ('}');
+ if (auto_reference | input_reference)
+ {
+ putchar ('{');
+ print_field (reference);
+ putchar ('}');
+ }
+ putchar ('\n');
+}
+
+/*-------------------------------------------------------------------.
+| Output the current output fields as one line for a dumb terminal. |
+`-------------------------------------------------------------------*/
+
+static void
+output_one_dumb_line (void)
+{
+ if (!right_reference)
+ {
+ if (auto_reference)
+ {
+
+ /* Output the `reference' field, in such a way that GNU emacs
+ next-error will handle it. The ending colon is taken from the
+ gap which follows. */
+
+ print_field (reference);
+ putchar (':');
+ print_spaces (reference_max_width
+ + gap_size
+ - (reference.end - reference.start)
+ - 1);
+ }
+ else
+ {
+
+ /* Output the `reference' field and its following gap. */
+
+ print_field (reference);
+ print_spaces (reference_max_width
+ + gap_size
+ - (reference.end - reference.start));
+ }
+ }
+
+ if (tail.start < tail.end)
+ {
+ /* Output the `tail' field. */
+
+ print_field (tail);
+ if (tail_truncation)
+ fputs (truncation_string, stdout);
+
+ print_spaces (half_line_width - gap_size
+ - (before.end - before.start)
+ - (before_truncation ? truncation_string_length : 0)
+ - (tail.end - tail.start)
+ - (tail_truncation ? truncation_string_length : 0));
+ }
+ else
+ print_spaces (half_line_width - gap_size
+ - (before.end - before.start)
+ - (before_truncation ? truncation_string_length : 0));
+
+ /* Output the `before' field. */
+
+ if (before_truncation)
+ fputs (truncation_string, stdout);
+ print_field (before);
+
+ print_spaces (gap_size);
+
+ /* Output the `keyafter' field. */
+
+ print_field (keyafter);
+ if (keyafter_truncation)
+ fputs (truncation_string, stdout);
+
+ if (head.start < head.end)
+ {
+ /* Output the `head' field. */
+
+ print_spaces (half_line_width
+ - (keyafter.end - keyafter.start)
+ - (keyafter_truncation ? truncation_string_length : 0)
+ - (head.end - head.start)
+ - (head_truncation ? truncation_string_length : 0));
+ if (head_truncation)
+ fputs (truncation_string, stdout);
+ print_field (head);
+ }
+ else
+
+ if ((auto_reference | input_reference) & right_reference)
+ print_spaces (half_line_width
+ - (keyafter.end - keyafter.start)
+ - (keyafter_truncation ? truncation_string_length : 0));
+
+ if ((auto_reference | input_reference) & right_reference)
+ {
+ /* Output the `reference' field. */
+
+ print_spaces (gap_size);
+ print_field (reference);
+ }
+
+ putchar ('\n');
+}
+
+/*------------------------------------------------------------------------.
+| Scan the whole occurs table and, for each entry, output one line in the |
+| appropriate format. |
+`------------------------------------------------------------------------*/
+
+static void
+generate_all_output (void)
+{
+ size_t occurs_index; /* index of keyword entry being processed */
+ OCCURS *occurs_cursor; /* current keyword entry being processed */
+
+ /* The following assignments are useful to provide default values in case
+ line contexts or references are not used, in which case these variables
+ would never be computed. */
+
+ tail.start = NULL;
+ tail.end = NULL;
+ tail_truncation = 0;
+
+ head.start = NULL;
+ head.end = NULL;
+ head_truncation = 0;
+
+ /* Loop over all keyword occurrences. */
+
+ occurs_cursor = occurs_table[0];
+
+ for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
+ {
+ /* Compute the exact size of every field and whenever truncation flags
+ are present or not. */
+
+ define_all_fields (occurs_cursor);
+
+ /* Produce one output line according to selected format. */
+
+ switch (output_format)
+ {
+ case UNKNOWN_FORMAT:
+ /* Should never happen. */
+
+ case DUMB_FORMAT:
+ output_one_dumb_line ();
+ break;
+
+ case ROFF_FORMAT:
+ output_one_roff_line ();
+ break;
+
+ case TEX_FORMAT:
+ output_one_tex_line ();
+ break;
+ }
+
+ /* Advance the cursor into the occurs table. */
+
+ occurs_cursor++;
+ }
+}
+
+/* Option decoding and main program. */
+
+/*------------------------------------------------------.
+| Print program identification and options, then exit. |
+`------------------------------------------------------*/
+
+void
+usage (int status)
+{
+ if (status != EXIT_SUCCESS)
+ fprintf (stderr, _("Try `%s --help' for more information.\n"),
+ program_name);
+ else
+ {
+ printf (_("\
+Usage: %s [OPTION]... [INPUT]... (without -G)\n\
+ or: %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
+ program_name, program_name);
+ fputs (_("\
+Output a permuted index, including context, of the words in the input files.\n\
+\n\
+"), stdout);
+ fputs (_("\
+Mandatory arguments to long options are mandatory for short options too.\n\
+"), stdout);
+ fputs (_("\
+ -A, --auto-reference output automatically generated references\n\
+ -G, --traditional behave more like System V `ptx'\n\
+ -F, --flag-truncation=STRING use STRING for flagging line truncations\n\
+"), stdout);
+ fputs (_("\
+ -M, --macro-name=STRING macro name to use instead of `xx'\n\
+ -O, --format=roff generate output as roff directives\n\
+ -R, --right-side-refs put references at right, not counted in -w\n\
+ -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
+ -T, --format=tex generate output as TeX directives\n\
+"), stdout);
+ fputs (_("\
+ -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
+ -b, --break-file=FILE word break characters in this FILE\n\
+ -f, --ignore-case fold lower case to upper case for sorting\n\
+ -g, --gap-size=NUMBER gap size in columns between output fields\n\
+ -i, --ignore-file=FILE read ignore word list from FILE\n\
+ -o, --only-file=FILE read only word list from this FILE\n\
+"), stdout);
+ fputs (_("\
+ -r, --references first field of each line is a reference\n\
+ -t, --typeset-mode - not implemented -\n\
+ -w, --width=NUMBER output width in columns, reference excluded\n\
+"), stdout);
+ fputs (HELP_OPTION_DESCRIPTION, stdout);
+ fputs (VERSION_OPTION_DESCRIPTION, stdout);
+ fputs (_("\
+\n\
+With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n\
+"), stdout);
+ printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
+ }
+ exit (status);
+}
+
+/*----------------------------------------------------------------------.
+| Main program. Decode ARGC arguments passed through the ARGV array of |
+| strings, then launch execution. |
+`----------------------------------------------------------------------*/
+
+/* Long options equivalences. */
+static const struct option long_options[] =
+{
+ {"auto-reference", no_argument, NULL, 'A'},
+ {"break-file", required_argument, NULL, 'b'},
+ {"copyright", no_argument, NULL, 'C'}, /* Deprecated, remove in 2007. */
+ {"flag-truncation", required_argument, NULL, 'F'},
+ {"ignore-case", no_argument, NULL, 'f'},
+ {"gap-size", required_argument, NULL, 'g'},
+ {"ignore-file", required_argument, NULL, 'i'},
+ {"macro-name", required_argument, NULL, 'M'},
+ {"only-file", required_argument, NULL, 'o'},
+ {"references", no_argument, NULL, 'r'},
+ {"right-side-refs", no_argument, NULL, 'R'},
+ {"format", required_argument, NULL, 10},
+ {"sentence-regexp", required_argument, NULL, 'S'},
+ {"traditional", no_argument, NULL, 'G'},
+ {"typeset-mode", no_argument, NULL, 't'},
+ {"width", required_argument, NULL, 'w'},
+ {"word-regexp", required_argument, NULL, 'W'},
+ {GETOPT_HELP_OPTION_DECL},
+ {GETOPT_VERSION_OPTION_DECL},
+ {NULL, 0, NULL, 0},
+};
+
+static char const* const format_args[] =
+{
+ "roff", "tex", NULL
+};
+
+static enum Format const format_vals[] =
+{
+ ROFF_FORMAT, TEX_FORMAT
+};
+
+int
+main (int argc, char **argv)
+{
+ int optchar; /* argument character */
+ int file_index; /* index in text input file arrays */
+
+ /* Decode program options. */
+
+ initialize_main (&argc, &argv);
+ program_name = argv[0];
+ setlocale (LC_ALL, "");
+ bindtextdomain (PACKAGE, LOCALEDIR);
+ textdomain (PACKAGE);
+
+ atexit (close_stdout);
+
+#if HAVE_SETCHRCLASS
+ setchrclass (NULL);
+#endif
+
+ while (optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:",
+ long_options, NULL),
+ optchar != EOF)
+ {
+ switch (optchar)
+ {
+ default:
+ usage (EXIT_FAILURE);
+
+ case 'G':
+ gnu_extensions = false;
+ break;
+
+ case 'b':
+ break_file = optarg;
+ break;
+
+ case 'f':
+ ignore_case = true;
+ break;
+
+ case 'g':
+ {
+ unsigned long int tmp_ulong;
+ if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
+ || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
+ error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
+ quotearg (optarg));
+ gap_size = tmp_ulong;
+ break;
+ }
+
+ case 'i':
+ ignore_file = optarg;
+ break;
+
+ case 'o':
+ only_file = optarg;
+ break;
+
+ case 'r':
+ input_reference = true;
+ break;
+
+ case 't':
+ /* Yet to understand... */
+ break;
+
+ case 'w':
+ {
+ unsigned long int tmp_ulong;
+ if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
+ || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
+ error (EXIT_FAILURE, 0, _("invalid line width: %s"),
+ quotearg (optarg));
+ line_width = tmp_ulong;
+ break;
+ }
+
+ case 'A':
+ auto_reference = true;
+ break;
+
+ case 'F':
+ truncation_string = copy_unescaped_string (optarg);
+ break;
+
+ case 'M':
+ macro_name = optarg;
+ break;
+
+ case 'O':
+ output_format = ROFF_FORMAT;
+ break;
+
+ case 'R':
+ right_reference = true;
+ break;
+
+ case 'S':
+ context_regex.string = copy_unescaped_string (optarg);
+ break;
+
+ case 'T':
+ output_format = TEX_FORMAT;
+ break;
+
+ case 'W':
+ word_regex.string = copy_unescaped_string (optarg);
+ if (!*word_regex.string)
+ word_regex.string = NULL;
+ break;
+
+ case 10:
+ output_format = XARGMATCH ("--format", optarg,
+ format_args, format_vals);
+ case_GETOPT_HELP_CHAR;
+
+ case 'C':
+ error (0, 0, _("\
+the --copyright option is deprecated; use --version instead"));
+ /* fallthrough */
+
+ case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
+ }
+ }
+
+ /* Process remaining arguments. If GNU extensions are enabled, process
+ all arguments as input parameters. If disabled, accept at most two
+ arguments, the second of which is an output parameter. */
+
+ if (optind == argc)
+ {
+
+ /* No more argument simply means: read standard input. */
+
+ input_file_name = xmalloc (sizeof *input_file_name);
+ file_line_count = xmalloc (sizeof *file_line_count);
+ number_input_files = 1;
+ input_file_name[0] = NULL;
+ }
+ else if (gnu_extensions)
+ {
+ number_input_files = argc - optind;
+ input_file_name = xmalloc (number_input_files * sizeof *input_file_name);
+ file_line_count = xmalloc (number_input_files * sizeof *file_line_count);
+
+ for (file_index = 0; file_index < number_input_files; file_index++)
+ {
+ input_file_name[file_index] = argv[optind];
+ if (!*argv[optind] || STREQ (argv[optind], "-"))
+ input_file_name[0] = NULL;
+ else
+ input_file_name[0] = argv[optind];
+ optind++;
+ }
+ }
+ else
+ {
+
+ /* There is one necessary input file. */
+
+ number_input_files = 1;
+ input_file_name = xmalloc (sizeof *input_file_name);
+ file_line_count = xmalloc (sizeof *file_line_count);
+ if (!*argv[optind] || STREQ (argv[optind], "-"))
+ input_file_name[0] = NULL;
+ else
+ input_file_name[0] = argv[optind];
+ optind++;
+
+ /* Redirect standard output, only if requested. */
+
+ if (optind < argc)
+ {
+ if (! freopen (argv[optind], "w", stdout))
+ error (EXIT_FAILURE, errno, "%s", argv[optind]);
+ optind++;
+ }
+
+ /* Diagnose any other argument as an error. */
+
+ if (optind < argc)
+ {
+ error (0, 0, _("extra operand %s"), quote (argv[optind]));
+ usage (EXIT_FAILURE);
+ }
+ }
+
+ /* If the output format has not been explicitly selected, choose dumb
+ terminal format if GNU extensions are enabled, else `roff' format. */
+
+ if (output_format == UNKNOWN_FORMAT)
+ output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
+
+ /* Initialize the main tables. */
+
+ initialize_regex ();
+
+ /* Read `Break character' file, if any. */
+
+ if (break_file)
+ digest_break_file (break_file);
+
+ /* Read `Ignore words' file and `Only words' files, if any. If any of
+ these files is empty, reset the name of the file to NULL, to avoid
+ unnecessary calls to search_table. */
+
+ if (ignore_file)
+ {
+ digest_word_file (ignore_file, &ignore_table);
+ if (ignore_table.length == 0)
+ ignore_file = NULL;
+ }
+
+ if (only_file)
+ {
+ digest_word_file (only_file, &only_table);
+ if (only_table.length == 0)
+ only_file = NULL;
+ }
+
+ /* Prepare to study all the input files. */
+
+ number_of_occurs[0] = 0;
+ total_line_count = 0;
+ maximum_word_length = 0;
+ reference_max_width = 0;
+
+ for (file_index = 0; file_index < number_input_files; file_index++)
+ {
+
+ /* Read the file in core, than study it. */
+
+ swallow_file_in_memory (input_file_name[file_index], &text_buffer);
+ find_occurs_in_text ();
+
+ /* Maintain for each file how many lines has been read so far when its
+ end is reached. Incrementing the count first is a simple kludge to
+ handle a possible incomplete line at end of file. */
+
+ total_line_count++;
+ file_line_count[file_index] = total_line_count;
+ }
+
+ /* Do the output process phase. */
+
+ sort_found_occurs ();
+ fix_output_parameters ();
+ generate_all_output ();
+
+ /* All done. */
+
+ exit (EXIT_SUCCESS);
+}