diff options
Diffstat (limited to 'src/comm.c')
-rw-r--r-- | src/comm.c | 338 |
1 files changed, 255 insertions, 83 deletions
@@ -1,10 +1,10 @@ /* comm -- compare two sorted files line by line. - Copyright (C) 86, 90, 91, 1995-2005 Free Software Foundation, Inc. + Copyright (C) 1986-2016 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or modify + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -12,11 +12,10 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* Written by Richard Stallman and David MacKenzie. */ - + #include <config.h> #include <getopt.h> @@ -24,23 +23,24 @@ #include "system.h" #include "linebuffer.h" #include "error.h" +#include "fadvise.h" #include "hard-locale.h" #include "quote.h" #include "stdio--.h" +#include "memcmp2.h" #include "xmemcoll.h" -/* The official name of this program (e.g., no `g' prefix). */ +/* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "comm" -#define AUTHORS "Richard Stallman", "David MacKenzie" +#define AUTHORS \ + proper_name ("Richard M. Stallman"), \ + proper_name ("David MacKenzie") /* Undefine, to avoid warning about redefinition on some systems. */ #undef min #define min(x, y) ((x) < (y) ? (x) : (y)) -/* The name this program was run with. */ -char *program_name; - /* True if the LC_COLLATE locale is hard. */ static bool hard_LC_COLLATE; @@ -53,45 +53,105 @@ static bool only_file_2; /* If true, print lines that are found in both files. */ static bool both; +/* If nonzero, we have seen at least one unpairable line. */ +static bool seen_unpairable; + +/* If nonzero, we have warned about disorder in that file. */ +static bool issued_disorder_warning[2]; + +/* line delimiter. */ +static unsigned char delim = '\n'; + +/* If nonzero, check that the input is correctly ordered. */ +static enum + { + CHECK_ORDER_DEFAULT, + CHECK_ORDER_ENABLED, + CHECK_ORDER_DISABLED + } check_input_order; + +/* Output columns will be delimited with this string, which may be set + on the command-line with --output-delimiter=STR. */ +static char const *col_sep = "\t"; +static size_t col_sep_len = 0; + +/* For long options that have no equivalent short option, use a + non-character as a pseudo short option, starting with CHAR_MAX + 1. */ +enum +{ + CHECK_ORDER_OPTION = CHAR_MAX + 1, + NOCHECK_ORDER_OPTION, + OUTPUT_DELIMITER_OPTION +}; + static struct option const long_options[] = { + {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, + {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, + {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} }; - void usage (int status) { if (status != EXIT_SUCCESS) - fprintf (stderr, _("Try `%s --help' for more information.\n"), - program_name); + emit_try_help (); else { printf (_("\ Usage: %s [OPTION]... FILE1 FILE2\n\ "), - program_name); + program_name); fputs (_("\ Compare sorted files FILE1 and FILE2 line by line.\n\ "), stdout); fputs (_("\ \n\ +When FILE1 or FILE2 (not both) is -, read standard input.\n\ +"), stdout); + fputs (_("\ +\n\ With no options, produce three-column output. Column one contains\n\ lines unique to FILE1, column two contains lines unique to FILE2,\n\ and column three contains lines common to both files.\n\ "), stdout); fputs (_("\ \n\ - -1 suppress lines unique to FILE1\n\ - -2 suppress lines unique to FILE2\n\ - -3 suppress lines that appear in both files\n\ + -1 suppress column 1 (lines unique to FILE1)\n\ + -2 suppress column 2 (lines unique to FILE2)\n\ + -3 suppress column 3 (lines that appear in both files)\n\ +"), stdout); + fputs (_("\ +\n\ + --check-order check that the input is correctly sorted, even\n\ + if all input lines are pairable\n\ + --nocheck-order do not check that the input is correctly sorted\n\ +"), stdout); + fputs (_("\ + --output-delimiter=STR separate columns with STR\n\ +"), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); - printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); + fputs (_("\ +\n\ +Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\ +"), stdout); + printf (_("\ +\n\ +Examples:\n\ + %s -12 file1 file2 Print only lines present in both file1 and file2.\n\ + %s -3 file1 file2 Print lines in file1 not in file2, and vice versa.\n\ +"), + program_name, program_name); + emit_ancillary_info (PROGRAM_NAME); } exit (status); } @@ -102,38 +162,80 @@ and column three contains lines common to both files.\n\ 2 for a line only in file 2, 3 for a line in both. */ static void -writeline (const struct linebuffer *line, FILE *stream, int class) +writeline (struct linebuffer const *line, FILE *stream, int class) { switch (class) { case 1: if (!only_file_1) - return; + return; break; case 2: if (!only_file_2) - return; - /* Print a TAB if we are printing lines from file 1. */ + return; if (only_file_1) - putc ('\t', stream); + fwrite (col_sep, 1, col_sep_len, stream); break; case 3: if (!both) - return; - /* Print a TAB if we are printing lines from file 1. */ + return; if (only_file_1) - putc ('\t', stream); - /* Print a TAB if we are printing lines from file 2. */ + fwrite (col_sep, 1, col_sep_len, stream); if (only_file_2) - putc ('\t', stream); + fwrite (col_sep, 1, col_sep_len, stream); break; } fwrite (line->buffer, sizeof (char), line->length, stream); } +/* Check that successive input lines PREV and CURRENT from input file + WHATFILE are presented in order. + + If the user specified --nocheck-order, the check is not made. + If the user specified --check-order, the problem is fatal. + Otherwise (the default), the message is simply a warning. + + A message is printed at most once per input file. + + This function was copied (nearly) verbatim from 'src/join.c'. */ + +static void +check_order (struct linebuffer const *prev, + struct linebuffer const *current, + int whatfile) +{ + + if (check_input_order != CHECK_ORDER_DISABLED + && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable)) + { + if (!issued_disorder_warning[whatfile - 1]) + { + int order; + + if (hard_LC_COLLATE) + order = xmemcoll (prev->buffer, prev->length - 1, + current->buffer, current->length - 1); + else + order = memcmp2 (prev->buffer, prev->length - 1, + current->buffer, current->length - 1); + + if (0 < order) + { + error ((check_input_order == CHECK_ORDER_ENABLED + ? EXIT_FAILURE : 0), + 0, _("file %d is not in sorted order"), whatfile); + + /* If we get to here, the message was just a warning, but we + want only to issue it once. */ + issued_disorder_warning[whatfile - 1] = true; + } + } + } +} + /* Compare INFILES[0] and INFILES[1]. If either is "-", use the standard input for that file. Assume that each input file is sorted; @@ -142,85 +244,126 @@ writeline (const struct linebuffer *line, FILE *stream, int class) static void compare_files (char **infiles) { - /* For each file, we have one linebuffer in lb1. */ - struct linebuffer lb1[2]; + /* For each file, we have four linebuffers in lba. */ + struct linebuffer lba[2][4]; /* thisline[i] points to the linebuffer holding the next available line in file i, or is NULL if there are no lines left in that file. */ struct linebuffer *thisline[2]; + /* all_line[i][alt[i][0]] also points to the linebuffer holding the + current line in file i. We keep two buffers of history around so we + can look two lines back when we get to the end of a file. */ + struct linebuffer *all_line[2][4]; + + /* This is used to rotate through the buffers for each input file. */ + int alt[2][3]; + /* streams[i] holds the input stream for file i. */ FILE *streams[2]; - int i; + int i, j; /* Initialize the storage. */ for (i = 0; i < 2; i++) { - initbuffer (&lb1[i]); - thisline[i] = &lb1[i]; + for (j = 0; j < 4; j++) + { + initbuffer (&lba[i][j]); + all_line[i][j] = &lba[i][j]; + } + alt[i][0] = 0; + alt[i][1] = 0; + alt[i][2] = 0; streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r")); if (!streams[i]) - error (EXIT_FAILURE, errno, "%s", infiles[i]); + error (EXIT_FAILURE, errno, "%s", quotef (infiles[i])); - thisline[i] = readlinebuffer (thisline[i], streams[i]); + fadvise (streams[i], FADVISE_SEQUENTIAL); + + thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]], streams[i], + delim); if (ferror (streams[i])) - error (EXIT_FAILURE, errno, "%s", infiles[i]); + error (EXIT_FAILURE, errno, "%s", quotef (infiles[i])); } while (thisline[0] || thisline[1]) { int order; + bool fill_up[2] = { false, false }; /* Compare the next available lines of the two files. */ if (!thisline[0]) - order = 1; + order = 1; else if (!thisline[1]) - order = -1; + order = -1; else - { - if (hard_LC_COLLATE) - order = xmemcoll (thisline[0]->buffer, thisline[0]->length - 1, - thisline[1]->buffer, thisline[1]->length - 1); - else - { - size_t len = min (thisline[0]->length, thisline[1]->length) - 1; - order = memcmp (thisline[0]->buffer, thisline[1]->buffer, len); - if (order == 0) - order = (thisline[0]->length < thisline[1]->length - ? -1 - : thisline[0]->length != thisline[1]->length); - } - } + { + if (hard_LC_COLLATE) + order = xmemcoll (thisline[0]->buffer, thisline[0]->length - 1, + thisline[1]->buffer, thisline[1]->length - 1); + else + { + size_t len = min (thisline[0]->length, thisline[1]->length) - 1; + order = memcmp (thisline[0]->buffer, thisline[1]->buffer, len); + if (order == 0) + order = (thisline[0]->length < thisline[1]->length + ? -1 + : thisline[0]->length != thisline[1]->length); + } + } /* Output the line that is lesser. */ if (order == 0) - writeline (thisline[1], stdout, 3); - else if (order > 0) - writeline (thisline[1], stdout, 2); + writeline (thisline[1], stdout, 3); else - writeline (thisline[0], stdout, 1); + { + seen_unpairable = true; + if (order <= 0) + writeline (thisline[0], stdout, 1); + else + writeline (thisline[1], stdout, 2); + } /* Step the file the line came from. - If the files match, step both files. */ - if (order >= 0) - { - thisline[1] = readlinebuffer (thisline[1], streams[1]); - if (ferror (streams[1])) - error (EXIT_FAILURE, errno, "%s", infiles[1]); - } + If the files match, step both files. */ + if (0 <= order) + fill_up[1] = true; if (order <= 0) - { - thisline[0] = readlinebuffer (thisline[0], streams[0]); - if (ferror (streams[0])) - error (EXIT_FAILURE, errno, "%s", infiles[0]); - } + fill_up[0] = true; + + for (i = 0; i < 2; i++) + if (fill_up[i]) + { + /* Rotate the buffers for this file. */ + alt[i][2] = alt[i][1]; + alt[i][1] = alt[i][0]; + alt[i][0] = (alt[i][0] + 1) & 0x03; + + thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]], + streams[i], delim); + + if (thisline[i]) + check_order (all_line[i][alt[i][1]], thisline[i], i + 1); + + /* If this is the end of the file we may need to re-check + the order of the previous two lines, since we might have + discovered an unpairable match since we checked before. */ + else if (all_line[i][alt[i][2]]->buffer) + check_order (all_line[i][alt[i][2]], + all_line[i][alt[i][1]], i + 1); + + if (ferror (streams[i])) + error (EXIT_FAILURE, errno, "%s", quotef (infiles[i])); + + fill_up[i] = false; + } } for (i = 0; i < 2; i++) if (fclose (streams[i]) != 0) - error (EXIT_FAILURE, errno, "%s", infiles[i]); + error (EXIT_FAILURE, errno, "%s", quotef (infiles[i])); } int @@ -229,7 +372,7 @@ main (int argc, char **argv) int c; initialize_main (&argc, &argv); - program_name = argv[0]; + set_program_name (argv[0]); setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); @@ -241,35 +384,61 @@ main (int argc, char **argv) only_file_2 = true; both = true; - while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1) + seen_unpairable = false; + issued_disorder_warning[0] = issued_disorder_warning[1] = false; + check_input_order = CHECK_ORDER_DEFAULT; + + while ((c = getopt_long (argc, argv, "123z", long_options, NULL)) != -1) switch (c) { case '1': - only_file_1 = false; - break; + only_file_1 = false; + break; case '2': - only_file_2 = false; - break; + only_file_2 = false; + break; case '3': - both = false; - break; + both = false; + break; + + case 'z': + delim = '\0'; + break; + + case NOCHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_DISABLED; + break; + + case CHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_ENABLED; + break; + + case OUTPUT_DELIMITER_OPTION: + if (col_sep_len && !STREQ (col_sep, optarg)) + error (EXIT_FAILURE, 0, _("multiple output delimiters specified")); + col_sep = optarg; + col_sep_len = *optarg ? strlen (optarg) : 1; + break; case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); default: - usage (EXIT_FAILURE); + usage (EXIT_FAILURE); } + if (! col_sep_len) + col_sep_len = 1; + if (argc - optind < 2) { if (argc <= optind) - error (0, 0, _("missing operand")); + error (0, 0, _("missing operand")); else - error (0, 0, _("missing operand after %s"), quote (argv[argc - 1])); + error (0, 0, _("missing operand after %s"), quote (argv[argc - 1])); usage (EXIT_FAILURE); } @@ -281,5 +450,8 @@ main (int argc, char **argv) compare_files (argv + optind); - exit (EXIT_SUCCESS); + if (issued_disorder_warning[0] || issued_disorder_warning[1]) + return EXIT_FAILURE; + else + return EXIT_SUCCESS; } |