summary refs log tree commit diff
path: root/src/comm.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/comm.c')
-rw-r--r-- src/comm.c 338
1 files changed, 255 insertions, 83 deletions
diff --git a/src/comm.c b/src/comm.c
index 9b7e03f..802bf90 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -1,10 +1,10 @@
/* comm -- compare two sorted files line by line.
- Copyright (C) 86, 90, 91, 1995-2005 Free Software Foundation, Inc.
+ Copyright (C) 1986-2016 Free Software Foundation, Inc.
- This program is free software; you can redistribute it and/or modify
+ This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -12,11 +12,10 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software Foundation,
- Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Written by Richard Stallman and David MacKenzie. */
-
+
#include <config.h>
#include <getopt.h>
@@ -24,23 +23,24 @@
#include "system.h"
#include "linebuffer.h"
#include "error.h"
+#include "fadvise.h"
#include "hard-locale.h"
#include "quote.h"
#include "stdio--.h"
+#include "memcmp2.h"
#include "xmemcoll.h"
-/* The official name of this program (e.g., no `g' prefix). */
+/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "comm"
-#define AUTHORS "Richard Stallman", "David MacKenzie"
+#define AUTHORS \
+ proper_name ("Richard M. Stallman"), \
+ proper_name ("David MacKenzie")
/* Undefine, to avoid warning about redefinition on some systems. */
#undef min
#define min(x, y) ((x) < (y) ? (x) : (y))
-/* The name this program was run with. */
-char *program_name;
-
/* True if the LC_COLLATE locale is hard. */
static bool hard_LC_COLLATE;
@@ -53,45 +53,105 @@ static bool only_file_2;
/* If true, print lines that are found in both files. */
static bool both;
+/* True once we have seen at least one unpairable line. */
+static bool seen_unpairable;
+
+/* Element I is true once we have warned about disorder in file I + 1. */
+static bool issued_disorder_warning[2];
+
+/* Line delimiter: newline by default, NUL with -z. */
+static unsigned char delim = '\n';
+
+/* Policy for checking that the input is correctly ordered. */
+static enum
+ {
+ CHECK_ORDER_DEFAULT,
+ CHECK_ORDER_ENABLED,
+ CHECK_ORDER_DISABLED
+ } check_input_order;
+
+/* Output columns will be delimited with this string, which may be set
+ on the command-line with --output-delimiter=STR. */
+static char const *col_sep = "\t";
+static size_t col_sep_len = 0;
+
+/* For long options that have no equivalent short option, use a
+ non-character as a pseudo short option, starting with CHAR_MAX + 1. */
+enum
+{
+ CHECK_ORDER_OPTION = CHAR_MAX + 1,
+ NOCHECK_ORDER_OPTION,
+ OUTPUT_DELIMITER_OPTION
+};
+
static struct option const long_options[] =
{
+ {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
+ {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
+ {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
+ {"zero-terminated", no_argument, NULL, 'z'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
};
-
void
usage (int status)
{
if (status != EXIT_SUCCESS)
- fprintf (stderr, _("Try `%s --help' for more information.\n"),
- program_name);
+ emit_try_help ();
else
{
printf (_("\
Usage: %s [OPTION]... FILE1 FILE2\n\
"),
- program_name);
+ program_name);
fputs (_("\
Compare sorted files FILE1 and FILE2 line by line.\n\
"), stdout);
fputs (_("\
\n\
+When FILE1 or FILE2 (not both) is -, read standard input.\n\
+"), stdout);
+ fputs (_("\
+\n\
With no options, produce three-column output. Column one contains\n\
lines unique to FILE1, column two contains lines unique to FILE2,\n\
and column three contains lines common to both files.\n\
"), stdout);
fputs (_("\
\n\
- -1 suppress lines unique to FILE1\n\
- -2 suppress lines unique to FILE2\n\
- -3 suppress lines that appear in both files\n\
+ -1 suppress column 1 (lines unique to FILE1)\n\
+ -2 suppress column 2 (lines unique to FILE2)\n\
+ -3 suppress column 3 (lines that appear in both files)\n\
+"), stdout);
+ fputs (_("\
+\n\
+ --check-order check that the input is correctly sorted, even\n\
+ if all input lines are pairable\n\
+ --nocheck-order do not check that the input is correctly sorted\n\
+"), stdout);
+ fputs (_("\
+ --output-delimiter=STR separate columns with STR\n\
+"), stdout);
+ fputs (_("\
+ -z, --zero-terminated line delimiter is NUL, not newline\n\
"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
- printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
+ fputs (_("\
+\n\
+Note, comparisons honor the rules specified by 'LC_COLLATE'.\n\
+"), stdout);
+ printf (_("\
+\n\
+Examples:\n\
+ %s -12 file1 file2 Print only lines present in both file1 and file2.\n\
+ %s -3 file1 file2 Print lines in file1 not in file2, and vice versa.\n\
+"),
+ program_name, program_name);
+ emit_ancillary_info (PROGRAM_NAME);
}
exit (status);
}
@@ -102,38 +162,80 @@ and column three contains lines common to both files.\n\
2 for a line only in file 2, 3 for a line in both. */
static void
-writeline (const struct linebuffer *line, FILE *stream, int class)
+writeline (struct linebuffer const *line, FILE *stream, int class)
{
switch (class)
{
case 1:
if (!only_file_1)
- return;
+ return;
break;
case 2:
if (!only_file_2)
- return;
- /* Print a TAB if we are printing lines from file 1. */
+ return;
if (only_file_1)
- putc ('\t', stream);
+ fwrite (col_sep, 1, col_sep_len, stream);
break;
case 3:
if (!both)
- return;
- /* Print a TAB if we are printing lines from file 1. */
+ return;
if (only_file_1)
- putc ('\t', stream);
- /* Print a TAB if we are printing lines from file 2. */
+ fwrite (col_sep, 1, col_sep_len, stream);
if (only_file_2)
- putc ('\t', stream);
+ fwrite (col_sep, 1, col_sep_len, stream);
break;
}
fwrite (line->buffer, sizeof (char), line->length, stream);
}
+/* Check that successive input lines PREV and CURRENT from input file
+ WHATFILE are presented in order.
+
+ If the user specified --nocheck-order, the check is not made.
+ If the user specified --check-order, the problem is fatal.
+ Otherwise (the default), the message is simply a warning.
+
+ A message is printed at most once per input file.
+
+ This function was copied (nearly) verbatim from 'src/join.c'. */
+
+static void
+check_order (struct linebuffer const *prev,
+ struct linebuffer const *current,
+ int whatfile)
+{
+
+ if (check_input_order != CHECK_ORDER_DISABLED
+ && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
+ {
+ if (!issued_disorder_warning[whatfile - 1])
+ {
+ int order;
+
+ if (hard_LC_COLLATE)
+ order = xmemcoll (prev->buffer, prev->length - 1,
+ current->buffer, current->length - 1);
+ else
+ order = memcmp2 (prev->buffer, prev->length - 1,
+ current->buffer, current->length - 1);
+
+ if (0 < order)
+ {
+ error ((check_input_order == CHECK_ORDER_ENABLED
+ ? EXIT_FAILURE : 0),
+ 0, _("file %d is not in sorted order"), whatfile);
+
+ /* If we get to here, the message was just a warning, but we
+ want only to issue it once. */
+ issued_disorder_warning[whatfile - 1] = true;
+ }
+ }
+ }
+}
+
/* Compare INFILES[0] and INFILES[1].
If either is "-", use the standard input for that file.
Assume that each input file is sorted;
@@ -142,85 +244,126 @@ writeline (const struct linebuffer *line, FILE *stream, int class)
static void
compare_files (char **infiles)
{
- /* For each file, we have one linebuffer in lb1. */
- struct linebuffer lb1[2];
+ /* For each file, we have four linebuffers in lba. */
+ struct linebuffer lba[2][4];
/* thisline[i] points to the linebuffer holding the next available line
in file i, or is NULL if there are no lines left in that file. */
struct linebuffer *thisline[2];
+ /* all_line[i][alt[i][0]] also points to the linebuffer holding the
+ current line in file i. We keep two buffers of history around so we
+ can look two lines back when we get to the end of a file. */
+ struct linebuffer *all_line[2][4];
+
+ /* This is used to rotate through the buffers for each input file. */
+ int alt[2][3];
+
/* streams[i] holds the input stream for file i. */
FILE *streams[2];
- int i;
+ int i, j;
/* Initialize the storage. */
for (i = 0; i < 2; i++)
{
- initbuffer (&lb1[i]);
- thisline[i] = &lb1[i];
+ for (j = 0; j < 4; j++)
+ {
+ initbuffer (&lba[i][j]);
+ all_line[i][j] = &lba[i][j];
+ }
+ alt[i][0] = 0;
+ alt[i][1] = 0;
+ alt[i][2] = 0;
streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r"));
if (!streams[i])
- error (EXIT_FAILURE, errno, "%s", infiles[i]);
+ error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
- thisline[i] = readlinebuffer (thisline[i], streams[i]);
+ fadvise (streams[i], FADVISE_SEQUENTIAL);
+
+ thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]], streams[i],
+ delim);
if (ferror (streams[i]))
- error (EXIT_FAILURE, errno, "%s", infiles[i]);
+ error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
}
while (thisline[0] || thisline[1])
{
int order;
+ bool fill_up[2] = { false, false };
/* Compare the next available lines of the two files. */
if (!thisline[0])
- order = 1;
+ order = 1;
else if (!thisline[1])
- order = -1;
+ order = -1;
else
- {
- if (hard_LC_COLLATE)
- order = xmemcoll (thisline[0]->buffer, thisline[0]->length - 1,
- thisline[1]->buffer, thisline[1]->length - 1);
- else
- {
- size_t len = min (thisline[0]->length, thisline[1]->length) - 1;
- order = memcmp (thisline[0]->buffer, thisline[1]->buffer, len);
- if (order == 0)
- order = (thisline[0]->length < thisline[1]->length
- ? -1
- : thisline[0]->length != thisline[1]->length);
- }
- }
+ {
+ if (hard_LC_COLLATE)
+ order = xmemcoll (thisline[0]->buffer, thisline[0]->length - 1,
+ thisline[1]->buffer, thisline[1]->length - 1);
+ else
+ {
+ size_t len = min (thisline[0]->length, thisline[1]->length) - 1;
+ order = memcmp (thisline[0]->buffer, thisline[1]->buffer, len);
+ if (order == 0)
+ order = (thisline[0]->length < thisline[1]->length
+ ? -1
+ : thisline[0]->length != thisline[1]->length);
+ }
+ }
/* Output the line that is lesser. */
if (order == 0)
- writeline (thisline[1], stdout, 3);
- else if (order > 0)
- writeline (thisline[1], stdout, 2);
+ writeline (thisline[1], stdout, 3);
else
- writeline (thisline[0], stdout, 1);
+ {
+ seen_unpairable = true;
+ if (order <= 0)
+ writeline (thisline[0], stdout, 1);
+ else
+ writeline (thisline[1], stdout, 2);
+ }
/* Step the file the line came from.
- If the files match, step both files. */
- if (order >= 0)
- {
- thisline[1] = readlinebuffer (thisline[1], streams[1]);
- if (ferror (streams[1]))
- error (EXIT_FAILURE, errno, "%s", infiles[1]);
- }
+ If the files match, step both files. */
+ if (0 <= order)
+ fill_up[1] = true;
if (order <= 0)
- {
- thisline[0] = readlinebuffer (thisline[0], streams[0]);
- if (ferror (streams[0]))
- error (EXIT_FAILURE, errno, "%s", infiles[0]);
- }
+ fill_up[0] = true;
+
+ for (i = 0; i < 2; i++)
+ if (fill_up[i])
+ {
+ /* Rotate the buffers for this file. */
+ alt[i][2] = alt[i][1];
+ alt[i][1] = alt[i][0];
+ alt[i][0] = (alt[i][0] + 1) & 0x03;
+
+ thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]],
+ streams[i], delim);
+
+ if (thisline[i])
+ check_order (all_line[i][alt[i][1]], thisline[i], i + 1);
+
+ /* If this is the end of the file we may need to re-check
+ the order of the previous two lines, since we might have
+ discovered an unpairable match since we checked before. */
+ else if (all_line[i][alt[i][2]]->buffer)
+ check_order (all_line[i][alt[i][2]],
+ all_line[i][alt[i][1]], i + 1);
+
+ if (ferror (streams[i]))
+ error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
+
+ fill_up[i] = false;
+ }
}
for (i = 0; i < 2; i++)
if (fclose (streams[i]) != 0)
- error (EXIT_FAILURE, errno, "%s", infiles[i]);
+ error (EXIT_FAILURE, errno, "%s", quotef (infiles[i]));
}
int
@@ -229,7 +372,7 @@ main (int argc, char **argv)
int c;
initialize_main (&argc, &argv);
- program_name = argv[0];
+ set_program_name (argv[0]);
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
@@ -241,35 +384,61 @@ main (int argc, char **argv)
only_file_2 = true;
both = true;
- while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1)
+ seen_unpairable = false;
+ issued_disorder_warning[0] = issued_disorder_warning[1] = false;
+ check_input_order = CHECK_ORDER_DEFAULT;
+
+ while ((c = getopt_long (argc, argv, "123z", long_options, NULL)) != -1)
switch (c)
{
case '1':
- only_file_1 = false;
- break;
+ only_file_1 = false;
+ break;
case '2':
- only_file_2 = false;
- break;
+ only_file_2 = false;
+ break;
case '3':
- both = false;
- break;
+ both = false;
+ break;
+
+ case 'z':
+ delim = '\0';
+ break;
+
+ case NOCHECK_ORDER_OPTION:
+ check_input_order = CHECK_ORDER_DISABLED;
+ break;
+
+ case CHECK_ORDER_OPTION:
+ check_input_order = CHECK_ORDER_ENABLED;
+ break;
+
+ case OUTPUT_DELIMITER_OPTION:
+ if (col_sep_len && !STREQ (col_sep, optarg))
+ error (EXIT_FAILURE, 0, _("multiple output delimiters specified"));
+ col_sep = optarg;
+ col_sep_len = *optarg ? strlen (optarg) : 1;
+ break;
case_GETOPT_HELP_CHAR;
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
default:
- usage (EXIT_FAILURE);
+ usage (EXIT_FAILURE);
}
+ if (! col_sep_len)
+ col_sep_len = 1;
+
if (argc - optind < 2)
{
if (argc <= optind)
- error (0, 0, _("missing operand"));
+ error (0, 0, _("missing operand"));
else
- error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
+ error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
usage (EXIT_FAILURE);
}
@@ -281,5 +450,8 @@ main (int argc, char **argv)
compare_files (argv + optind);
- exit (EXIT_SUCCESS);
+ if (issued_disorder_warning[0] || issued_disorder_warning[1])
+ return EXIT_FAILURE;
+ else
+ return EXIT_SUCCESS;
}