diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2016-01-20 10:55:18 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2016-01-20 10:55:18 +0000 |
commit | 70e9163c9c18e995515598085cb824e554eb7ae7 (patch) | |
tree | a42dc8b2a6c031354bf31472de888bfc8a060132 /src/uniq.c | |
parent | cbf5993c43f49281173f185863577d86bfac6eae (diff) | |
download | coreutils-tarball-70e9163c9c18e995515598085cb824e554eb7ae7.tar.gz |
coreutils-8.25 HEAD coreutils-8.25 master
Diffstat (limited to 'src/uniq.c')
-rw-r--r-- | src/uniq.c | 575 |
1 files changed, 345 insertions, 230 deletions
@@ -1,10 +1,10 @@ /* uniq -- remove duplicate lines from a sorted file - Copyright (C) 86, 91, 1995-2006 Free Software Foundation, Inc. + Copyright (C) 1986-2016 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or modify + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -12,14 +12,12 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Richard M. Stallman and David MacKenzie. */ -/* Written by Richard Stallman and David MacKenzie. */ - #include <config.h> -#include <stdio.h> #include <getopt.h> #include <sys/types.h> @@ -27,17 +25,21 @@ #include "argmatch.h" #include "linebuffer.h" #include "error.h" +#include "fadvise.h" #include "hard-locale.h" #include "posixver.h" -#include "quote.h" +#include "stdio--.h" #include "xmemcoll.h" #include "xstrtol.h" #include "memcasecmp.h" +#include "quote.h" -/* The official name of this program (e.g., no `g' prefix). */ +/* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "uniq" -#define AUTHORS "Richard Stallman", "David MacKenzie" +#define AUTHORS \ + proper_name ("Richard M. Stallman"), \ + proper_name ("David MacKenzie") #define SWAP_LINES(A, B) \ do \ @@ -49,9 +51,6 @@ } \ while (0) -/* The name this program was run with. 
*/ -char *program_name; - /* True if the LC_COLLATE locale is hard. */ static bool hard_LC_COLLATE; @@ -109,16 +108,53 @@ static enum delimit_method const delimit_method_map[] = /* Select whether/how to delimit groups of duplicate lines. */ static enum delimit_method delimit_groups; +enum grouping_method +{ + /* No grouping, when "--group" isn't used */ + GM_NONE, + + /* Delimiter precedes all groups. --group=prepend */ + GM_PREPEND, + + /* Delimiter follows all groups. --group=append */ + GM_APPEND, + + /* Delimiter between groups. --group[=separate] */ + GM_SEPARATE, + + /* Delimiter before and after each group. --group=both */ + GM_BOTH +}; + +static char const *const grouping_method_string[] = +{ + "prepend", "append", "separate", "both", NULL +}; + +static enum grouping_method const grouping_method_map[] = +{ + GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH +}; + +static enum grouping_method grouping = GM_NONE; + +enum +{ + GROUP_OPTION = CHAR_MAX + 1 +}; + static struct option const longopts[] = { {"count", no_argument, NULL, 'c'}, {"repeated", no_argument, NULL, 'd'}, {"all-repeated", optional_argument, NULL, 'D'}, + {"group", optional_argument, NULL, GROUP_OPTION}, {"ignore-case", no_argument, NULL, 'i'}, {"unique", no_argument, NULL, 'u'}, {"skip-fields", required_argument, NULL, 'f'}, {"skip-chars", required_argument, NULL, 's'}, {"check-chars", required_argument, NULL, 'w'}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -128,35 +164,47 @@ void usage (int status) { if (status != EXIT_SUCCESS) - fprintf (stderr, _("Try `%s --help' for more information.\n"), - program_name); + emit_try_help (); else { printf (_("\ Usage: %s [OPTION]... 
[INPUT [OUTPUT]]\n\ "), - program_name); + program_name); fputs (_("\ -Discard all but one of successive identical lines from INPUT (or\n\ -standard input), writing to OUTPUT (or standard output).\n\ +Filter adjacent matching lines from INPUT (or standard input),\n\ +writing to OUTPUT (or standard output).\n\ \n\ +With no options, matching lines are merged to the first occurrence.\n\ "), stdout); + + emit_mandatory_arg_note (); + fputs (_("\ -Mandatory arguments to long options are mandatory for short options too.\n\ + -c, --count prefix lines by the number of occurrences\n\ + -d, --repeated only print duplicate lines, one for each group\n\ "), stdout); fputs (_("\ - -c, --count prefix lines by the number of occurrences\n\ - -d, --repeated only print duplicate lines\n\ + -D print all duplicate lines\n\ + --all-repeated[=METHOD] like -D, but allow separating groups\n\ + with an empty line;\n\ + METHOD={none(default),prepend,separate}\n\ "), stdout); fputs (_("\ - -D, --all-repeated[=delimit-method] print all duplicate lines\n\ - delimit-method={none(default),prepend,separate}\n\ - Delimiting is done with blank lines.\n\ -f, --skip-fields=N avoid comparing the first N fields\n\ +"), stdout); + fputs (_("\ + --group[=METHOD] show all items, separating groups with an empty line;\n\ + METHOD={separate(default),prepend,append,both}\n\ +"), stdout); + fputs (_("\ -i, --ignore-case ignore differences in case when comparing\n\ -s, --skip-chars=N avoid comparing the first N characters\n\ -u, --unique only print unique lines\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); fputs (_("\ -w, --check-chars=N compare no more than N characters in lines\n\ "), stdout); @@ -164,10 +212,16 @@ Mandatory arguments to long options are mandatory for short options too.\n\ fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ \n\ -A field is a run of whitespace, then non-whitespace characters.\n\ -Fields are skipped before chars.\n\ 
+A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\ +characters. Fields are skipped before chars.\n\ +"), stdout); + fputs (_("\ +\n\ +Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\ +You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\ +Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\ "), stdout); - printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); + emit_ancillary_info (PROGRAM_NAME); } exit (status); } @@ -197,26 +251,25 @@ size_opt (char const *opt, char const *msgid) /* Given a linebuffer LINE, return a pointer to the beginning of the line's field to be compared. */ -static char * -find_field (const struct linebuffer *line) +static char * _GL_ATTRIBUTE_PURE +find_field (struct linebuffer const *line) { size_t count; - char *lp = line->buffer; + char const *lp = line->buffer; size_t size = line->length - 1; size_t i = 0; for (count = 0; count < skip_fields && i < size; count++) { - while (i < size && isblank (lp[i])) - i++; - while (i < size && !isblank (lp[i])) - i++; + while (i < size && field_sep (lp[i])) + i++; + while (i < size && !field_sep (lp[i])) + i++; } - for (count = 0; count < skip_chars && i < size; count++) - i++; + i += MIN (skip_chars, size - i); - return lp + i; + return line->buffer + i; } /* Return false if two strings OLD and NEW match, true if not. @@ -251,11 +304,11 @@ different (char *old, char *new, size_t oldlen, size_t newlen) static void writeline (struct linebuffer const *line, - bool match, uintmax_t linecount) + bool match, uintmax_t linecount) { if (! (linecount == 0 ? output_unique - : !match ? output_first_repeated - : output_later_repeated)) + : !match ? output_first_repeated + : output_later_repeated)) return; if (countmode == count_occurrences) @@ -268,15 +321,17 @@ writeline (struct linebuffer const *line, If either is "-", use the standard I/O stream for it instead. 
*/ static void -check_file (const char *infile, const char *outfile) +check_file (const char *infile, const char *outfile, char delimiter) { struct linebuffer lb1, lb2; struct linebuffer *thisline, *prevline; if (! (STREQ (infile, "-") || freopen (infile, "r", stdin))) - error (EXIT_FAILURE, errno, "%s", infile); + error (EXIT_FAILURE, errno, "%s", quotef (infile)); if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout))) - error (EXIT_FAILURE, errno, "%s", outfile); + error (EXIT_FAILURE, errno, "%s", quotef (outfile)); + + fadvise (stdin, FADVISE_SEQUENTIAL); thisline = &lb1; prevline = &lb2; @@ -284,37 +339,61 @@ check_file (const char *infile, const char *outfile) initbuffer (thisline); initbuffer (prevline); - /* The duplication in the following `if' and `else' blocks is an - optimization to distinguish the common case (in which none of - the following options has been specified: --count, -repeated, - --all-repeated, --unique) from the others. In the common case, - this optimization lets uniq output each different line right away, - without waiting to see if the next one is different. */ + /* The duplication in the following 'if' and 'else' blocks is an + optimization to distinguish between when we can print input + lines immediately (1. & 2.) or not. + 1. --group => all input lines are printed. + checking for unique/duplicated lines is used only for printing + group separators. + + 2. The default case in which none of these options has been specified: + --count, --repeated, --all-repeated, --unique + In the default case, this optimization lets uniq output each different + line right away, without waiting to see if the next one is different. + + 3. All other cases. 
+ */ if (output_unique && output_first_repeated && countmode == count_none) { - char *prevfield IF_LINT (= NULL); - size_t prevlen IF_LINT (= 0); + char *prevfield IF_LINT ( = NULL); + size_t prevlen IF_LINT ( = 0); + bool first_group_printed = false; while (!feof (stdin)) - { - char *thisfield; - size_t thislen; - if (readlinebuffer (thisline, stdin) == 0) - break; - thisfield = find_field (thisline); - thislen = thisline->length - 1 - (thisfield - thisline->buffer); - if (prevline->length == 0 - || different (thisfield, prevfield, thislen, prevlen)) - { - fwrite (thisline->buffer, sizeof (char), - thisline->length, stdout); - - SWAP_LINES (prevline, thisline); - prevfield = thisfield; - prevlen = thislen; - } - } + { + char *thisfield; + size_t thislen; + bool new_group; + + if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) + break; + + thisfield = find_field (thisline); + thislen = thisline->length - 1 - (thisfield - thisline->buffer); + + new_group = (prevline->length == 0 + || different (thisfield, prevfield, thislen, prevlen)); + + if (new_group && grouping != GM_NONE + && (grouping == GM_PREPEND || grouping == GM_BOTH + || (first_group_printed && (grouping == GM_APPEND + || grouping == GM_SEPARATE)))) + putchar (delimiter); + + if (new_group || grouping != GM_NONE) + { + fwrite (thisline->buffer, sizeof (char), + thisline->length, stdout); + + SWAP_LINES (prevline, thisline); + prevfield = thisfield; + prevlen = thislen; + first_group_printed = true; + } + } + if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed) + putchar (delimiter); } else { @@ -323,67 +402,67 @@ check_file (const char *infile, const char *outfile) uintmax_t match_count = 0; bool first_delimiter = true; - if (readlinebuffer (prevline, stdin) == 0) - goto closefiles; + if (readlinebuffer_delim (prevline, stdin, delimiter) == 0) + goto closefiles; prevfield = find_field (prevline); prevlen = prevline->length - 1 - (prevfield - prevline->buffer); while (!feof 
(stdin)) - { - bool match; - char *thisfield; - size_t thislen; - if (readlinebuffer (thisline, stdin) == 0) - { - if (ferror (stdin)) - goto closefiles; - break; - } - thisfield = find_field (thisline); - thislen = thisline->length - 1 - (thisfield - thisline->buffer); - match = !different (thisfield, prevfield, thislen, prevlen); - match_count += match; - - if (match_count == UINTMAX_MAX) - { - if (count_occurrences) - error (EXIT_FAILURE, 0, _("too many repeated lines")); - match_count--; - } + { + bool match; + char *thisfield; + size_t thislen; + if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) + { + if (ferror (stdin)) + goto closefiles; + break; + } + thisfield = find_field (thisline); + thislen = thisline->length - 1 - (thisfield - thisline->buffer); + match = !different (thisfield, prevfield, thislen, prevlen); + match_count += match; + + if (match_count == UINTMAX_MAX) + { + if (count_occurrences) + error (EXIT_FAILURE, 0, _("too many repeated lines")); + match_count--; + } if (delimit_groups != DM_NONE) - { - if (!match) - { - if (match_count) /* a previous match */ - first_delimiter = false; /* Only used when DM_SEPARATE */ - } - else if (match_count == 1) - { - if ((delimit_groups == DM_PREPEND) - || (delimit_groups == DM_SEPARATE - && !first_delimiter)) - putchar ('\n'); - } - } - - if (!match || output_later_repeated) - { - writeline (prevline, match, match_count); - SWAP_LINES (prevline, thisline); - prevfield = thisfield; - prevlen = thislen; - if (!match) - match_count = 0; - } - } + { + if (!match) + { + if (match_count) /* a previous match */ + first_delimiter = false; /* Only used when DM_SEPARATE */ + } + else if (match_count == 1) + { + if ((delimit_groups == DM_PREPEND) + || (delimit_groups == DM_SEPARATE + && !first_delimiter)) + putchar (delimiter); + } + } + + if (!match || output_later_repeated) + { + writeline (prevline, match, match_count); + SWAP_LINES (prevline, thisline); + prevfield = thisfield; + prevlen = thislen; + if 
(!match) + match_count = 0; + } + } writeline (prevline, false, match_count); } closefiles: if (ferror (stdin) || fclose (stdin) != 0) - error (EXIT_FAILURE, 0, _("error reading %s"), infile); + error (EXIT_FAILURE, 0, _("error reading %s"), quoteaf (infile)); /* stdout is handled via the atexit-invoked close_stdout function. */ @@ -404,12 +483,14 @@ main (int argc, char **argv) int optc = 0; bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); enum Skip_field_option_type skip_field_option_type = SFO_NONE; - int nfiles = 0; + unsigned int nfiles = 0; char const *file[2]; + char delimiter = '\n'; /* change with --zero-terminated, -z */ + bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */ file[0] = file[1] = "-"; initialize_main (&argc, &argv); - program_name = argv[0]; + set_program_name (argv[0]); setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); @@ -425,128 +506,162 @@ main (int argc, char **argv) countmode = count_none; delimit_groups = DM_NONE; - for (;;) + while (true) { /* Parse an operand with leading "+" as a file after "--" was seen; or if pedantic and a file was seen; or if not obsolete. 
*/ if (optc == -1 - || (posixly_correct && nfiles != 0) - || ((optc = getopt_long (argc, argv, - "-0123456789Dcdf:is:uw:", longopts, NULL)) - == -1)) - { - if (argc <= optind) - break; - if (nfiles == 2) - { - error (0, 0, _("extra operand %s"), quote (argv[optind])); - usage (EXIT_FAILURE); - } - file[nfiles++] = argv[optind++]; - } + || (posixly_correct && nfiles != 0) + || ((optc = getopt_long (argc, argv, + "-0123456789Dcdf:is:uw:z", longopts, NULL)) + == -1)) + { + if (argc <= optind) + break; + if (nfiles == 2) + { + error (0, 0, _("extra operand %s"), quote (argv[optind])); + usage (EXIT_FAILURE); + } + file[nfiles++] = argv[optind++]; + } else switch (optc) - { - case 1: - { - unsigned long int size; - if (optarg[0] == '+' - && posix2_version () < 200112 - && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK - && size <= SIZE_MAX) - skip_chars = size; - else if (nfiles == 2) - { - error (0, 0, _("extra operand %s"), quote (optarg)); - usage (EXIT_FAILURE); - } - else - file[nfiles++] = optarg; - } - break; - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - if (skip_field_option_type == SFO_NEW) - skip_fields = 0; - - if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t)) - skip_fields = SIZE_MAX; - - skip_field_option_type = SFO_OBSOLETE; - } - break; - - case 'c': - countmode = count_occurrences; - break; - - case 'd': - output_unique = false; - break; - - case 'D': - output_unique = false; - output_later_repeated = true; - if (optarg == NULL) - delimit_groups = DM_NONE; - else - delimit_groups = XARGMATCH ("--all-repeated", optarg, - delimit_method_string, - delimit_method_map); - break; - - case 'f': - skip_field_option_type = SFO_NEW; - skip_fields = size_opt (optarg, - N_("invalid number of fields to skip")); - break; - - case 'i': - ignore_case = true; - break; - - case 's': - skip_chars = size_opt (optarg, - N_("invalid number of bytes to skip")); - break; - - 
case 'u': - output_first_repeated = false; - break; - - case 'w': - check_chars = size_opt (optarg, - N_("invalid number of bytes to compare")); - break; - - case_GETOPT_HELP_CHAR; - - case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); - - default: - usage (EXIT_FAILURE); - } + { + case 1: + { + unsigned long int size; + if (optarg[0] == '+' + && posix2_version () < 200112 + && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK + && size <= SIZE_MAX) + skip_chars = size; + else if (nfiles == 2) + { + error (0, 0, _("extra operand %s"), quote (optarg)); + usage (EXIT_FAILURE); + } + else + file[nfiles++] = optarg; + } + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + if (skip_field_option_type == SFO_NEW) + skip_fields = 0; + + if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t)) + skip_fields = SIZE_MAX; + + skip_field_option_type = SFO_OBSOLETE; + } + break; + + case 'c': + countmode = count_occurrences; + output_option_used = true; + break; + + case 'd': + output_unique = false; + output_option_used = true; + break; + + case 'D': + output_unique = false; + output_later_repeated = true; + if (optarg == NULL) + delimit_groups = DM_NONE; + else + delimit_groups = XARGMATCH ("--all-repeated", optarg, + delimit_method_string, + delimit_method_map); + output_option_used = true; + break; + + case GROUP_OPTION: + if (optarg == NULL) + grouping = GM_SEPARATE; + else + grouping = XARGMATCH ("--group", optarg, + grouping_method_string, + grouping_method_map); + break; + + case 'f': + skip_field_option_type = SFO_NEW; + skip_fields = size_opt (optarg, + N_("invalid number of fields to skip")); + break; + + case 'i': + ignore_case = true; + break; + + case 's': + skip_chars = size_opt (optarg, + N_("invalid number of bytes to skip")); + break; + + case 'u': + output_first_repeated = false; + output_option_used = true; + break; + + case 'w': + check_chars = size_opt (optarg, + 
N_("invalid number of bytes to compare")); + break; + + case 'z': + delimiter = '\0'; + break; + + case_GETOPT_HELP_CHAR; + + case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); + + default: + usage (EXIT_FAILURE); + } + } + + /* Note we could allow --group with -D at least, and that would + avoid the need to specify a grouping method to --all-repeated. + It was thought best to avoid deprecating those parameters though + and keep --group separate to other options. */ + if (grouping != GM_NONE && output_option_used) + { + error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u")); + usage (EXIT_FAILURE); + } + + if (grouping != GM_NONE && countmode != count_none) + { + error (0, 0, + _("grouping and printing repeat counts is meaningless")); + usage (EXIT_FAILURE); } if (countmode == count_occurrences && output_later_repeated) { error (0, 0, - _("printing all duplicated lines and repeat counts is meaningless")); + _("printing all duplicated lines and repeat counts is meaningless")); usage (EXIT_FAILURE); } - check_file (file[0], file[1]); + check_file (file[0], file[1], delimiter); - exit (EXIT_SUCCESS); + return EXIT_SUCCESS; } |