summaryrefslogtreecommitdiff
path: root/src/uniq.c
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2007-03-22 21:23:21 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2007-03-22 21:23:21 +0000
commitcbf5993c43f49281173f185863577d86bfac6eae (patch)
tree90737c96cf15b97273a2bdc5950b3cf09f1d94ca /src/uniq.c
downloadcoreutils-tarball-cbf5993c43f49281173f185863577d86bfac6eae.tar.gz
coreutils-6.9coreutils-6.9
Diffstat (limited to 'src/uniq.c')
-rw-r--r--src/uniq.c552
1 files changed, 552 insertions, 0 deletions
diff --git a/src/uniq.c b/src/uniq.c
new file mode 100644
index 0000000..6c38ed8
--- /dev/null
+++ b/src/uniq.c
@@ -0,0 +1,552 @@
+/* uniq -- remove duplicate lines from a sorted file
+ Copyright (C) 86, 91, 1995-2006 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+/* Written by Richard Stallman and David MacKenzie. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <getopt.h>
+#include <sys/types.h>
+
+#include "system.h"
+#include "argmatch.h"
+#include "linebuffer.h"
+#include "error.h"
+#include "hard-locale.h"
+#include "posixver.h"
+#include "quote.h"
+#include "xmemcoll.h"
+#include "xstrtol.h"
+#include "memcasecmp.h"
+
+/* The official name of this program (e.g., no `g' prefix). */
+#define PROGRAM_NAME "uniq"
+
+#define AUTHORS "Richard Stallman", "David MacKenzie"
+
+#define SWAP_LINES(A, B) \
+ do \
+ { \
+ struct linebuffer *_tmp; \
+ _tmp = (A); \
+ (A) = (B); \
+ (B) = _tmp; \
+ } \
+ while (0)
+
+/* The name this program was run with. */
+char *program_name;
+
+/* True if the LC_COLLATE locale is hard. */
+static bool hard_LC_COLLATE;
+
+/* Number of fields to skip on each line when doing comparisons. */
+static size_t skip_fields;
+
+/* Number of chars to skip after skipping any fields. */
+static size_t skip_chars;
+
+/* Number of chars to compare. */
+static size_t check_chars;
+
+enum countmode
+{
+ count_occurrences, /* -c Print count before output lines. */
+ count_none /* Default. Do not print counts. */
+};
+
+/* Whether and how to precede the output lines with a count of the number of
+ times they occurred in the input. */
+static enum countmode countmode;
+
+/* Which lines to output: unique lines, the first of a group of
+ repeated lines, and the second and subsequented of a group of
+ repeated lines. */
+static bool output_unique;
+static bool output_first_repeated;
+static bool output_later_repeated;
+
+/* If true, ignore case when comparing. */
+static bool ignore_case;
+
+enum delimit_method
+{
+ /* No delimiters output. --all-repeated[=none] */
+ DM_NONE,
+
+ /* Delimiter precedes all groups. --all-repeated=prepend */
+ DM_PREPEND,
+
+ /* Delimit all groups. --all-repeated=separate */
+ DM_SEPARATE
+};
+
+static char const *const delimit_method_string[] =
+{
+ "none", "prepend", "separate", NULL
+};
+
+static enum delimit_method const delimit_method_map[] =
+{
+ DM_NONE, DM_PREPEND, DM_SEPARATE
+};
+
+/* Select whether/how to delimit groups of duplicate lines. */
+static enum delimit_method delimit_groups;
+
+static struct option const longopts[] =
+{
+ {"count", no_argument, NULL, 'c'},
+ {"repeated", no_argument, NULL, 'd'},
+ {"all-repeated", optional_argument, NULL, 'D'},
+ {"ignore-case", no_argument, NULL, 'i'},
+ {"unique", no_argument, NULL, 'u'},
+ {"skip-fields", required_argument, NULL, 'f'},
+ {"skip-chars", required_argument, NULL, 's'},
+ {"check-chars", required_argument, NULL, 'w'},
+ {GETOPT_HELP_OPTION_DECL},
+ {GETOPT_VERSION_OPTION_DECL},
+ {NULL, 0, NULL, 0}
+};
+
+void
+usage (int status)
+{
+ if (status != EXIT_SUCCESS)
+ fprintf (stderr, _("Try `%s --help' for more information.\n"),
+ program_name);
+ else
+ {
+ printf (_("\
+Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
+"),
+ program_name);
+ fputs (_("\
+Discard all but one of successive identical lines from INPUT (or\n\
+standard input), writing to OUTPUT (or standard output).\n\
+\n\
+"), stdout);
+ fputs (_("\
+Mandatory arguments to long options are mandatory for short options too.\n\
+"), stdout);
+ fputs (_("\
+ -c, --count prefix lines by the number of occurrences\n\
+ -d, --repeated only print duplicate lines\n\
+"), stdout);
+ fputs (_("\
+ -D, --all-repeated[=delimit-method] print all duplicate lines\n\
+ delimit-method={none(default),prepend,separate}\n\
+ Delimiting is done with blank lines.\n\
+ -f, --skip-fields=N avoid comparing the first N fields\n\
+ -i, --ignore-case ignore differences in case when comparing\n\
+ -s, --skip-chars=N avoid comparing the first N characters\n\
+ -u, --unique only print unique lines\n\
+"), stdout);
+ fputs (_("\
+ -w, --check-chars=N compare no more than N characters in lines\n\
+"), stdout);
+ fputs (HELP_OPTION_DESCRIPTION, stdout);
+ fputs (VERSION_OPTION_DESCRIPTION, stdout);
+ fputs (_("\
+\n\
+A field is a run of whitespace, then non-whitespace characters.\n\
+Fields are skipped before chars.\n\
+"), stdout);
+ printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
+ }
+ exit (status);
+}
+
+/* Convert OPT to size_t, reporting an error using MSGID if OPT is
+ invalid. Silently convert too-large values to SIZE_MAX. */
+
+static size_t
+size_opt (char const *opt, char const *msgid)
+{
+ unsigned long int size;
+ verify (SIZE_MAX <= ULONG_MAX);
+
+ switch (xstrtoul (opt, NULL, 10, &size, ""))
+ {
+ case LONGINT_OK:
+ case LONGINT_OVERFLOW:
+ break;
+
+ default:
+ error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
+ }
+
+ return MIN (size, SIZE_MAX);
+}
+
+/* Given a linebuffer LINE,
+ return a pointer to the beginning of the line's field to be compared. */
+
+static char *
+find_field (const struct linebuffer *line)
+{
+ size_t count;
+ char *lp = line->buffer;
+ size_t size = line->length - 1;
+ size_t i = 0;
+
+ for (count = 0; count < skip_fields && i < size; count++)
+ {
+ while (i < size && isblank (lp[i]))
+ i++;
+ while (i < size && !isblank (lp[i]))
+ i++;
+ }
+
+ for (count = 0; count < skip_chars && i < size; count++)
+ i++;
+
+ return lp + i;
+}
+
+/* Return false if two strings OLD and NEW match, true if not.
+ OLD and NEW point not to the beginnings of the lines
+ but rather to the beginnings of the fields to compare.
+ OLDLEN and NEWLEN are their lengths. */
+
+static bool
+different (char *old, char *new, size_t oldlen, size_t newlen)
+{
+ if (check_chars < oldlen)
+ oldlen = check_chars;
+ if (check_chars < newlen)
+ newlen = check_chars;
+
+ if (ignore_case)
+ {
+ /* FIXME: This should invoke strcoll somehow. */
+ return oldlen != newlen || memcasecmp (old, new, oldlen);
+ }
+ else if (hard_LC_COLLATE)
+ return xmemcoll (old, oldlen, new, newlen) != 0;
+ else
+ return oldlen != newlen || memcmp (old, new, oldlen);
+}
+
+/* Output the line in linebuffer LINE to standard output
+ provided that the switches say it should be output.
+ MATCH is true if the line matches the previous line.
+ If requested, print the number of times it occurred, as well;
+ LINECOUNT + 1 is the number of times that the line occurred. */
+
+static void
+writeline (struct linebuffer const *line,
+ bool match, uintmax_t linecount)
+{
+ if (! (linecount == 0 ? output_unique
+ : !match ? output_first_repeated
+ : output_later_repeated))
+ return;
+
+ if (countmode == count_occurrences)
+ printf ("%7" PRIuMAX " ", linecount + 1);
+
+ fwrite (line->buffer, sizeof (char), line->length, stdout);
+}
+
+/* Process input file INFILE with output to OUTFILE.
+ If either is "-", use the standard I/O stream for it instead. */
+
+static void
+check_file (const char *infile, const char *outfile)
+{
+ struct linebuffer lb1, lb2;
+ struct linebuffer *thisline, *prevline;
+
+ if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
+ error (EXIT_FAILURE, errno, "%s", infile);
+ if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
+ error (EXIT_FAILURE, errno, "%s", outfile);
+
+ thisline = &lb1;
+ prevline = &lb2;
+
+ initbuffer (thisline);
+ initbuffer (prevline);
+
+ /* The duplication in the following `if' and `else' blocks is an
+ optimization to distinguish the common case (in which none of
+ the following options has been specified: --count, -repeated,
+ --all-repeated, --unique) from the others. In the common case,
+ this optimization lets uniq output each different line right away,
+ without waiting to see if the next one is different. */
+
+ if (output_unique && output_first_repeated && countmode == count_none)
+ {
+ char *prevfield IF_LINT (= NULL);
+ size_t prevlen IF_LINT (= 0);
+
+ while (!feof (stdin))
+ {
+ char *thisfield;
+ size_t thislen;
+ if (readlinebuffer (thisline, stdin) == 0)
+ break;
+ thisfield = find_field (thisline);
+ thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+ if (prevline->length == 0
+ || different (thisfield, prevfield, thislen, prevlen))
+ {
+ fwrite (thisline->buffer, sizeof (char),
+ thisline->length, stdout);
+
+ SWAP_LINES (prevline, thisline);
+ prevfield = thisfield;
+ prevlen = thislen;
+ }
+ }
+ }
+ else
+ {
+ char *prevfield;
+ size_t prevlen;
+ uintmax_t match_count = 0;
+ bool first_delimiter = true;
+
+ if (readlinebuffer (prevline, stdin) == 0)
+ goto closefiles;
+ prevfield = find_field (prevline);
+ prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
+
+ while (!feof (stdin))
+ {
+ bool match;
+ char *thisfield;
+ size_t thislen;
+ if (readlinebuffer (thisline, stdin) == 0)
+ {
+ if (ferror (stdin))
+ goto closefiles;
+ break;
+ }
+ thisfield = find_field (thisline);
+ thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+ match = !different (thisfield, prevfield, thislen, prevlen);
+ match_count += match;
+
+ if (match_count == UINTMAX_MAX)
+ {
+ if (count_occurrences)
+ error (EXIT_FAILURE, 0, _("too many repeated lines"));
+ match_count--;
+ }
+
+ if (delimit_groups != DM_NONE)
+ {
+ if (!match)
+ {
+ if (match_count) /* a previous match */
+ first_delimiter = false; /* Only used when DM_SEPARATE */
+ }
+ else if (match_count == 1)
+ {
+ if ((delimit_groups == DM_PREPEND)
+ || (delimit_groups == DM_SEPARATE
+ && !first_delimiter))
+ putchar ('\n');
+ }
+ }
+
+ if (!match || output_later_repeated)
+ {
+ writeline (prevline, match, match_count);
+ SWAP_LINES (prevline, thisline);
+ prevfield = thisfield;
+ prevlen = thislen;
+ if (!match)
+ match_count = 0;
+ }
+ }
+
+ writeline (prevline, false, match_count);
+ }
+
+ closefiles:
+ if (ferror (stdin) || fclose (stdin) != 0)
+ error (EXIT_FAILURE, 0, _("error reading %s"), infile);
+
+ /* stdout is handled via the atexit-invoked close_stdout function. */
+
+ free (lb1.buffer);
+ free (lb2.buffer);
+}
+
+enum Skip_field_option_type
+ {
+ SFO_NONE,
+ SFO_OBSOLETE,
+ SFO_NEW
+ };
+
+int
+main (int argc, char **argv)
+{
+ int optc = 0;
+ bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
+ enum Skip_field_option_type skip_field_option_type = SFO_NONE;
+ int nfiles = 0;
+ char const *file[2];
+
+ file[0] = file[1] = "-";
+ initialize_main (&argc, &argv);
+ program_name = argv[0];
+ setlocale (LC_ALL, "");
+ bindtextdomain (PACKAGE, LOCALEDIR);
+ textdomain (PACKAGE);
+ hard_LC_COLLATE = hard_locale (LC_COLLATE);
+
+ atexit (close_stdout);
+
+ skip_chars = 0;
+ skip_fields = 0;
+ check_chars = SIZE_MAX;
+ output_unique = output_first_repeated = true;
+ output_later_repeated = false;
+ countmode = count_none;
+ delimit_groups = DM_NONE;
+
+ for (;;)
+ {
+ /* Parse an operand with leading "+" as a file after "--" was
+ seen; or if pedantic and a file was seen; or if not
+ obsolete. */
+
+ if (optc == -1
+ || (posixly_correct && nfiles != 0)
+ || ((optc = getopt_long (argc, argv,
+ "-0123456789Dcdf:is:uw:", longopts, NULL))
+ == -1))
+ {
+ if (argc <= optind)
+ break;
+ if (nfiles == 2)
+ {
+ error (0, 0, _("extra operand %s"), quote (argv[optind]));
+ usage (EXIT_FAILURE);
+ }
+ file[nfiles++] = argv[optind++];
+ }
+ else switch (optc)
+ {
+ case 1:
+ {
+ unsigned long int size;
+ if (optarg[0] == '+'
+ && posix2_version () < 200112
+ && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
+ && size <= SIZE_MAX)
+ skip_chars = size;
+ else if (nfiles == 2)
+ {
+ error (0, 0, _("extra operand %s"), quote (optarg));
+ usage (EXIT_FAILURE);
+ }
+ else
+ file[nfiles++] = optarg;
+ }
+ break;
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ if (skip_field_option_type == SFO_NEW)
+ skip_fields = 0;
+
+ if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
+ skip_fields = SIZE_MAX;
+
+ skip_field_option_type = SFO_OBSOLETE;
+ }
+ break;
+
+ case 'c':
+ countmode = count_occurrences;
+ break;
+
+ case 'd':
+ output_unique = false;
+ break;
+
+ case 'D':
+ output_unique = false;
+ output_later_repeated = true;
+ if (optarg == NULL)
+ delimit_groups = DM_NONE;
+ else
+ delimit_groups = XARGMATCH ("--all-repeated", optarg,
+ delimit_method_string,
+ delimit_method_map);
+ break;
+
+ case 'f':
+ skip_field_option_type = SFO_NEW;
+ skip_fields = size_opt (optarg,
+ N_("invalid number of fields to skip"));
+ break;
+
+ case 'i':
+ ignore_case = true;
+ break;
+
+ case 's':
+ skip_chars = size_opt (optarg,
+ N_("invalid number of bytes to skip"));
+ break;
+
+ case 'u':
+ output_first_repeated = false;
+ break;
+
+ case 'w':
+ check_chars = size_opt (optarg,
+ N_("invalid number of bytes to compare"));
+ break;
+
+ case_GETOPT_HELP_CHAR;
+
+ case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
+
+ default:
+ usage (EXIT_FAILURE);
+ }
+ }
+
+ if (countmode == count_occurrences && output_later_repeated)
+ {
+ error (0, 0,
+ _("printing all duplicated lines and repeat counts is meaningless"));
+ usage (EXIT_FAILURE);
+ }
+
+ check_file (file[0], file[1]);
+
+ exit (EXIT_SUCCESS);
+}