diff options
Diffstat (limited to 'src/join.c')
-rw-r--r-- | src/join.c | 940 |
1 files changed, 940 insertions, 0 deletions
diff --git a/src/join.c b/src/join.c new file mode 100644 index 0000000..b113c54 --- /dev/null +++ b/src/join.c @@ -0,0 +1,940 @@ +/* join - join lines of two files on a common field + Copyright (C) 91, 1995-2006 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + Written by Mike Haertel, mike@gnu.ai.mit.edu. */ + +#include <config.h> + +#include <assert.h> +#include <sys/types.h> +#include <getopt.h> + +#include "system.h" +#include "error.h" +#include "hard-locale.h" +#include "linebuffer.h" +#include "memcasecmp.h" +#include "quote.h" +#include "stdio--.h" +#include "xmemcoll.h" +#include "xstrtol.h" + +/* The official name of this program (e.g., no `g' prefix). */ +#define PROGRAM_NAME "join" + +#define AUTHORS "Mike Haertel" + +#define join system_join + +/* An element of the list identifying which fields to print for each + output line. */ +struct outlist + { + /* File number: 0, 1, or 2. 0 means use the join field. + 1 means use the first file argument, 2 the second. */ + int file; + + /* Field index (zero-based), specified only when FILE is 1 or 2. */ + size_t field; + + struct outlist *next; + }; + +/* A field of a line. */ +struct field + { + char *beg; /* First character in field. */ + size_t len; /* The length of the field. */ + }; + +/* A line read from an input file. */ +struct line + { + struct linebuffer buf; /* The line itself. */ + size_t nfields; /* Number of elements in `fields'. */ + size_t nfields_allocated; /* Number of elements allocated for `fields'. */ + struct field *fields; + }; + +/* One or more consecutive lines read from a file that all have the + same join field value. */ +struct seq + { + size_t count; /* Elements used in `lines'. */ + size_t alloc; /* Elements allocated in `lines'. */ + struct line *lines; + }; + +/* The name this program was run with. */ +char *program_name; + +/* True if the LC_COLLATE locale is hard. */ +static bool hard_LC_COLLATE; + +/* If nonzero, print unpairable lines in file 1 or 2. */ +static bool print_unpairables_1, print_unpairables_2; + +/* If nonzero, print pairable lines. */ +static bool print_pairables; + +/* Empty output field filler. */ +static char const *empty_filler; + +/* Field to join on; SIZE_MAX means they haven't been determined yet. */ +static size_t join_field_1 = SIZE_MAX; +static size_t join_field_2 = SIZE_MAX; + +/* List of fields to print. */ +static struct outlist outlist_head; + +/* Last element in `outlist', where a new element can be added. */ +static struct outlist *outlist_end = &outlist_head; + +/* Tab character separating fields. If negative, fields are separated + by any nonempty string of blanks, otherwise by exactly one + tab character whose value (when cast to unsigned char) equals TAB. */ +static int tab = -1; + +static struct option const longopts[] = +{ + {"ignore-case", no_argument, NULL, 'i'}, + {GETOPT_HELP_OPTION_DECL}, + {GETOPT_VERSION_OPTION_DECL}, + {NULL, 0, NULL, 0} +}; + +/* Used to print non-joining lines */ +static struct line uni_blank; + +/* If nonzero, ignore case when comparing join fields. */ +static bool ignore_case; + +void +usage (int status) +{ + if (status != EXIT_SUCCESS) + fprintf (stderr, _("Try `%s --help' for more information.\n"), + program_name); + else + { + printf (_("\ +Usage: %s [OPTION]... FILE1 FILE2\n\ +"), + program_name); + fputs (_("\ +For each pair of input lines with identical join fields, write a line to\n\ +standard output. The default join field is the first, delimited\n\ +by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ +\n\ + -a FILENUM print unpairable lines coming from file FILENUM, where\n\ + FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\ + -e EMPTY replace missing input fields with EMPTY\n\ +"), stdout); + fputs (_("\ + -i, --ignore-case ignore differences in case when comparing fields\n\ + -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\ + -o FORMAT obey FORMAT while constructing output line\n\ + -t CHAR use CHAR as input and output field separator\n\ +"), stdout); + fputs (_("\ + -v FILENUM like -a FILENUM, but suppress joined output lines\n\ + -1 FIELD join on this FIELD of file 1\n\ + -2 FIELD join on this FIELD of file 2\n\ +"), stdout); + fputs (HELP_OPTION_DESCRIPTION, stdout); + fputs (VERSION_OPTION_DESCRIPTION, stdout); + fputs (_("\ +\n\ +Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\ +else fields are separated by CHAR. Any FIELD is a field number counted\n\ +from 1. FORMAT is one or more comma or blank separated specifications,\n\ +each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\ +the remaining fields from FILE1, the remaining fields from FILE2, all\n\ +separated by CHAR.\n\ +\n\ +Important: FILE1 and FILE2 must be sorted on the join fields.\n\ +E.g., use `sort -k 1b,1' if `join' has no options.\n\ +"), stdout); + printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); + } + exit (status); +} + +/* Record a field in LINE, with location FIELD and size LEN. */ + +static void +extract_field (struct line *line, char *field, size_t len) +{ + if (line->nfields >= line->nfields_allocated) + { + line->fields = X2NREALLOC (line->fields, &line->nfields_allocated); + } + line->fields[line->nfields].beg = field; + line->fields[line->nfields].len = len; + ++(line->nfields); +} + +/* Fill in the `fields' structure in LINE. */ + +static void +xfields (struct line *line) +{ + char *ptr = line->buf.buffer; + char const *lim = ptr + line->buf.length - 1; + + if (ptr == lim) + return; + + if (0 <= tab) + { + char *sep; + for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1) + extract_field (line, ptr, sep - ptr); + } + else + { + /* Skip leading blanks before the first field. */ + while (isblank (to_uchar (*ptr))) + if (++ptr == lim) + return; + + do + { + char *sep; + for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++) + continue; + extract_field (line, ptr, sep - ptr); + if (sep == lim) + return; + for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++) + continue; + } + while (ptr != lim); + } + + extract_field (line, ptr, lim - ptr); +} + +/* Read a line from FP into LINE and split it into fields. + Return true if successful. */ + +static bool +get_line (FILE *fp, struct line *line) +{ + initbuffer (&line->buf); + + if (! readlinebuffer (&line->buf, fp)) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("read error")); + free (line->buf.buffer); + line->buf.buffer = NULL; + return false; + } + + line->nfields_allocated = 0; + line->nfields = 0; + line->fields = NULL; + xfields (line); + return true; +} + +static void +freeline (struct line *line) +{ + free (line->fields); + free (line->buf.buffer); + line->buf.buffer = NULL; +} + +static void +initseq (struct seq *seq) +{ + seq->count = 0; + seq->alloc = 0; + seq->lines = NULL; +} + +/* Read a line from FP and add it to SEQ. Return true if successful. */ + +static bool +getseq (FILE *fp, struct seq *seq) +{ + if (seq->count == seq->alloc) + seq->lines = X2NREALLOC (seq->lines, &seq->alloc); + + if (get_line (fp, &seq->lines[seq->count])) + { + ++seq->count; + return true; + } + return false; +} + +static void +delseq (struct seq *seq) +{ + size_t i; + for (i = 0; i < seq->count; i++) + if (seq->lines[i].buf.buffer) + freeline (&seq->lines[i]); + free (seq->lines); +} + +/* Return <0 if the join field in LINE1 compares less than the one in LINE2; + >0 if it compares greater; 0 if it compares equal. + Report an error and exit if the comparison fails. */ + +static int +keycmp (struct line const *line1, struct line const *line2) +{ + /* Start of field to compare in each file. */ + char *beg1; + char *beg2; + + size_t len1; + size_t len2; /* Length of fields to compare. */ + int diff; + + if (join_field_1 < line1->nfields) + { + beg1 = line1->fields[join_field_1].beg; + len1 = line1->fields[join_field_1].len; + } + else + { + beg1 = NULL; + len1 = 0; + } + + if (join_field_2 < line2->nfields) + { + beg2 = line2->fields[join_field_2].beg; + len2 = line2->fields[join_field_2].len; + } + else + { + beg2 = NULL; + len2 = 0; + } + + if (len1 == 0) + return len2 == 0 ? 0 : -1; + if (len2 == 0) + return 1; + + if (ignore_case) + { + /* FIXME: ignore_case does not work with NLS (in particular, + with multibyte chars). */ + diff = memcasecmp (beg1, beg2, MIN (len1, len2)); + } + else + { + if (hard_LC_COLLATE) + return xmemcoll (beg1, len1, beg2, len2); + diff = memcmp (beg1, beg2, MIN (len1, len2)); + } + + if (diff) + return diff; + return len1 < len2 ? -1 : len1 != len2; +} + +/* Print field N of LINE if it exists and is nonempty, otherwise + `empty_filler' if it is nonempty. */ + +static void +prfield (size_t n, struct line const *line) +{ + size_t len; + + if (n < line->nfields) + { + len = line->fields[n].len; + if (len) + fwrite (line->fields[n].beg, 1, len, stdout); + else if (empty_filler) + fputs (empty_filler, stdout); + } + else if (empty_filler) + fputs (empty_filler, stdout); +} + +/* Print the join of LINE1 and LINE2. */ + +static void +prjoin (struct line const *line1, struct line const *line2) +{ + const struct outlist *outlist; + char output_separator = tab < 0 ? ' ' : tab; + + outlist = outlist_head.next; + if (outlist) + { + const struct outlist *o; + + o = outlist; + while (1) + { + size_t field; + struct line const *line; + + if (o->file == 0) + { + if (line1 == &uni_blank) + { + line = line2; + field = join_field_2; + } + else + { + line = line1; + field = join_field_1; + } + } + else + { + line = (o->file == 1 ? line1 : line2); + field = o->field; + } + prfield (field, line); + o = o->next; + if (o == NULL) + break; + putchar (output_separator); + } + putchar ('\n'); + } + else + { + size_t i; + + if (line1 == &uni_blank) + { + struct line const *t; + t = line1; + line1 = line2; + line2 = t; + } + prfield (join_field_1, line1); + for (i = 0; i < join_field_1 && i < line1->nfields; ++i) + { + putchar (output_separator); + prfield (i, line1); + } + for (i = join_field_1 + 1; i < line1->nfields; ++i) + { + putchar (output_separator); + prfield (i, line1); + } + + for (i = 0; i < join_field_2 && i < line2->nfields; ++i) + { + putchar (output_separator); + prfield (i, line2); + } + for (i = join_field_2 + 1; i < line2->nfields; ++i) + { + putchar (output_separator); + prfield (i, line2); + } + putchar ('\n'); + } +} + +/* Print the join of the files in FP1 and FP2. */ + +static void +join (FILE *fp1, FILE *fp2) +{ + struct seq seq1, seq2; + struct line line; + int diff; + bool eof1, eof2; + + /* Read the first line of each file. */ + initseq (&seq1); + getseq (fp1, &seq1); + initseq (&seq2); + getseq (fp2, &seq2); + + while (seq1.count && seq2.count) + { + size_t i; + diff = keycmp (&seq1.lines[0], &seq2.lines[0]); + if (diff < 0) + { + if (print_unpairables_1) + prjoin (&seq1.lines[0], &uni_blank); + freeline (&seq1.lines[0]); + seq1.count = 0; + getseq (fp1, &seq1); + continue; + } + if (diff > 0) + { + if (print_unpairables_2) + prjoin (&uni_blank, &seq2.lines[0]); + freeline (&seq2.lines[0]); + seq2.count = 0; + getseq (fp2, &seq2); + continue; + } + + /* Keep reading lines from file1 as long as they continue to + match the current line from file2. */ + eof1 = false; + do + if (!getseq (fp1, &seq1)) + { + eof1 = true; + ++seq1.count; + break; + } + while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0])); + + /* Keep reading lines from file2 as long as they continue to + match the current line from file1. */ + eof2 = false; + do + if (!getseq (fp2, &seq2)) + { + eof2 = true; + ++seq2.count; + break; + } + while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1])); + + if (print_pairables) + { + for (i = 0; i < seq1.count - 1; ++i) + { + size_t j; + for (j = 0; j < seq2.count - 1; ++j) + prjoin (&seq1.lines[i], &seq2.lines[j]); + } + } + + for (i = 0; i < seq1.count - 1; ++i) + freeline (&seq1.lines[i]); + if (!eof1) + { + seq1.lines[0] = seq1.lines[seq1.count - 1]; + seq1.count = 1; + } + else + seq1.count = 0; + + for (i = 0; i < seq2.count - 1; ++i) + freeline (&seq2.lines[i]); + if (!eof2) + { + seq2.lines[0] = seq2.lines[seq2.count - 1]; + seq2.count = 1; + } + else + seq2.count = 0; + } + + if (print_unpairables_1 && seq1.count) + { + prjoin (&seq1.lines[0], &uni_blank); + freeline (&seq1.lines[0]); + while (get_line (fp1, &line)) + { + prjoin (&line, &uni_blank); + freeline (&line); + } + } + + if (print_unpairables_2 && seq2.count) + { + prjoin (&uni_blank, &seq2.lines[0]); + freeline (&seq2.lines[0]); + while (get_line (fp2, &line)) + { + prjoin (&uni_blank, &line); + freeline (&line); + } + } + + delseq (&seq1); + delseq (&seq2); +} + +/* Add a field spec for field FIELD of file FILE to `outlist'. */ + +static void +add_field (int file, size_t field) +{ + struct outlist *o; + + assert (file == 0 || file == 1 || file == 2); + assert (file != 0 || field == 0); + + o = xmalloc (sizeof *o); + o->file = file; + o->field = field; + o->next = NULL; + + /* Add to the end of the list so the fields are in the right order. */ + outlist_end->next = o; + outlist_end = o; +} + +/* Convert a string of decimal digits, STR (the 1-based join field number), + to an integral value. Upon successful conversion, return one less + (the zero-based field number). Silently convert too-large values + to SIZE_MAX - 1. Otherwise, if a value cannot be converted, give a + diagnostic and exit. */ + +static size_t +string_to_join_field (char const *str) +{ + size_t result; + unsigned long int val; + verify (SIZE_MAX <= ULONG_MAX); + + strtol_error s_err = xstrtoul (str, NULL, 10, &val, ""); + if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val)) + val = SIZE_MAX; + else if (s_err != LONGINT_OK || val == 0) + error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str)); + + result = val - 1; + + return result; +} + +/* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX + pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based. + If S is valid, return true. Otherwise, give a diagnostic and exit. */ + +static void +decode_field_spec (const char *s, int *file_index, size_t *field_index) +{ + /* The first character must be 0, 1, or 2. */ + switch (s[0]) + { + case '0': + if (s[1]) + { + /* `0' must be all alone -- no `.FIELD'. */ + error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s)); + } + *file_index = 0; + *field_index = 0; + break; + + case '1': + case '2': + if (s[1] != '.') + error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s)); + *file_index = s[0] - '0'; + *field_index = string_to_join_field (s + 2); + break; + + default: + error (EXIT_FAILURE, 0, + _("invalid file number in field spec: %s"), quote (s)); + + /* Tell gcc -W -Wall that we can't get beyond this point. + This avoids a warning (otherwise legit) that the caller's copies + of *file_index and *field_index might be used uninitialized. */ + abort (); + + break; + } +} + +/* Add the comma or blank separated field spec(s) in STR to `outlist'. */ + +static void +add_field_list (char *str) +{ + char *p = str; + + do + { + int file_index; + size_t field_index; + char const *spec_item = p; + + p = strpbrk (p, ", \t"); + if (p) + *p++ = '\0'; + decode_field_spec (spec_item, &file_index, &field_index); + add_field (file_index, field_index); + } + while (p); +} + +/* Set the join field *VAR to VAL, but report an error if *VAR is set + more than once to incompatible values. */ + +static void +set_join_field (size_t *var, size_t val) +{ + if (*var != SIZE_MAX && *var != val) + { + unsigned long int var1 = *var + 1; + unsigned long int val1 = val + 1; + error (EXIT_FAILURE, 0, _("incompatible join fields %lu, %lu"), + var1, val1); + } + *var = val; +} + +/* Status of command-line arguments. */ + +enum operand_status + { + /* This argument must be an operand, i.e., one of the files to be + joined. */ + MUST_BE_OPERAND, + + /* This might be the argument of the preceding -j1 or -j2 option, + or it might be an operand. */ + MIGHT_BE_J1_ARG, + MIGHT_BE_J2_ARG, + + /* This might be the argument of the preceding -o option, or it might be + an operand. */ + MIGHT_BE_O_ARG + }; + +/* Add NAME to the array of input file NAMES with operand statuses + OPERAND_STATUS; currently there are NFILES names in the list. */ + +static void +add_file_name (char *name, char *names[2], + int operand_status[2], int joption_count[2], int *nfiles, + int *prev_optc_status, int *optc_status) +{ + int n = *nfiles; + + if (n == 2) + { + bool op0 = (operand_status[0] == MUST_BE_OPERAND); + char *arg = names[op0]; + switch (operand_status[op0]) + { + case MUST_BE_OPERAND: + error (0, 0, _("extra operand %s"), quote (name)); + usage (EXIT_FAILURE); + + case MIGHT_BE_J1_ARG: + joption_count[0]--; + set_join_field (&join_field_1, string_to_join_field (arg)); + break; + + case MIGHT_BE_J2_ARG: + joption_count[1]--; + set_join_field (&join_field_2, string_to_join_field (arg)); + break; + + case MIGHT_BE_O_ARG: + add_field_list (arg); + break; + } + if (!op0) + { + operand_status[0] = operand_status[1]; + names[0] = names[1]; + } + n = 1; + } + + operand_status[n] = *prev_optc_status; + names[n] = name; + *nfiles = n + 1; + if (*prev_optc_status == MIGHT_BE_O_ARG) + *optc_status = MIGHT_BE_O_ARG; +} + +int +main (int argc, char **argv) +{ + int optc_status; + int prev_optc_status = MUST_BE_OPERAND; + int operand_status[2]; + int joption_count[2] = { 0, 0 }; + char *names[2]; + FILE *fp1, *fp2; + int optc; + int nfiles = 0; + int i; + + initialize_main (&argc, &argv); + program_name = argv[0]; + setlocale (LC_ALL, ""); + bindtextdomain (PACKAGE, LOCALEDIR); + textdomain (PACKAGE); + hard_LC_COLLATE = hard_locale (LC_COLLATE); + + atexit (close_stdout); + + print_pairables = true; + + while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", + longopts, NULL)) + != -1) + { + optc_status = MUST_BE_OPERAND; + + switch (optc) + { + case 'v': + print_pairables = false; + /* Fall through. */ + + case 'a': + { + unsigned long int val; + if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK + || (val != 1 && val != 2)) + error (EXIT_FAILURE, 0, + _("invalid field number: %s"), quote (optarg)); + if (val == 1) + print_unpairables_1 = true; + else + print_unpairables_2 = true; + } + break; + + case 'e': + if (empty_filler && ! STREQ (empty_filler, optarg)) + error (EXIT_FAILURE, 0, + _("conflicting empty-field replacement strings")); + empty_filler = optarg; + break; + + case 'i': + ignore_case = true; + break; + + case '1': + set_join_field (&join_field_1, string_to_join_field (optarg)); + break; + + case '2': + set_join_field (&join_field_2, string_to_join_field (optarg)); + break; + + case 'j': + if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1] + && optarg == argv[optind - 1] + 2) + { + /* The argument was either "-j1" or "-j2". */ + bool is_j2 = (optarg[0] == '2'); + joption_count[is_j2]++; + optc_status = MIGHT_BE_J1_ARG + is_j2; + } + else + { + set_join_field (&join_field_1, string_to_join_field (optarg)); + set_join_field (&join_field_2, join_field_1); + } + break; + + case 'o': + add_field_list (optarg); + optc_status = MIGHT_BE_O_ARG; + break; + + case 't': + { + unsigned char newtab = optarg[0]; + if (! newtab) + error (EXIT_FAILURE, 0, _("empty tab")); + if (optarg[1]) + { + if (STREQ (optarg, "\\0")) + newtab = '\0'; + else + error (EXIT_FAILURE, 0, _("multi-character tab %s"), + quote (optarg)); + } + if (0 <= tab && tab != newtab) + error (EXIT_FAILURE, 0, _("incompatible tabs")); + tab = newtab; + } + break; + + case 1: /* Non-option argument. */ + add_file_name (optarg, names, operand_status, joption_count, + &nfiles, &prev_optc_status, &optc_status); + break; + + case_GETOPT_HELP_CHAR; + + case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); + + default: + usage (EXIT_FAILURE); + } + + prev_optc_status = optc_status; + } + + /* Process any operands after "--". */ + prev_optc_status = MUST_BE_OPERAND; + while (optind < argc) + add_file_name (argv[optind++], names, operand_status, joption_count, + &nfiles, &prev_optc_status, &optc_status); + + if (nfiles != 2) + { + if (nfiles == 0) + error (0, 0, _("missing operand")); + else + error (0, 0, _("missing operand after %s"), quote (argv[argc - 1])); + usage (EXIT_FAILURE); + } + + /* If "-j1" was specified and it turns out not to have had an argument, + treat it as "-j 1". Likewise for -j2. */ + for (i = 0; i < 2; i++) + if (joption_count[i] != 0) + { + set_join_field (&join_field_1, i); + set_join_field (&join_field_2, i); + } + + if (join_field_1 == SIZE_MAX) + join_field_1 = 0; + if (join_field_2 == SIZE_MAX) + join_field_2 = 0; + + fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r"); + if (!fp1) + error (EXIT_FAILURE, errno, "%s", names[0]); + fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r"); + if (!fp2) + error (EXIT_FAILURE, errno, "%s", names[1]); + if (fp1 == fp2) + error (EXIT_FAILURE, errno, _("both files cannot be standard input")); + join (fp1, fp2); + + if (fclose (fp1) != 0) + error (EXIT_FAILURE, errno, "%s", names[0]); + if (fclose (fp2) != 0) + error (EXIT_FAILURE, errno, "%s", names[1]); + + exit (EXIT_SUCCESS); +} |