diff options
Diffstat (limited to 'src/split.c')
-rw-r--r-- | src/split.c | 1640 |
1 files changed, 1335 insertions, 305 deletions
diff --git a/src/split.c b/src/split.c index 1f0f3d7..510af13 100644 --- a/src/split.c +++ b/src/split.c @@ -1,10 +1,10 @@ /* split.c -- split a file into pieces. - Copyright (C) 1988, 1991, 1995-2006 Free Software Foundation, Inc. + Copyright (C) 1988-2016 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or modify + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -12,42 +12,56 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ - -/* By tege@sics.se, with rms. + along with this program. If not, see <http://www.gnu.org/licenses/>. */ - To do: - * Implement -t CHAR or -t REGEX to specify break characters other - than newline. */ +/* By tege@sics.se, with rms. + TODO: + * support -p REGEX as in BSD's split. + * support --suppress-matched as in csplit. */ #include <config.h> +#include <assert.h> #include <stdio.h> #include <getopt.h> +#include <signal.h> #include <sys/types.h> +#include <sys/wait.h> #include "system.h" #include "error.h" #include "fd-reopen.h" #include "fcntl--.h" -#include "getpagesize.h" -#include "full-read.h" #include "full-write.h" -#include "inttostr.h" +#include "ioblksize.h" #include "quote.h" #include "safe-read.h" +#include "sig2str.h" +#include "xfreopen.h" +#include "xdectoint.h" #include "xstrtol.h" -/* The official name of this program (e.g., no `g' prefix). */ +/* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "split" -#define AUTHORS "Torbjorn Granlund", "Richard M. Stallman" +#define AUTHORS \ + proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \ + proper_name ("Richard M. Stallman") -#define DEFAULT_SUFFIX_LENGTH 2 +/* Shell command to filter through, instead of creating files. */ +static char const *filter_command; -/* The name this program was run with. */ -char *program_name; +/* Process ID of the filter. */ +static int filter_pid; + +/* Array of open pipes. */ +static int *open_pipes; +static size_t open_pipes_alloc; +static size_t n_open_pipes; + +/* Blocked signals. */ +static sigset_t oldblocked; +static sigset_t newblocked; /* Base name of output files. */ static char const *outbase; @@ -59,27 +73,59 @@ static char *outfile; Suffixes are inserted here. */ static char *outfile_mid; +/* Generate new suffix when suffixes are exhausted. */ +static bool suffix_auto = true; + /* Length of OUTFILE's suffix. */ -static size_t suffix_length = DEFAULT_SUFFIX_LENGTH; +static size_t suffix_length; /* Alphabet of characters to use in suffix. */ static char const *suffix_alphabet = "abcdefghijklmnopqrstuvwxyz"; +/* Numerical suffix start value. */ +static const char *numeric_suffix_start; + +/* Additional suffix to append to output file names. */ +static char const *additional_suffix; + /* Name of input file. May be "-". */ static char *infile; +/* stat buf for input file. */ +static struct stat in_stat_buf; + /* Descriptor on which output file is open. */ -static int output_desc; +static int output_desc = -1; /* If true, print a diagnostic on standard error just before each output file is opened. */ static bool verbose; +/* If true, don't generate zero length output files. */ +static bool elide_empty_files; + +/* If true, in round robin mode, immediately copy + input to output, which is much slower, so disabled by default. */ +static bool unbuffered; + +/* The character marking end of line. Defaults to \n below. */ +static int eolchar = -1; + +/* The split mode to use. */ +enum Split_type +{ + type_undef, type_bytes, type_byteslines, type_lines, type_digits, + type_chunk_bytes, type_chunk_lines, type_rr +}; + /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - VERBOSE_OPTION = CHAR_MAX + 1 + VERBOSE_OPTION = CHAR_MAX + 1, + FILTER_OPTION, + IO_BLKSIZE_OPTION, + ADDITIONAL_SUFFIX_OPTION }; static struct option const longopts[] = @@ -87,92 +133,259 @@ static struct option const longopts[] = {"bytes", required_argument, NULL, 'b'}, {"lines", required_argument, NULL, 'l'}, {"line-bytes", required_argument, NULL, 'C'}, + {"number", required_argument, NULL, 'n'}, + {"elide-empty-files", no_argument, NULL, 'e'}, + {"unbuffered", no_argument, NULL, 'u'}, {"suffix-length", required_argument, NULL, 'a'}, - {"numeric-suffixes", no_argument, NULL, 'd'}, + {"additional-suffix", required_argument, NULL, + ADDITIONAL_SUFFIX_OPTION}, + {"numeric-suffixes", optional_argument, NULL, 'd'}, + {"filter", required_argument, NULL, FILTER_OPTION}, {"verbose", no_argument, NULL, VERBOSE_OPTION}, + {"separator", required_argument, NULL, 't'}, + {"-io-blksize", required_argument, NULL, + IO_BLKSIZE_OPTION}, /* do not document */ {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} }; +/* Return true if the errno value, ERR, is ignorable. */ +static inline bool +ignorable (int err) +{ + return filter_command && err == EPIPE; +} + +static void +set_suffix_length (uintmax_t n_units, enum Split_type split_type) +{ +#define DEFAULT_SUFFIX_LENGTH 2 + + uintmax_t suffix_needed = 0; + + /* The suffix auto length feature is incompatible with + a user specified start value as the generated suffixes + are not all consecutive. */ + if (numeric_suffix_start) + suffix_auto = false; + + /* Auto-calculate the suffix length if the number of files is given. */ + if (split_type == type_chunk_bytes || split_type == type_chunk_lines + || split_type == type_rr) + { + uintmax_t n_units_end = n_units; + if (numeric_suffix_start) + { + uintmax_t n_start; + strtol_error e = xstrtoumax (numeric_suffix_start, NULL, 10, + &n_start, ""); + if (e == LONGINT_OK && n_start <= UINTMAX_MAX - n_units) + { + /* Restrict auto adjustment so we don't keep + incrementing a suffix size arbitrarily, + as that would break sort order for files + generated from multiple split runs. */ + if (n_start < n_units) + n_units_end += n_start; + } + + } + size_t alphabet_len = strlen (suffix_alphabet); + bool alphabet_slop = (n_units_end % alphabet_len) != 0; + while (n_units_end /= alphabet_len) + suffix_needed++; + suffix_needed += alphabet_slop; + suffix_auto = false; + } + + if (suffix_length) /* set by user */ + { + if (suffix_length < suffix_needed) + { + error (EXIT_FAILURE, 0, + _("the suffix length needs to be at least %"PRIuMAX), + suffix_needed); + } + suffix_auto = false; + return; + } + else + suffix_length = MAX (DEFAULT_SUFFIX_LENGTH, suffix_needed); +} + void usage (int status) { if (status != EXIT_SUCCESS) - fprintf (stderr, _("Try `%s --help' for more information.\n"), - program_name); + emit_try_help (); else { printf (_("\ -Usage: %s [OPTION] [INPUT [PREFIX]]\n\ +Usage: %s [OPTION]... [FILE [PREFIX]]\n\ "), - program_name); - fputs (_("\ -Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default\n\ -size is 1000 lines, and default PREFIX is `x'. With no INPUT, or when INPUT\n\ -is -, read standard input.\n\ -\n\ -"), stdout); + program_name); fputs (_("\ -Mandatory arguments to long options are mandatory for short options too.\n\ +Output pieces of FILE to PREFIXaa, PREFIXab, ...;\n\ +default size is 1000 lines, and default PREFIX is 'x'.\n\ "), stdout); + + emit_stdin_note (); + emit_mandatory_arg_note (); + fprintf (stdout, _("\ - -a, --suffix-length=N use suffixes of length N (default %d)\n\ + -a, --suffix-length=N generate suffixes of length N (default %d)\n\ + --additional-suffix=SUFFIX append an additional SUFFIX to file names\n\ -b, --bytes=SIZE put SIZE bytes per output file\n\ - -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\ - -d, --numeric-suffixes use numeric suffixes instead of alphabetic\n\ - -l, --lines=NUMBER put NUMBER lines per output file\n\ + -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\ + -d use numeric suffixes starting at 0, not alphabetic\n\ + --numeric-suffixes[=FROM] same as -d, but allow setting the start value\ +\n\ + -e, --elide-empty-files do not generate empty output files with '-n'\n\ + --filter=COMMAND write to shell COMMAND; file name is $FILE\n\ + -l, --lines=NUMBER put NUMBER lines/records per output file\n\ + -n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\ + -t, --separator=SEP use SEP instead of newline as the record separator;\n\ + '\\0' (zero) specifies the NUL character\n\ + -u, --unbuffered immediately copy input to output with '-n r/...'\n\ "), DEFAULT_SUFFIX_LENGTH); fputs (_("\ - --verbose print a diagnostic to standard error just\n\ - before each output file is opened\n\ + --verbose print a diagnostic just before each\n\ + output file is opened\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); - fputs (_("\ -\n\ -SIZE may have a multiplier suffix: b for 512, k for 1K, m for 1 Meg.\n\ + emit_size_note (); + fputs (_("\n\ +CHUNKS may be:\n\ + N split into N files based on size of input\n\ + K/N output Kth of N to stdout\n\ + l/N split into N files without splitting lines/records\n\ + l/K/N output Kth of N to stdout without splitting lines/records\n\ + r/N like 'l' but use round robin distribution\n\ + r/K/N likewise but only output Kth of N to stdout\n\ "), stdout); - printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); + emit_ancillary_info (PROGRAM_NAME); } exit (status); } +/* Return the number of bytes that can be read from FD, a file with + apparent size SIZE. Actually read the data into BUF (of size + BUFSIZE) if the file appears to be smaller than BUFSIZE, as this + works better on proc-like file systems. If the returned value is + less than BUFSIZE, store all the file's data into BUF; otherwise, + restore the input file's position so that the file can be reread if + needed. */ + +static off_t +input_file_size (int fd, off_t size, char *buf, size_t bufsize) +{ + if (size < bufsize) + { + size = 0; + while (true) + { + size_t save = size < bufsize ? size : 0; + size_t n_read = safe_read (fd, buf + save, bufsize - save); + if (n_read == 0) + break; + if (n_read == SAFE_READ_ERROR) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + size += n_read; + } + if (bufsize <= size && lseek (fd, - size, SEEK_CUR) < 0) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + } + + return size; +} + /* Compute the next sequential output file name and store it into the - string `outfile'. */ + string 'outfile'. */ static void next_file_name (void) { /* Index in suffix_alphabet of each character in the suffix. */ static size_t *sufindex; + static size_t outbase_length; + static size_t outfile_length; + static size_t addsuf_length; if (! outfile) { - /* Allocate and initialize the first file name. */ + bool widen; + +new_name: + widen = !! outfile_length; + + if (! widen) + { + /* Allocate and initialize the first file name. */ + + outbase_length = strlen (outbase); + addsuf_length = additional_suffix ? strlen (additional_suffix) : 0; + outfile_length = outbase_length + suffix_length + addsuf_length; + } + else + { + /* Reallocate and initialize a new wider file name. + We do this by subsuming the unchanging part of + the generated suffix into the prefix (base), and + reinitializing the now one longer suffix. */ + + outfile_length += 2; + suffix_length++; + } - size_t outbase_length = strlen (outbase); - size_t outfile_length = outbase_length + suffix_length; if (outfile_length + 1 < outbase_length) - xalloc_die (); - outfile = xmalloc (outfile_length + 1); + xalloc_die (); + outfile = xrealloc (outfile, outfile_length + 1); + + if (! widen) + memcpy (outfile, outbase, outbase_length); + else + { + /* Append the last alphabet character to the file name prefix. */ + outfile[outbase_length] = suffix_alphabet[sufindex[0]]; + outbase_length++; + } + outfile_mid = outfile + outbase_length; - memcpy (outfile, outbase, outbase_length); memset (outfile_mid, suffix_alphabet[0], suffix_length); + if (additional_suffix) + memcpy (outfile_mid + suffix_length, additional_suffix, addsuf_length); outfile[outfile_length] = 0; + + free (sufindex); sufindex = xcalloc (suffix_length, sizeof *sufindex); + if (numeric_suffix_start) + { + assert (! widen); + + /* Update the output file name. */ + size_t i = strlen (numeric_suffix_start); + memcpy (outfile_mid + suffix_length - i, numeric_suffix_start, i); + + /* Update the suffix index. */ + size_t *sufindex_end = sufindex + suffix_length; + while (i-- != 0) + *--sufindex_end = numeric_suffix_start[i] - '0'; + } + #if ! _POSIX_NO_TRUNC && HAVE_PATHCONF && defined _PC_NAME_MAX /* POSIX requires that if the output file name is too long for - its directory, `split' must fail without creating any files. - This must be checked for explicitly on operating systems that - silently truncate file names. */ + its directory, 'split' must fail without creating any files. + This must be checked for explicitly on operating systems that + silently truncate file names. */ { - char *dir = dir_name (outfile); - long name_max = pathconf (dir, _PC_NAME_MAX); - if (0 <= name_max && name_max < base_len (last_component (outfile))) - error (EXIT_FAILURE, ENAMETOOLONG, "%s", outfile); - free (dir); + char *dir = dir_name (outfile); + long name_max = pathconf (dir, _PC_NAME_MAX); + if (0 <= name_max && name_max < base_len (last_component (outfile))) + error (EXIT_FAILURE, ENAMETOOLONG, "%s", quotef (outfile)); + free (dir); } #endif } @@ -182,15 +395,153 @@ next_file_name (void) size_t i = suffix_length; while (i-- != 0) - { - sufindex[i]++; - outfile_mid[i] = suffix_alphabet[sufindex[i]]; - if (outfile_mid[i]) - return; - sufindex[i] = 0; - outfile_mid[i] = suffix_alphabet[sufindex[i]]; - } - error (EXIT_FAILURE, 0, _("Output file suffixes exhausted")); + { + sufindex[i]++; + if (suffix_auto && i == 0 && ! suffix_alphabet[sufindex[0] + 1]) + goto new_name; + outfile_mid[i] = suffix_alphabet[sufindex[i]]; + if (outfile_mid[i]) + return; + sufindex[i] = 0; + outfile_mid[i] = suffix_alphabet[sufindex[i]]; + } + error (EXIT_FAILURE, 0, _("output file suffixes exhausted")); + } +} + +/* Create or truncate a file. */ + +static int +create (const char *name) +{ + if (!filter_command) + { + if (verbose) + fprintf (stdout, _("creating file %s\n"), quoteaf (name)); + + int fd = open (name, O_WRONLY | O_CREAT | O_BINARY, MODE_RW_UGO); + if (fd < 0) + return fd; + struct stat out_stat_buf; + if (fstat (fd, &out_stat_buf) != 0) + error (EXIT_FAILURE, errno, _("failed to stat %s"), quoteaf (name)); + if (SAME_INODE (in_stat_buf, out_stat_buf)) + error (EXIT_FAILURE, 0, _("%s would overwrite input; aborting"), + quoteaf (name)); + if (ftruncate (fd, 0) != 0) + error (EXIT_FAILURE, errno, _("%s: error truncating"), quotef (name)); + + return fd; + } + else + { + int fd_pair[2]; + pid_t child_pid; + char const *shell_prog = getenv ("SHELL"); + if (shell_prog == NULL) + shell_prog = "/bin/sh"; + if (setenv ("FILE", name, 1) != 0) + error (EXIT_FAILURE, errno, + _("failed to set FILE environment variable")); + if (verbose) + fprintf (stdout, _("executing with FILE=%s\n"), quotef (name)); + if (pipe (fd_pair) != 0) + error (EXIT_FAILURE, errno, _("failed to create pipe")); + child_pid = fork (); + if (child_pid == 0) + { + /* This is the child process. If an error occurs here, the + parent will eventually learn about it after doing a wait, + at which time it will emit its own error message. */ + int j; + /* We have to close any pipes that were opened during an + earlier call, otherwise this process will be holding a + write-pipe that will prevent the earlier process from + reading an EOF on the corresponding read-pipe. */ + for (j = 0; j < n_open_pipes; ++j) + if (close (open_pipes[j]) != 0) + error (EXIT_FAILURE, errno, _("closing prior pipe")); + if (close (fd_pair[1])) + error (EXIT_FAILURE, errno, _("closing output pipe")); + if (fd_pair[0] != STDIN_FILENO) + { + if (dup2 (fd_pair[0], STDIN_FILENO) != STDIN_FILENO) + error (EXIT_FAILURE, errno, _("moving input pipe")); + if (close (fd_pair[0]) != 0) + error (EXIT_FAILURE, errno, _("closing input pipe")); + } + sigprocmask (SIG_SETMASK, &oldblocked, NULL); + execl (shell_prog, last_component (shell_prog), "-c", + filter_command, (char *) NULL); + error (EXIT_FAILURE, errno, _("failed to run command: \"%s -c %s\""), + shell_prog, filter_command); + } + if (child_pid == -1) + error (EXIT_FAILURE, errno, _("fork system call failed")); + if (close (fd_pair[0]) != 0) + error (EXIT_FAILURE, errno, _("failed to close input pipe")); + filter_pid = child_pid; + if (n_open_pipes == open_pipes_alloc) + open_pipes = x2nrealloc (open_pipes, &open_pipes_alloc, + sizeof *open_pipes); + open_pipes[n_open_pipes++] = fd_pair[1]; + return fd_pair[1]; + } +} + +/* Close the output file, and do any associated cleanup. + If FP and FD are both specified, they refer to the same open file; + in this case FP is closed, but FD is still used in cleanup. */ +static void +closeout (FILE *fp, int fd, pid_t pid, char const *name) +{ + if (fp != NULL && fclose (fp) != 0 && ! ignorable (errno)) + error (EXIT_FAILURE, errno, "%s", quotef (name)); + if (fd >= 0) + { + if (fp == NULL && close (fd) < 0) + error (EXIT_FAILURE, errno, "%s", quotef (name)); + int j; + for (j = 0; j < n_open_pipes; ++j) + { + if (open_pipes[j] == fd) + { + open_pipes[j] = open_pipes[--n_open_pipes]; + break; + } + } + } + if (pid > 0) + { + int wstatus = 0; + if (waitpid (pid, &wstatus, 0) == -1 && errno != ECHILD) + error (EXIT_FAILURE, errno, _("waiting for child process")); + if (WIFSIGNALED (wstatus)) + { + int sig = WTERMSIG (wstatus); + if (sig != SIGPIPE) + { + char signame[MAX (SIG2STR_MAX, INT_BUFSIZE_BOUND (int))]; + if (sig2str (sig, signame) != 0) + sprintf (signame, "%d", sig); + error (sig + 128, 0, + _("with FILE=%s, signal %s from command: %s"), + quotef (name), signame, filter_command); + } + } + else if (WIFEXITED (wstatus)) + { + int ex = WEXITSTATUS (wstatus); + if (ex != 0) + error (ex, 0, _("with FILE=%s, exit %d from command: %s"), + quotef (name), ex, filter_command); + } + else + { + /* shouldn't happen. */ + error (EXIT_FAILURE, 0, + _("unknown status from command (0x%X)"), wstatus + 0u); + } } } @@ -203,66 +554,86 @@ cwrite (bool new_file_flag, const char *bp, size_t bytes) { if (new_file_flag) { - if (output_desc >= 0 && close (output_desc) < 0) - error (EXIT_FAILURE, errno, "%s", outfile); - + if (!bp && bytes == 0 && elide_empty_files) + return; + closeout (NULL, output_desc, filter_pid, outfile); next_file_name (); - if (verbose) - fprintf (stderr, _("creating file %s\n"), quote (outfile)); - output_desc = open (outfile, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, - (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP - | S_IROTH | S_IWOTH)); - if (output_desc < 0) - error (EXIT_FAILURE, errno, "%s", outfile); - } - if (full_write (output_desc, bp, bytes) != bytes) - error (EXIT_FAILURE, errno, "%s", outfile); + if ((output_desc = create (outfile)) < 0) + error (EXIT_FAILURE, errno, "%s", quotef (outfile)); + } + if (full_write (output_desc, bp, bytes) != bytes && ! ignorable (errno)) + error (EXIT_FAILURE, errno, "%s", quotef (outfile)); } /* Split into pieces of exactly N_BYTES bytes. - Use buffer BUF, whose size is BUFSIZE. */ + Use buffer BUF, whose size is BUFSIZE. + If INITIAL_READ != SIZE_MAX, the entire input file has already been + partly read into BUF and BUF contains INITIAL_READ input bytes. */ static void -bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) +bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read, + uintmax_t max_files) { size_t n_read; bool new_file_flag = true; size_t to_read; uintmax_t to_write = n_bytes; char *bp_out; + uintmax_t opened = 0; do { - n_read = full_read (STDIN_FILENO, buf, bufsize); - if (n_read == SAFE_READ_ERROR) - error (EXIT_FAILURE, errno, "%s", infile); + if (initial_read != SIZE_MAX) + { + n_read = initial_read; + initial_read = SIZE_MAX; + } + else + { + n_read = safe_read (STDIN_FILENO, buf, bufsize); + if (n_read == SAFE_READ_ERROR) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + } bp_out = buf; to_read = n_read; - for (;;) - { - if (to_read < to_write) - { - if (to_read) /* do not write 0 bytes! */ - { - cwrite (new_file_flag, bp_out, to_read); - to_write -= to_read; - new_file_flag = false; - } - break; - } - else - { - size_t w = to_write; - cwrite (new_file_flag, bp_out, w); - bp_out += w; - to_read -= w; - new_file_flag = true; - to_write = n_bytes; - } - } - } - while (n_read == bufsize); + while (true) + { + if (to_read < to_write) + { + if (to_read) /* do not write 0 bytes! */ + { + cwrite (new_file_flag, bp_out, to_read); + opened += new_file_flag; + to_write -= to_read; + new_file_flag = false; + } + break; + } + else + { + size_t w = to_write; + cwrite (new_file_flag, bp_out, w); + opened += new_file_flag; + new_file_flag = !max_files || (opened < max_files); + if (!new_file_flag && ignorable (errno)) + { + /* If filter no longer accepting input, stop reading. */ + n_read = 0; + break; + } + bp_out += w; + to_read -= w; + to_write = n_bytes; + } + } + } + while (n_read); + + /* Ensure NUMBER files are created, which truncates + any existing files or notifies any consumers on fifos. + FIXME: Should we do this before EXIT_FAILURE? */ + while (opened++ < max_files) + cwrite (true, NULL, 0); } /* Split into pieces of exactly N_LINES lines. @@ -278,90 +649,555 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) do { - n_read = full_read (STDIN_FILENO, buf, bufsize); + n_read = safe_read (STDIN_FILENO, buf, bufsize); if (n_read == SAFE_READ_ERROR) - error (EXIT_FAILURE, errno, "%s", infile); + error (EXIT_FAILURE, errno, "%s", quotef (infile)); bp = bp_out = buf; eob = bp + n_read; - *eob = '\n'; - for (;;) - { - bp = memchr (bp, '\n', eob - bp + 1); - if (bp == eob) - { - if (eob != bp_out) /* do not write 0 bytes! */ - { - size_t len = eob - bp_out; - cwrite (new_file_flag, bp_out, len); - new_file_flag = false; - } - break; - } - - ++bp; - if (++n >= n_lines) - { - cwrite (new_file_flag, bp_out, bp - bp_out); - bp_out = bp; - new_file_flag = true; - n = 0; - } - } - } - while (n_read == bufsize); + *eob = eolchar; + while (true) + { + bp = memchr (bp, eolchar, eob - bp + 1); + if (bp == eob) + { + if (eob != bp_out) /* do not write 0 bytes! */ + { + size_t len = eob - bp_out; + cwrite (new_file_flag, bp_out, len); + new_file_flag = false; + } + break; + } + + ++bp; + if (++n >= n_lines) + { + cwrite (new_file_flag, bp_out, bp - bp_out); + bp_out = bp; + new_file_flag = true; + n = 0; + } + } + } + while (n_read); } - + /* Split into pieces that are as large as possible while still not more than N_BYTES bytes, and are split on line boundaries except - where lines longer than N_BYTES bytes occur. - FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a - buffer of size N_BYTES, in case N_BYTES is very large. */ + where lines longer than N_BYTES bytes occur. */ static void -line_bytes_split (size_t n_bytes) +line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) { size_t n_read; - char *bp; - bool eof = false; - size_t n_buffered = 0; - char *buf = xmalloc (n_bytes); + uintmax_t n_out = 0; /* for each split. */ + size_t n_hold = 0; + char *hold = NULL; /* for lines > bufsize. */ + size_t hold_size = 0; + bool split_line = false; /* Whether a \n was output in a split. */ do { - /* Fill up the full buffer size from the input file. */ + n_read = safe_read (STDIN_FILENO, buf, bufsize); + if (n_read == SAFE_READ_ERROR) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + size_t n_left = n_read; + char *sob = buf; + while (n_left) + { + size_t split_rest = 0; + char *eoc = NULL; + char *eol; + + /* Determine End Of Chunk and/or End of Line, + which are used below to select what to write or buffer. */ + if (n_bytes - n_out - n_hold <= n_left) + { + /* Have enough for split. */ + split_rest = n_bytes - n_out - n_hold; + eoc = sob + split_rest - 1; + eol = memrchr (sob, eolchar, split_rest); + } + else + eol = memrchr (sob, eolchar, n_left); + + /* Output hold space if possible. */ + if (n_hold && !(!eol && n_out)) + { + cwrite (n_out == 0, hold, n_hold); + n_out += n_hold; + if (n_hold > bufsize) + hold = xrealloc (hold, bufsize); + n_hold = 0; + hold_size = bufsize; + } + + /* Output to eol if present. */ + if (eol) + { + split_line = true; + size_t n_write = eol - sob + 1; + cwrite (n_out == 0, sob, n_write); + n_out += n_write; + n_left -= n_write; + sob += n_write; + if (eoc) + split_rest -= n_write; + } + + /* Output to eoc or eob if possible. */ + if (n_left && !split_line) + { + size_t n_write = eoc ? split_rest : n_left; + cwrite (n_out == 0, sob, n_write); + n_out += n_write; + n_left -= n_write; + sob += n_write; + if (eoc) + split_rest -= n_write; + } + + /* Update hold if needed. */ + if ((eoc && split_rest) || (!eoc && n_left)) + { + size_t n_buf = eoc ? split_rest : n_left; + if (hold_size - n_hold < n_buf) + { + if (hold_size <= SIZE_MAX - bufsize) + hold_size += bufsize; + else + xalloc_die (); + hold = xrealloc (hold, hold_size); + } + memcpy (hold + n_hold, sob, n_buf); + n_hold += n_buf; + n_left -= n_buf; + sob += n_buf; + } + + /* Reset for new split. */ + if (eoc) + { + n_out = 0; + split_line = false; + } + } + } + while (n_read); + + /* Handle no eol at end of file. */ + if (n_hold) + cwrite (n_out == 0, hold, n_hold); + + free (hold); +} + +/* -n l/[K/]N: Write lines to files of approximately file size / N. + The file is partitioned into file size / N sized portions, with the + last assigned any excess. If a line _starts_ within a partition + it is written completely to the corresponding file. Since lines + are not split even if they overlap a partition, the files written + can be larger or smaller than the partition size, and even empty + if a line is so long as to completely overlap the partition. */ + +static void +lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize, + size_t initial_read, off_t file_size) +{ + assert (n && k <= n && n <= file_size); + + const off_t chunk_size = file_size / n; + uintmax_t chunk_no = 1; + off_t chunk_end = chunk_size - 1; + off_t n_written = 0; + bool new_file_flag = true; + bool chunk_truncated = false; + + if (k > 1) + { + /* Start reading 1 byte before kth chunk of file. */ + off_t start = (k - 1) * chunk_size - 1; + if (initial_read != SIZE_MAX) + { + memmove (buf, buf + start, initial_read - start); + initial_read -= start; + } + else if (lseek (STDIN_FILENO, start, SEEK_CUR) < 0) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + n_written = start; + chunk_no = k - 1; + chunk_end = chunk_no * chunk_size - 1; + } + + while (n_written < file_size) + { + char *bp = buf, *eob; + size_t n_read; + if (initial_read != SIZE_MAX) + { + n_read = initial_read; + initial_read = SIZE_MAX; + } + else + { + n_read = safe_read (STDIN_FILENO, buf, bufsize); + if (n_read == SAFE_READ_ERROR) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + } + if (n_read == 0) + break; /* eof. */ + n_read = MIN (n_read, file_size - n_written); + chunk_truncated = false; + eob = buf + n_read; + + while (bp != eob) + { + size_t to_write; + bool next = false; + + /* Begin looking for '\n' at last byte of chunk. */ + off_t skip = MIN (n_read, MAX (0, chunk_end - n_written)); + char *bp_out = memchr (bp + skip, eolchar, n_read - skip); + if (bp_out++) + next = true; + else + bp_out = eob; + to_write = bp_out - bp; + + if (k == chunk_no) + { + /* We don't use the stdout buffer here since we're writing + large chunks from an existing file, so it's more efficient + to write out directly. */ + if (full_write (STDOUT_FILENO, bp, to_write) != to_write) + error (EXIT_FAILURE, errno, "%s", _("write error")); + } + else if (! k) + cwrite (new_file_flag, bp, to_write); + n_written += to_write; + bp += to_write; + n_read -= to_write; + new_file_flag = next; + + /* A line could have been so long that it skipped + entire chunks. So create empty files in that case. */ + while (next || chunk_end <= n_written - 1) + { + if (!next && bp == eob) + { + /* replenish buf, before going to next chunk. */ + chunk_truncated = true; + break; + } + chunk_no++; + if (k && chunk_no > k) + return; + if (chunk_no == n) + chunk_end = file_size - 1; /* >= chunk_size. */ + else + chunk_end += chunk_size; + if (chunk_end <= n_written - 1) + { + if (! k) + cwrite (true, NULL, 0); + } + else + next = false; + } + } + } + + if (chunk_truncated) + chunk_no++; + + /* Ensure NUMBER files are created, which truncates + any existing files or notifies any consumers on fifos. + FIXME: Should we do this before EXIT_FAILURE? */ + while (!k && chunk_no++ <= n) + cwrite (true, NULL, 0); +} + +/* -n K/N: Extract Kth of N chunks. */ + +static void +bytes_chunk_extract (uintmax_t k, uintmax_t n, char *buf, size_t bufsize, + size_t initial_read, off_t file_size) +{ + off_t start; + off_t end; + + assert (k && n && k <= n && n <= file_size); + + start = (k - 1) * (file_size / n); + end = (k == n) ? file_size : k * (file_size / n); + + if (initial_read != SIZE_MAX) + { + memmove (buf, buf + start, initial_read - start); + initial_read -= start; + } + else if (lseek (STDIN_FILENO, start, SEEK_CUR) < 0) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + + while (start < end) + { + size_t n_read; + if (initial_read != SIZE_MAX) + { + n_read = initial_read; + initial_read = SIZE_MAX; + } + else + { + n_read = safe_read (STDIN_FILENO, buf, bufsize); + if (n_read == SAFE_READ_ERROR) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + } + if (n_read == 0) + break; /* eof. */ + n_read = MIN (n_read, end - start); + if (full_write (STDOUT_FILENO, buf, n_read) != n_read + && ! ignorable (errno)) + error (EXIT_FAILURE, errno, "%s", quotef ("-")); + start += n_read; + } +} + +typedef struct of_info +{ + char *of_name; + int ofd; + FILE *ofile; + int opid; +} of_t; + +enum +{ + OFD_NEW = -1, + OFD_APPEND = -2 +}; + +/* Rotate file descriptors when we're writing to more output files than we + have available file descriptors. + Return whether we came under file resource pressure. + If so, it's probably best to close each file when finished with it. */ + +static bool +ofile_open (of_t *files, size_t i_check, size_t nfiles) +{ + bool file_limit = false; + + if (files[i_check].ofd <= OFD_NEW) + { + int fd; + size_t i_reopen = i_check ? i_check - 1 : nfiles - 1; + + /* Another process could have opened a file in between the calls to + close and open, so we should keep trying until open succeeds or + we've closed all of our files. */ + while (true) + { + if (files[i_check].ofd == OFD_NEW) + fd = create (files[i_check].of_name); + else /* OFD_APPEND */ + { + /* Attempt to append to previously opened file. + We use O_NONBLOCK to support writing to fifos, + where the other end has closed because of our + previous close. In that case we'll immediately + get an error, rather than waiting indefinitely. + In specialised cases the consumer can keep reading + from the fifo, terminating on conditions in the data + itself, or perhaps never in the case of 'tail -f'. + I.e., for fifos it is valid to attempt this reopen. + + We don't handle the filter_command case here, as create() + will exit if there are not enough files in that case. + I.e., we don't support restarting filters, as that would + put too much burden on users specifying --filter commands. */ + fd = open (files[i_check].of_name, + O_WRONLY | O_BINARY | O_APPEND | O_NONBLOCK); + } + + if (-1 < fd) + break; + + if (!(errno == EMFILE || errno == ENFILE)) + error (EXIT_FAILURE, errno, "%s", quotef (files[i_check].of_name)); + + file_limit = true; + + /* Search backwards for an open file to close. */ + while (files[i_reopen].ofd < 0) + { + i_reopen = i_reopen ? i_reopen - 1 : nfiles - 1; + /* No more open files to close, exit with E[NM]FILE. */ + if (i_reopen == i_check) + error (EXIT_FAILURE, errno, "%s", + quotef (files[i_check].of_name)); + } + + if (fclose (files[i_reopen].ofile) != 0) + error (EXIT_FAILURE, errno, "%s", quotef (files[i_reopen].of_name)); + files[i_reopen].ofile = NULL; + files[i_reopen].ofd = OFD_APPEND; + } + + files[i_check].ofd = fd; + if (!(files[i_check].ofile = fdopen (fd, "a"))) + error (EXIT_FAILURE, errno, "%s", quotef (files[i_check].of_name)); + files[i_check].opid = filter_pid; + filter_pid = 0; + } + + return file_limit; +} - n_read = full_read (STDIN_FILENO, buf + n_buffered, n_bytes - n_buffered); +/* -n r/[K/]N: Divide file into N chunks in round robin fashion. + When K == 0, we try to keep the files open in parallel. + If we run out of file resources, then we revert + to opening and closing each file for each line. */ + +static void +lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize) +{ + bool wrapped = false; + bool wrote = false; + bool file_limit; + size_t i_file; + of_t *files IF_LINT (= NULL); + uintmax_t line_no; + + if (k) + line_no = 1; + else + { + if (SIZE_MAX < n) + xalloc_die (); + files = xnmalloc (n, sizeof *files); + + /* Generate output file names. */ + for (i_file = 0; i_file < n; i_file++) + { + next_file_name (); + files[i_file].of_name = xstrdup (outfile); + files[i_file].ofd = OFD_NEW; + files[i_file].ofile = NULL; + files[i_file].opid = 0; + } + i_file = 0; + file_limit = false; + } + + while (true) + { + char *bp = buf, *eob; + size_t n_read = safe_read (STDIN_FILENO, buf, bufsize); if (n_read == SAFE_READ_ERROR) - error (EXIT_FAILURE, errno, "%s", infile); - - n_buffered += n_read; - if (n_buffered != n_bytes) - eof = true; - - /* Find where to end this chunk. */ - bp = buf + n_buffered; - if (n_buffered == n_bytes) - { - while (bp > buf && bp[-1] != '\n') - bp--; - } - - /* If chunk has no newlines, use all the chunk. */ - if (bp == buf) - bp = buf + n_buffered; - - /* Output the chars as one output file. */ - cwrite (true, buf, bp - buf); - - /* Discard the chars we just output; move rest of chunk - down to be the start of the next chunk. Source and - destination probably overlap. */ - n_buffered -= bp - buf; - if (n_buffered > 0) - memmove (buf, bp, n_buffered); - } - while (!eof); - free (buf); + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + else if (n_read == 0) + break; /* eof. */ + eob = buf + n_read; + + while (bp != eob) + { + size_t to_write; + bool next = false; + + /* Find end of line. */ + char *bp_out = memchr (bp, eolchar, eob - bp); + if (bp_out) + { + bp_out++; + next = true; + } + else + bp_out = eob; + to_write = bp_out - bp; + + if (k) + { + if (line_no == k && unbuffered) + { + if (full_write (STDOUT_FILENO, bp, to_write) != to_write) + error (EXIT_FAILURE, errno, "%s", _("write error")); + } + else if (line_no == k && fwrite (bp, to_write, 1, stdout) != 1) + { + clearerr (stdout); /* To silence close_stdout(). */ + error (EXIT_FAILURE, errno, "%s", _("write error")); + } + if (next) + line_no = (line_no == n) ? 1 : line_no + 1; + } + else + { + /* Secure file descriptor. */ + file_limit |= ofile_open (files, i_file, n); + if (unbuffered) + { + /* Note writing to fd, rather than flushing the FILE gives + an 8% performance benefit, due to reduced data copying. */ + if (full_write (files[i_file].ofd, bp, to_write) != to_write + && ! ignorable (errno)) + { + error (EXIT_FAILURE, errno, "%s", + quotef (files[i_file].of_name)); + } + } + else if (fwrite (bp, to_write, 1, files[i_file].ofile) != 1 + && ! ignorable (errno)) + { + error (EXIT_FAILURE, errno, "%s", + quotef (files[i_file].of_name)); + } + if (! ignorable (errno)) + wrote = true; + + if (file_limit) + { + if (fclose (files[i_file].ofile) != 0) + { + error (EXIT_FAILURE, errno, "%s", + quotef (files[i_file].of_name)); + } + files[i_file].ofile = NULL; + files[i_file].ofd = OFD_APPEND; + } + if (next && ++i_file == n) + { + wrapped = true; + /* If no filters are accepting input, stop reading. */ + if (! wrote) + goto no_filters; + wrote = false; + i_file = 0; + } + } + + bp = bp_out; + } + } + +no_filters: + /* Ensure all files created, so that any existing files are truncated, + and to signal any waiting fifo consumers. + Also, close any open file descriptors. + FIXME: Should we do this before EXIT_FAILURE? */ + if (!k) + { + int ceiling = (wrapped ? n : i_file); + for (i_file = 0; i_file < n; i_file++) + { + if (i_file >= ceiling && !elide_empty_files) + file_limit |= ofile_open (files, i_file, n); + if (files[i_file].ofd >= 0) + closeout (files[i_file].ofile, files[i_file].ofd, + files[i_file].opid, files[i_file].of_name); + files[i_file].ofd = OFD_APPEND; + } + } + IF_LINT (free (files)); } #define FAIL_ONLY_ONE_WAY() \ @@ -372,23 +1208,39 @@ line_bytes_split (size_t n_bytes) } \ while (0) + +/* Parse K/N syntax of chunk options. */ + +static void +parse_chunk (uintmax_t *k_units, uintmax_t *n_units, char *slash) +{ + *n_units = xdectoumax (slash + 1, 1, UINTMAX_MAX, "", + _("invalid number of chunks"), 0); + if (slash != optarg) /* a leading number is specified. */ + { + *slash = '\0'; + *k_units = xdectoumax (optarg, 1, *n_units, "", + _("invalid chunk number"), 0); + } +} + + int main (int argc, char **argv) { - struct stat stat_buf; - enum - { - type_undef, type_bytes, type_byteslines, type_lines, type_digits - } split_type = type_undef; - size_t in_blk_size; /* optimal block size of input file device */ - char *buf; /* file i/o buffer */ + enum Split_type split_type = type_undef; + size_t in_blk_size = 0; /* optimal block size of input file device */ size_t page_size = getpagesize (); - uintmax_t n_units; + uintmax_t k_units = 0; + uintmax_t n_units = 0; + + static char const multipliers[] = "bEGKkMmPTYZ0"; int c; int digits_optind = 0; + off_t file_size IF_LINT (= 0); initialize_main (&argc, &argv); - program_name = argv[0]; + set_program_name (argv[0]); setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); @@ -397,113 +1249,204 @@ main (int argc, char **argv) /* Parse command line options. */ - infile = "-"; - outbase = "x"; + infile = bad_cast ("-"); + outbase = bad_cast ("x"); - while (1) + while (true) { /* This is the argv-index of the option we will read next. */ int this_optind = optind ? optind : 1; + char *slash; - c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL); + c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:u", + longopts, NULL); if (c == -1) - break; + break; switch (c) - { - case 'a': - { - unsigned long tmp; - if (xstrtoul (optarg, NULL, 10, &tmp, "") != LONGINT_OK - || SIZE_MAX / sizeof (size_t) < tmp) - { - error (0, 0, _("%s: invalid suffix length"), optarg); - usage (EXIT_FAILURE); - } - suffix_length = tmp; - } - break; - - case 'b': - if (split_type != type_undef) - FAIL_ONLY_ONE_WAY (); - split_type = type_bytes; - if (xstrtoumax (optarg, NULL, 10, &n_units, "bkm") != LONGINT_OK - || n_units == 0) - { - error (0, 0, _("%s: invalid number of bytes"), optarg); - usage (EXIT_FAILURE); - } - break; - - case 'l': - if (split_type != type_undef) - FAIL_ONLY_ONE_WAY (); - split_type = type_lines; - if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK - || n_units == 0) - { - error (0, 0, _("%s: invalid number of lines"), optarg); - usage (EXIT_FAILURE); - } - break; - - case 'C': - if (split_type != type_undef) - FAIL_ONLY_ONE_WAY (); - split_type = type_byteslines; - if (xstrtoumax (optarg, NULL, 10, &n_units, "bkm") != LONGINT_OK - || n_units == 0 || SIZE_MAX < n_units) - { - error (0, 0, _("%s: invalid number of bytes"), optarg); - usage (EXIT_FAILURE); - } - break; - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - if (split_type == type_undef) - { - split_type = type_digits; - n_units = 0; - } - if (split_type != type_undef && split_type != type_digits) - FAIL_ONLY_ONE_WAY (); - if (digits_optind != 0 && digits_optind != this_optind) - n_units = 0; /* More than one number given; ignore other. */ - digits_optind = this_optind; - if (!DECIMAL_DIGIT_ACCUMULATE (n_units, c - '0', uintmax_t)) - { - char buffer[INT_BUFSIZE_BOUND (uintmax_t)]; - error (EXIT_FAILURE, 0, - _("line count option -%s%c... is too large"), - umaxtostr (n_units, buffer), c); - } - break; - - case 'd': - suffix_alphabet = "0123456789"; - break; - - case VERBOSE_OPTION: - verbose = true; - break; - - case_GETOPT_HELP_CHAR; - - case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); - - default: - usage (EXIT_FAILURE); - } + { + case 'a': + suffix_length = xdectoumax (optarg, 0, SIZE_MAX / sizeof (size_t), + "", _("invalid suffix length"), 0); + break; + + case ADDITIONAL_SUFFIX_OPTION: + if (last_component (optarg) != optarg) + { + error (0, 0, + _("invalid suffix %s, contains directory separator"), + quote (optarg)); + usage (EXIT_FAILURE); + } + additional_suffix = optarg; + break; + + case 'b': + if (split_type != type_undef) + FAIL_ONLY_ONE_WAY (); + split_type = type_bytes; + /* Limit to OFF_T_MAX, because if input is a pipe, we could get more + data than is possible to write to a single file, so indicate that + immediately rather than having possibly future invocations fail. */ + n_units = xdectoumax (optarg, 1, OFF_T_MAX, multipliers, + _("invalid number of bytes"), 0); + break; + + case 'l': + if (split_type != type_undef) + FAIL_ONLY_ONE_WAY (); + split_type = type_lines; + n_units = xdectoumax (optarg, 1, UINTMAX_MAX, "", + _("invalid number of lines"), 0); + break; + + case 'C': + if (split_type != type_undef) + FAIL_ONLY_ONE_WAY (); + split_type = type_byteslines; + n_units = xdectoumax (optarg, 1, MIN (SIZE_MAX, OFF_T_MAX), + multipliers, _("invalid number of bytes"), 0); + break; + + case 'n': + if (split_type != type_undef) + FAIL_ONLY_ONE_WAY (); + /* skip any whitespace */ + while (isspace (to_uchar (*optarg))) + optarg++; + if (STRNCMP_LIT (optarg, "r/") == 0) + { + split_type = type_rr; + optarg += 2; + } + else if (STRNCMP_LIT (optarg, "l/") == 0) + { + split_type = type_chunk_lines; + optarg += 2; + } + else + split_type = type_chunk_bytes; + if ((slash = strchr (optarg, '/'))) + parse_chunk (&k_units, &n_units, slash); + else + n_units = xdectoumax (optarg, 1, UINTMAX_MAX, "", + _("invalid number of chunks"), 0); + break; + + case 'u': + unbuffered = true; + break; + + case 't': + { + char neweol = optarg[0]; + if (! neweol) + error (EXIT_FAILURE, 0, _("empty record separator")); + if (optarg[1]) + { + if (STREQ (optarg, "\\0")) + neweol = '\0'; + else + { + /* Provoke with 'split -txx'. Complain about + "multi-character tab" instead of "multibyte tab", so + that the diagnostic's wording does not need to be + changed once multibyte characters are supported. */ + error (EXIT_FAILURE, 0, _("multi-character separator %s"), + quote (optarg)); + } + } + /* Make it explicit we don't support multiple separators. */ + if (0 <= eolchar && neweol != eolchar) + { + error (EXIT_FAILURE, 0, + _("multiple separator characters specified")); + } + + eolchar = neweol; + } + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (split_type == type_undef) + { + split_type = type_digits; + n_units = 0; + } + if (split_type != type_undef && split_type != type_digits) + FAIL_ONLY_ONE_WAY (); + if (digits_optind != 0 && digits_optind != this_optind) + n_units = 0; /* More than one number given; ignore other. */ + digits_optind = this_optind; + if (!DECIMAL_DIGIT_ACCUMULATE (n_units, c - '0', uintmax_t)) + { + char buffer[INT_BUFSIZE_BOUND (uintmax_t)]; + error (EXIT_FAILURE, 0, + _("line count option -%s%c... is too large"), + umaxtostr (n_units, buffer), c); + } + break; + + case 'd': + suffix_alphabet = "0123456789"; + if (optarg) + { + if (strlen (optarg) != strspn (optarg, suffix_alphabet)) + { + error (0, 0, + _("%s: invalid start value for numerical suffix"), + quote (optarg)); + usage (EXIT_FAILURE); + } + else + { + /* Skip any leading zero. */ + while (*optarg == '0' && *(optarg + 1) != '\0') + optarg++; + numeric_suffix_start = optarg; + } + } + break; + + case 'e': + elide_empty_files = true; + break; + + case FILTER_OPTION: + filter_command = optarg; + break; + + case IO_BLKSIZE_OPTION: + in_blk_size = xdectoumax (optarg, 1, SIZE_MAX - page_size, + multipliers, _("invalid IO block size"), 0); + break; + + case VERBOSE_OPTION: + verbose = true; + break; + + case_GETOPT_HELP_CHAR; + + case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); + + default: + usage (EXIT_FAILURE); + } + } + + if (k_units != 0 && filter_command) + { + error (0, 0, _("--filter does not process a chunk extracted to stdout")); + usage (EXIT_FAILURE); } /* Handle default case. */ @@ -515,10 +1458,15 @@ main (int argc, char **argv) if (n_units == 0) { - error (0, 0, _("invalid number of lines: 0")); + error (0, 0, "%s: %s", _("invalid number of lines"), quote ("0")); usage (EXIT_FAILURE); } + if (eolchar < 0) + eolchar = '\n'; + + set_suffix_length (n_units, split_type); + /* Get out the filename arguments. */ if (optind < argc) @@ -533,26 +1481,87 @@ main (int argc, char **argv) usage (EXIT_FAILURE); } + /* Check that the suffix length is large enough for the numerical + suffix start value. */ + if (numeric_suffix_start && strlen (numeric_suffix_start) > suffix_length) + { + error (0, 0, _("numerical suffix start value is too large " + "for the suffix length")); + usage (EXIT_FAILURE); + } + /* Open the input file. */ if (! STREQ (infile, "-") && fd_reopen (STDIN_FILENO, infile, O_RDONLY, 0) < 0) error (EXIT_FAILURE, errno, _("cannot open %s for reading"), - quote (infile)); + quoteaf (infile)); - /* Binary I/O is safer when bytecounts are used. */ + /* Binary I/O is safer when byte counts are used. */ if (O_BINARY && ! isatty (STDIN_FILENO)) - freopen (NULL, "rb", stdin); - - /* No output file is open now. */ - output_desc = -1; + xfreopen (NULL, "rb", stdin); /* Get the optimal block size of input device and make a buffer. */ - if (fstat (STDIN_FILENO, &stat_buf) != 0) - error (EXIT_FAILURE, errno, "%s", infile); - in_blk_size = ST_BLKSIZE (stat_buf); + if (fstat (STDIN_FILENO, &in_stat_buf) != 0) + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + + bool specified_buf_size = !! in_blk_size; + if (! specified_buf_size) + in_blk_size = io_blksize (in_stat_buf); + + void *b = xmalloc (in_blk_size + 1 + page_size - 1); + char *buf = ptr_align (b, page_size); + size_t initial_read = SIZE_MAX; - buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size); + if (split_type == type_chunk_bytes || split_type == type_chunk_lines) + { + off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR); + if (0 <= input_offset) + { + if (usable_st_size (&in_stat_buf) && ! specified_buf_size) + { + assert (ST_BLKSIZE (in_stat_buf) <= in_blk_size); + file_size = input_file_size (STDIN_FILENO, in_stat_buf.st_size, + buf, in_blk_size); + if (file_size < in_blk_size) + initial_read = file_size; + } + else + { + file_size = lseek (STDIN_FILENO, 0, SEEK_END); + input_offset = (file_size < 0 + ? file_size + : lseek (STDIN_FILENO, input_offset, SEEK_SET)); + file_size -= input_offset; + } + } + if (input_offset < 0) + error (EXIT_FAILURE, 0, _("%s: cannot determine file size"), + quotef (infile)); + /* Overflow, and sanity checking. */ + if (OFF_T_MAX < n_units) + { + char buffer[INT_BUFSIZE_BOUND (uintmax_t)]; + error (EXIT_FAILURE, EOVERFLOW, "%s: %s", + _("invalid number of chunks"), + quote (umaxtostr (n_units, buffer))); + } + /* increase file_size to n_units here, so that we still process + any input data, and create empty files for the rest. */ + file_size = MAX (file_size, n_units); + } + + /* When filtering, closure of one pipe must not terminate the process, + as there may still be other streams expecting input from us. */ + if (filter_command) + { + struct sigaction act; + sigemptyset (&newblocked); + sigaction (SIGPIPE, NULL, &act); + if (act.sa_handler != SIG_IGN) + sigaddset (&newblocked, SIGPIPE); + sigprocmask (SIG_BLOCK, &newblocked, &oldblocked); + } switch (split_type) { @@ -562,21 +1571,42 @@ main (int argc, char **argv) break; case type_bytes: - bytes_split (n_units, buf, in_blk_size); + bytes_split (n_units, buf, in_blk_size, SIZE_MAX, 0); break; case type_byteslines: - line_bytes_split (n_units); + line_bytes_split (n_units, buf, in_blk_size); + break; + + case type_chunk_bytes: + if (k_units == 0) + bytes_split (file_size / n_units, buf, in_blk_size, initial_read, + n_units); + else + bytes_chunk_extract (k_units, n_units, buf, in_blk_size, initial_read, + file_size); + break; + + case type_chunk_lines: + lines_chunk_split (k_units, n_units, buf, in_blk_size, initial_read, + file_size); + break; + + case type_rr: + /* Note, this is like 'sed -n ${k}~${n}p' when k > 0, + but the functionality is provided for symmetry. */ + lines_rr (k_units, n_units, buf, in_blk_size); break; default: abort (); } + IF_LINT (free (b)); + if (close (STDIN_FILENO) != 0) - error (EXIT_FAILURE, errno, "%s", infile); - if (output_desc >= 0 && close (output_desc) < 0) - error (EXIT_FAILURE, errno, "%s", outfile); + error (EXIT_FAILURE, errno, "%s", quotef (infile)); + closeout (NULL, output_desc, filter_pid, outfile); - exit (EXIT_SUCCESS); + return EXIT_SUCCESS; } |