From 5532e83f3887d45d125d7b1b8fc17c39eb14c149 Mon Sep 17 00:00:00 2001 From: ph10 Date: Wed, 5 Dec 2007 20:56:03 +0000 Subject: Add --line-offsets and --file-offsets to pcregrep. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@280 2f5784b3-3f2a-0410-8824-cb99058d5e15 --- RunGrepTest | 6 ++++++ doc/pcregrep.1 | 25 ++++++++++++++++++++++--- pcregrep.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- testdata/grepoutput | 12 ++++++++++++ 4 files changed, 82 insertions(+), 11 deletions(-) diff --git a/RunGrepTest b/RunGrepTest index 0c6eda8..6f295b7 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -208,6 +208,12 @@ echo "---------------------------- Test 50 ------------------------------" >>tes echo "---------------------------- Test 51 ------------------------------" >>testtry (cd $srcdir; $valgrind $pcregrep --colour=always jumps ./testdata/grepinputv) >>testtry +echo "---------------------------- Test 52 ------------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep --file-offsets 'before|zero|after' ./testdata/grepinput) >>testtry + +echo "---------------------------- Test 53 ------------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep --line-offsets 'before|zero|after' ./testdata/grepinput) >>testtry + # Now compare the results. $cf $srcdir/testdata/grepoutput testtry diff --git a/doc/pcregrep.1 b/doc/pcregrep.1 index 1820d17..3a6184a 100644 --- a/doc/pcregrep.1 +++ b/doc/pcregrep.1 @@ -158,6 +158,14 @@ is an overall maximum of 100 patterns. Trailing white space is removed from each line, and blank lines are ignored. An empty file contains no patterns and therefore matches nothing. .TP +\fB--file-offsets\fP +Instead of showing lines or parts of lines that match, show each match as an +offset from the start of the file and a length, separated by a comma. In this +mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP +options are ignored. If there is more than one match in a line, each of them is +shown separately. This option is mutually exclusive with \fB--line-offsets\fP +and \fB--only-matching\fP. +.TP \fB-H\fP, \fB--with-filename\fP Force the inclusion of the filename at the start of output lines when searching a single file. By default, the filename is not shown in this case. For matching @@ -201,6 +209,15 @@ This option supplies a name to be used for the standard input when file names are being output. If not supplied, "(standard input)" is used. There is no short form for this option. .TP +\fB--line-offsets\fP +Instead of showing lines or parts of lines that match, show each match as a +line number, the offset from the start of the line, and a length. The line +number is terminated by a colon (as usual; see the \fB-n\fP option), and the +offset and length are separated by a comma. In this mode, no context is shown. +That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is +more than one match in a line, each of them is shown separately. This option is +mutually exclusive with \fB--file-offsets\fP and \fB--only-matching\fP. +.TP \fB--locale\fP=\fIlocale-name\fP This option specifies a locale to be used for pattern matching. It overrides the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no @@ -241,7 +258,8 @@ being scanned does not agree with the convention set by this option, \fB-n\fP, \fB--line-number\fP Precede each output line by its line number in the file, followed by a colon and a space for matching lines or a hyphen and a space for context lines. If -the filename is also being output, it precedes the line number. +the filename is also being output, it precedes the line number. This option is +forced if \fB--line-offsets\fP is used. .TP \fB-o\fP, \fB--only-matching\fP Show only the part of the line that matched a pattern. In this mode, no @@ -249,7 +267,8 @@ context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one match in a line, each of them is shown separately. If \fB-o\fP is combined with \fB-v\fP (invert the sense of the match to find non-matching lines), no output is generated, but the return code -is set appropriately. +is set appropriately. This option is mutually exclusive with +\fB--file-offsets\fP and \fB--line-offsets\fP. .TP \fB-q\fP, \fB--quiet\fP Work quietly, that is, display nothing except error messages. The exit @@ -391,6 +410,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 04 December 2007 +Last updated: 05 December 2007 Copyright (c) 1997-2007 University of Cambridge. .fi diff --git a/pcregrep.c b/pcregrep.c index 6c3f7a1..88edc8f 100644 --- a/pcregrep.c +++ b/pcregrep.c @@ -142,8 +142,10 @@ static int process_options = 0; static BOOL count_only = FALSE; static BOOL do_colour = FALSE; +static BOOL file_offsets = FALSE; static BOOL hyphenpending = FALSE; static BOOL invert = FALSE; +static BOOL line_offsets = FALSE; static BOOL multiline = FALSE; static BOOL number = FALSE; static BOOL only_matching = FALSE; @@ -174,6 +176,8 @@ used to identify them. */ #define N_LABEL (-5) #define N_LOCALE (-6) #define N_NULL (-7) +#define N_LOFFSETS (-8) +#define N_FOFFSETS (-9) static option_item optionlist[] = { { OP_NODATA, N_NULL, NULL, "", " terminate options" }, @@ -189,15 +193,17 @@ static option_item optionlist[] = { { OP_PATLIST, 'e', NULL, "regex(p)", "specify pattern (may be used more than once)" }, { OP_NODATA, 'F', NULL, "fixed_strings", "patterns are sets of newline-separated strings" }, { OP_STRING, 'f', &pattern_filename, "file=path", "read patterns from file" }, + { OP_NODATA, N_FOFFSETS, NULL, "file-offsets", "output file offsets, not text" }, { OP_NODATA, 'H', NULL, "with-filename", "force the prefixing filename on output" }, { OP_NODATA, 'h', NULL, "no-filename", "suppress the prefixing filename on output" }, { OP_NODATA, 'i', NULL, "ignore-case", "ignore case distinctions" }, { OP_NODATA, 'l', NULL, "files-with-matches", "print only FILE names containing matches" }, { OP_NODATA, 'L', NULL, "files-without-match","print only FILE names not containing matches" }, { OP_STRING, N_LABEL, &stdin_name, "label=name", "set name for standard input" }, + { OP_NODATA, N_LOFFSETS, NULL, "line-offsets", "output line numbers and offsets, not text" }, { OP_STRING, N_LOCALE, &locale, "locale=locale", "use the named locale" }, { OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" }, - { OP_STRING, 'N', &newline, "newline=type", "specify newline type (CR, LF, CRLF, ANYCRLF or ANY)" }, + { OP_STRING, 'N', &newline, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF or ANY)" }, { OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" }, { OP_NODATA, 'o', NULL, "only-matching", "show only the part of the line that matched" }, { OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" }, @@ -820,6 +826,7 @@ int rc = 1; int linenumber = 1; int lastmatchnumber = 0; int count = 0; +int filepos = 0; int offsets[99]; char *lastmatchrestart = NULL; char buffer[3*MBUFTHIRD]; @@ -972,10 +979,12 @@ while (ptr < endptr) else if (quiet) return 0; /* The --only-matching option prints just the substring that matched, and - does not print any context. Afterwards, adjust the start and length, and - then jump back to look for further matches in the same line. If we are in - invert mode, however, nothing is printed - this could be useful still - because the return code is set. */ + the --file-offsets and --line-offsets options output offsets for the + matching substring (they both force --only-matching). None of these options + prints any context. Afterwards, adjust the start and length, and then jump + back to look for further matches in the same line. If we are in invert + mode, however, nothing is printed - this could be still useful because the + return code is set. */ else if (only_matching) { @@ -983,7 +992,14 @@ while (ptr < endptr) { if (printname != NULL) fprintf(stdout, "%s:", printname); if (number) fprintf(stdout, "%d:", linenumber); - fwrite(matchptr + offsets[0], 1, offsets[1] - offsets[0], stdout); + if (line_offsets) + fprintf(stdout, "%d,%d", matchptr + offsets[0] - ptr, + offsets[1] - offsets[0]); + else if (file_offsets) + fprintf(stdout, "%d,%d", filepos + matchptr + offsets[0] - ptr, + offsets[1] - offsets[0]); + else + fwrite(matchptr + offsets[0], 1, offsets[1] - offsets[0], stdout); fprintf(stdout, "\n"); matchptr += offsets[1]; length -= offsets[1]; @@ -1163,9 +1179,11 @@ while (ptr < endptr) linelength = endmatch - ptr - ellength; } - /* Advance to after the newline and increment the line number. */ + /* Advance to after the newline and increment the line number. The file + offset to the current line is maintained in filepos. */ ptr += linelength + endlinelength; + filepos += linelength + endlinelength; linenumber++; /* If we haven't yet reached the end of the file (the buffer is full), and @@ -1352,7 +1370,8 @@ for (op = optionlist; op->one_char != 0; op++) if (op->one_char > 0) fprintf(stderr, "%c", op->one_char); } fprintf(stderr, "] [long options] [pattern] [files]\n"); -fprintf(stderr, "Type `pcregrep --help' for more information.\n"); +fprintf(stderr, "Type `pcregrep --help' for more information and the long " + "options.\n"); return rc; } @@ -1407,7 +1426,9 @@ handle_option(int letter, int options) { switch(letter) { + case N_FOFFSETS: file_offsets = TRUE; break; case N_HELP: help(); exit(0); + case N_LOFFSETS: line_offsets = number = TRUE; break; case 'c': count_only = TRUE; break; case 'F': process_options |= PO_FIXED_STRINGS; break; case 'H': filenames = FN_FORCE; break; @@ -1843,6 +1864,19 @@ if (both_context > 0) if (after_context == 0) after_context = both_context; if (before_context == 0) before_context = both_context; } + +/* Only one of --only-matching, --file-offsets, or --line-offsets is permitted. +However, the latter two set the only_matching flag. */ + +if ((only_matching && (file_offsets || line_offsets)) || + (file_offsets && line_offsets)) + { + fprintf(stderr, "pcregrep: Cannot mix --only-matching, --file-offsets " + "and/or --line-offsets\n"); + exit(usage(2)); + } + +if (file_offsets || line_offsets) only_matching = TRUE; /* If a locale has not been provided as an option, see if the LC_CTYPE or LC_ALL environment variable is set, and if so, use it. */ diff --git a/testdata/grepoutput b/testdata/grepoutput index abf7aeb..3241984 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -390,3 +390,15 @@ PUT NEW DATA ABOVE THIS LINE. over the lazy dog. ---------------------------- Test 51 ------------------------------ fox jumps +---------------------------- Test 52 ------------------------------ +36972,6 +36990,4 +37024,4 +37066,5 +37083,4 +---------------------------- Test 53 ------------------------------ +595:15,6 +595:33,4 +596:28,4 +597:15,5 +597:32,4 -- cgit v1.2.1