From 23c1575f747393f9847874fd1ed72a44557459d1 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:43 +0100 Subject: color-words: refactor word splitting and use ALLOC_GROW() Word splitting is now performed by the function diff_words_fill(), avoiding having the same code twice. In the same spirit, avoid duplicating the code of ALLOC_GROW(). Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- diff.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index d23548292a..c111eef13e 100644 --- a/diff.c +++ b/diff.c @@ -326,10 +326,7 @@ struct diff_words_buffer { static void diff_words_append(char *line, unsigned long len, struct diff_words_buffer *buffer) { - if (buffer->text.size + len > buffer->alloc) { - buffer->alloc = (buffer->text.size + len) * 3 / 2; - buffer->text.ptr = xrealloc(buffer->text.ptr, buffer->alloc); - } + ALLOC_GROW(buffer->text.ptr, buffer->text.size + len, buffer->alloc); line++; len--; memcpy(buffer->text.ptr + buffer->text.size, line, len); @@ -398,6 +395,22 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) } } +/* + * This function splits the words in buffer->text, and stores the list with + * newline separator into out. 
+ */ +static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) +{ + int i; + out->size = buffer->text.size; + out->ptr = xmalloc(out->size); + memcpy(out->ptr, buffer->text.ptr, out->size); + for (i = 0; i < out->size; i++) + if (isspace(out->ptr[i])) + out->ptr[i] = '\n'; + buffer->current = 0; +} + /* this executes the word diff on the accumulated buffers */ static void diff_words_show(struct diff_words_data *diff_words) { @@ -405,26 +418,11 @@ static void diff_words_show(struct diff_words_data *diff_words) xdemitconf_t xecfg; xdemitcb_t ecb; mmfile_t minus, plus; - int i; memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); - minus.size = diff_words->minus.text.size; - minus.ptr = xmalloc(minus.size); - memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size); - for (i = 0; i < minus.size; i++) - if (isspace(minus.ptr[i])) - minus.ptr[i] = '\n'; - diff_words->minus.current = 0; - - plus.size = diff_words->plus.text.size; - plus.ptr = xmalloc(plus.size); - memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size); - for (i = 0; i < plus.size; i++) - if (isspace(plus.ptr[i])) - plus.ptr[i] = '\n'; - diff_words->plus.current = 0; - + diff_words_fill(&diff_words->minus, &minus); + diff_words_fill(&diff_words->plus, &plus); xpp.flags = XDF_NEED_MINIMAL; xecfg.ctxlen = diff_words->minus.alloc + diff_words->plus.alloc; xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, -- cgit v1.2.1 From 2e5d2003b28820f88296e47a79eb440ca0295000 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:44 +0100 Subject: color-words: change algorithm to allow for 0-character word boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Up until now, the color-words code assumed that word boundaries are identical to white space characters. 
Therefore, it could get away with a very simple scheme: it copied the hunks, substituted newlines for each white space character, called libxdiff with the processed text, and then identified the text to output by the offsets (which agreed since the original text had the same length). This code was ugly, for a number of reasons: - it was impossible to introduce 0-character word boundaries, - we had to print everything word by word, and - the code needed extra special handling of newlines in the removed part. Fix all of these issues by processing the text such that - we build word lists, separated by newlines, - we remember the original offsets for every word, and - after calling libxdiff on the wordlists, we parse the hunk headers, and find the corresponding offsets, and then - we print the removed/added parts in one go. The pre and post samples in the test were provided by Santi Béjar. Note that there is some strange special handling of hunk headers where one line range is 0 due to POSIX: in this case, the start is one too low. In other words a hunk header '@@ -1,0 +2 @@' actually means that the line must be added after the _second_ line of the pre text, _not_ the first. 
Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- diff.c | 157 ++++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 91 insertions(+), 66 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index c111eef13e..37c886a815 100644 --- a/diff.c +++ b/diff.c @@ -319,8 +319,10 @@ static int fill_mmfile(mmfile_t *mf, struct diff_filespec *one) struct diff_words_buffer { mmfile_t text; long alloc; - long current; /* output pointer */ - int suppressed_newline; + struct diff_words_orig { + const char *begin, *end; + } *orig; + int orig_nr, orig_alloc; }; static void diff_words_append(char *line, unsigned long len, @@ -335,80 +337,89 @@ static void diff_words_append(char *line, unsigned long len, struct diff_words_data { struct diff_words_buffer minus, plus; + const char *current_plus; FILE *file; }; -static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color, - int suppress_newline) +static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) { - const char *ptr; - int eol = 0; + struct diff_words_data *diff_words = priv; + int minus_first, minus_len, plus_first, plus_len; + const char *minus_begin, *minus_end, *plus_begin, *plus_end; - if (len == 0) + if (line[0] != '@' || parse_hunk_header(line, len, + &minus_first, &minus_len, &plus_first, &plus_len)) return; - ptr = buffer->text.ptr + buffer->current; - buffer->current += len; + /* POSIX requires that first be decremented by one if len == 0... 
*/ + if (minus_len) { + minus_begin = diff_words->minus.orig[minus_first].begin; + minus_end = + diff_words->minus.orig[minus_first + minus_len - 1].end; + } else + minus_begin = minus_end = + diff_words->minus.orig[minus_first].end; - if (ptr[len - 1] == '\n') { - eol = 1; - len--; - } + if (plus_len) { + plus_begin = diff_words->plus.orig[plus_first].begin; + plus_end = diff_words->plus.orig[plus_first + plus_len - 1].end; + } else + plus_begin = plus_end = diff_words->plus.orig[plus_first].end; - fputs(diff_get_color(1, color), file); - fwrite(ptr, len, 1, file); - fputs(diff_get_color(1, DIFF_RESET), file); + if (diff_words->current_plus != plus_begin) + fwrite(diff_words->current_plus, + plus_begin - diff_words->current_plus, 1, + diff_words->file); + if (minus_begin != minus_end) + color_fwrite_lines(diff_words->file, + diff_get_color(1, DIFF_FILE_OLD), + minus_end - minus_begin, minus_begin); + if (plus_begin != plus_end) + color_fwrite_lines(diff_words->file, + diff_get_color(1, DIFF_FILE_NEW), + plus_end - plus_begin, plus_begin); - if (eol) { - if (suppress_newline) - buffer->suppressed_newline = 1; - else - putc('\n', file); - } -} - -static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) -{ - struct diff_words_data *diff_words = priv; - - if (diff_words->minus.suppressed_newline) { - if (line[0] != '+') - putc('\n', diff_words->file); - diff_words->minus.suppressed_newline = 0; - } - - len--; - switch (line[0]) { - case '-': - print_word(diff_words->file, - &diff_words->minus, len, DIFF_FILE_OLD, 1); - break; - case '+': - print_word(diff_words->file, - &diff_words->plus, len, DIFF_FILE_NEW, 0); - break; - case ' ': - print_word(diff_words->file, - &diff_words->plus, len, DIFF_PLAIN, 0); - diff_words->minus.current += len; - break; - } + diff_words->current_plus = plus_end; } /* - * This function splits the words in buffer->text, and stores the list with - * newline separator into out. 
+ * This function splits the words in buffer->text, stores the list with + * newline separator into out, and saves the offsets of the original words + * in buffer->orig. */ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) { - int i; - out->size = buffer->text.size; - out->ptr = xmalloc(out->size); - memcpy(out->ptr, buffer->text.ptr, out->size); - for (i = 0; i < out->size; i++) - if (isspace(out->ptr[i])) - out->ptr[i] = '\n'; - buffer->current = 0; + int i, j; + + out->size = 0; + out->ptr = xmalloc(buffer->text.size); + + /* fake an empty "0th" word */ + ALLOC_GROW(buffer->orig, 1, buffer->orig_alloc); + buffer->orig[0].begin = buffer->orig[0].end = buffer->text.ptr; + buffer->orig_nr = 1; + + for (i = 0; i < buffer->text.size; i++) { + if (isspace(buffer->text.ptr[i])) + continue; + for (j = i + 1; j < buffer->text.size && + !isspace(buffer->text.ptr[j]); j++) + ; /* find the end of the word */ + + /* store original boundaries */ + ALLOC_GROW(buffer->orig, buffer->orig_nr + 1, + buffer->orig_alloc); + buffer->orig[buffer->orig_nr].begin = buffer->text.ptr + i; + buffer->orig[buffer->orig_nr].end = buffer->text.ptr + j; + buffer->orig_nr++; + + /* store one word */ + memcpy(out->ptr + out->size, buffer->text.ptr + i, j - i); + out->ptr[out->size + j - i] = '\n'; + out->size += j - i + 1; + + i = j - 1; + } } /* this executes the word diff on the accumulated buffers */ @@ -419,22 +430,34 @@ static void diff_words_show(struct diff_words_data *diff_words) xdemitcb_t ecb; mmfile_t minus, plus; + /* special case: only removal */ + if (!diff_words->plus.text.size) { + color_fwrite_lines(diff_words->file, + diff_get_color(1, DIFF_FILE_OLD), + diff_words->minus.text.size, diff_words->minus.text.ptr); + diff_words->minus.text.size = 0; + return; + } + + diff_words->current_plus = diff_words->plus.text.ptr; + memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); diff_words_fill(&diff_words->minus, &minus); 
diff_words_fill(&diff_words->plus, &plus); xpp.flags = XDF_NEED_MINIMAL; - xecfg.ctxlen = diff_words->minus.alloc + diff_words->plus.alloc; + xecfg.ctxlen = 0; xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, &xpp, &xecfg, &ecb); free(minus.ptr); free(plus.ptr); + if (diff_words->current_plus != diff_words->plus.text.ptr + + diff_words->plus.text.size) + fwrite(diff_words->current_plus, + diff_words->plus.text.ptr + diff_words->plus.text.size + - diff_words->current_plus, 1, + diff_words->file); diff_words->minus.text.size = diff_words->plus.text.size = 0; - - if (diff_words->minus.suppressed_newline) { - putc('\n', diff_words->file); - diff_words->minus.suppressed_newline = 0; - } } typedef unsigned long (*sane_truncate_fn)(char *line, unsigned long len); @@ -458,7 +481,9 @@ static void free_diff_words_data(struct emit_callback *ecbdata) diff_words_show(ecbdata->diff_words); free (ecbdata->diff_words->minus.text.ptr); + free (ecbdata->diff_words->minus.orig); free (ecbdata->diff_words->plus.text.ptr); + free (ecbdata->diff_words->plus.orig); free(ecbdata->diff_words); ecbdata->diff_words = NULL; } -- cgit v1.2.1 From 2b6a5417d750d086d1da906e46de2b3ad8df6753 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:45 +0100 Subject: color-words: take an optional regular expression describing words In some applications, words are not delimited by white space. To allow for that, you can specify a regular expression describing what makes a word with git diff --color-words='[A-Za-z0-9]+' Note that words cannot contain newline characters. As suggested by Thomas Rast, the words are the exact matches of the regular expression. Note that a regular expression beginning with a '^' will match only a word at the beginning of the hunk, not a word at the beginning of a line, and is probably not what you want. This commit contains a quoting fix by Thomas Rast. 
Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- diff.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 9 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 37c886a815..9fb3d0df31 100644 --- a/diff.c +++ b/diff.c @@ -333,12 +333,14 @@ static void diff_words_append(char *line, unsigned long len, len--; memcpy(buffer->text.ptr + buffer->text.size, line, len); buffer->text.size += len; + buffer->text.ptr[buffer->text.size] = '\0'; } struct diff_words_data { struct diff_words_buffer minus, plus; const char *current_plus; FILE *file; + regex_t *word_regex; }; static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) @@ -382,17 +384,49 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) diff_words->current_plus = plus_end; } +/* This function starts looking at *begin, and returns 0 iff a word was found. */ +static int find_word_boundaries(mmfile_t *buffer, regex_t *word_regex, + int *begin, int *end) +{ + if (word_regex && *begin < buffer->size) { + regmatch_t match[1]; + if (!regexec(word_regex, buffer->ptr + *begin, 1, match, 0)) { + char *p = memchr(buffer->ptr + *begin + match[0].rm_so, + '\n', match[0].rm_eo - match[0].rm_so); + *end = p ? p - buffer->ptr : match[0].rm_eo + *begin; + *begin += match[0].rm_so; + return *begin >= *end; + } + return -1; + } + + /* find the next word */ + while (*begin < buffer->size && isspace(buffer->ptr[*begin])) + (*begin)++; + if (*begin >= buffer->size) + return -1; + + /* find the end of the word */ + *end = *begin + 1; + while (*end < buffer->size && !isspace(buffer->ptr[*end])) + (*end)++; + + return 0; +} + /* * This function splits the words in buffer->text, stores the list with * newline separator into out, and saves the offsets of the original words * in buffer->orig. 
*/ -static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) +static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out, + regex_t *word_regex) { int i, j; + long alloc = 0; out->size = 0; - out->ptr = xmalloc(buffer->text.size); + out->ptr = NULL; /* fake an empty "0th" word */ ALLOC_GROW(buffer->orig, 1, buffer->orig_alloc); @@ -400,11 +434,8 @@ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) buffer->orig_nr = 1; for (i = 0; i < buffer->text.size; i++) { - if (isspace(buffer->text.ptr[i])) - continue; - for (j = i + 1; j < buffer->text.size && - !isspace(buffer->text.ptr[j]); j++) - ; /* find the end of the word */ + if (find_word_boundaries(&buffer->text, word_regex, &i, &j)) + return; /* store original boundaries */ ALLOC_GROW(buffer->orig, buffer->orig_nr + 1, @@ -414,6 +445,7 @@ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) buffer->orig_nr++; /* store one word */ + ALLOC_GROW(out->ptr, out->size + j - i + 1, alloc); memcpy(out->ptr + out->size, buffer->text.ptr + i, j - i); out->ptr[out->size + j - i] = '\n'; out->size += j - i + 1; @@ -443,9 +475,10 @@ static void diff_words_show(struct diff_words_data *diff_words) memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); - diff_words_fill(&diff_words->minus, &minus); - diff_words_fill(&diff_words->plus, &plus); + diff_words_fill(&diff_words->minus, &minus, diff_words->word_regex); + diff_words_fill(&diff_words->plus, &plus, diff_words->word_regex); xpp.flags = XDF_NEED_MINIMAL; + /* as only the hunk header will be parsed, we need a 0-context */ xecfg.ctxlen = 0; xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, &xpp, &xecfg, &ecb); @@ -484,6 +517,7 @@ static void free_diff_words_data(struct emit_callback *ecbdata) free (ecbdata->diff_words->minus.orig); free (ecbdata->diff_words->plus.text.ptr); free (ecbdata->diff_words->plus.orig); + free(ecbdata->diff_words->word_regex); 
free(ecbdata->diff_words); ecbdata->diff_words = NULL; } @@ -1506,6 +1540,14 @@ static void builtin_diff(const char *name_a, ecbdata.diff_words = xcalloc(1, sizeof(struct diff_words_data)); ecbdata.diff_words->file = o->file; + if (o->word_regex) { + ecbdata.diff_words->word_regex = (regex_t *) + xmalloc(sizeof(regex_t)); + if (regcomp(ecbdata.diff_words->word_regex, + o->word_regex, REG_EXTENDED)) + die ("Invalid regular expression: %s", + o->word_regex); + } } xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata, &xpp, &xecfg, &ecb); @@ -2517,6 +2559,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac) DIFF_OPT_CLR(options, COLOR_DIFF); else if (!strcmp(arg, "--color-words")) options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS; + else if (!prefixcmp(arg, "--color-words=")) { + options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS; + options->word_regex = arg + 14; + } else if (!strcmp(arg, "--exit-code")) DIFF_OPT_SET(options, EXIT_WITH_STATUS); else if (!strcmp(arg, "--quiet")) -- cgit v1.2.1 From bf82940dbf12f066ba42a2a03a5bb626ba22c067 Mon Sep 17 00:00:00 2001 From: Thomas Rast Date: Sat, 17 Jan 2009 17:29:46 +0100 Subject: color-words: enable REG_NEWLINE to help user We silently truncate a match at the newline, which may lead to unexpected behaviour, e.g., when matching "<[^>]*>" against "<foo\nbar>" since then "<foo" becomes a word (and "bar>" doesn't!) even though the regex said only angle-bracket-delimited things can be words. To alleviate the problem slightly, use REG_NEWLINE so that negated classes can't match a newline. Of course newlines can still be matched explicitly. 
Signed-off-by: Thomas Rast Signed-off-by: Junio C Hamano --- diff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 9fb3d0df31..00c661f82e 100644 --- a/diff.c +++ b/diff.c @@ -1544,7 +1544,8 @@ static void builtin_diff(const char *name_a, ecbdata.diff_words->word_regex = (regex_t *) xmalloc(sizeof(regex_t)); if (regcomp(ecbdata.diff_words->word_regex, - o->word_regex, REG_EXTENDED)) + o->word_regex, + REG_EXTENDED | REG_NEWLINE)) die ("Invalid regular expression: %s", o->word_regex); } -- cgit v1.2.1 From 80c49c3de2d5a3aa12b0980a65f1163c8aef0c16 Mon Sep 17 00:00:00 2001 From: Thomas Rast Date: Sat, 17 Jan 2009 17:29:48 +0100 Subject: color-words: make regex configurable via attributes Make the --color-words splitting regular expression configurable via the diff driver's 'wordregex' attribute. The user can then set the driver on a file in .gitattributes. If a regex is given on the command line, it overrides the driver's setting. We also provide built-in regexes for the languages that already had funcname patterns, and add an appropriate diff driver entry for C/++. (The patterns are designed to run UTF-8 sequences into a single chunk to make sure they remain readable.) Signed-off-by: Thomas Rast Signed-off-by: Junio C Hamano --- diff.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 00c661f82e..9fcde963db 100644 --- a/diff.c +++ b/diff.c @@ -1380,6 +1380,12 @@ static const struct userdiff_funcname *diff_funcname_pattern(struct diff_filespe return one->driver->funcname.pattern ? 
&one->driver->funcname : NULL; } +static const char *userdiff_word_regex(struct diff_filespec *one) +{ + diff_filespec_load_driver(one); + return one->driver->word_regex; +} + void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const char *b) { if (!options->a_prefix) @@ -1540,6 +1546,10 @@ static void builtin_diff(const char *name_a, ecbdata.diff_words = xcalloc(1, sizeof(struct diff_words_data)); ecbdata.diff_words->file = o->file; + if (!o->word_regex) + o->word_regex = userdiff_word_regex(one); + if (!o->word_regex) + o->word_regex = userdiff_word_regex(two); if (o->word_regex) { ecbdata.diff_words->word_regex = (regex_t *) xmalloc(sizeof(regex_t)); -- cgit v1.2.1 From 98a4d87b87e9846eafd21ba232cc2b7ba3f718fc Mon Sep 17 00:00:00 2001 From: Boyd Stephen Smith Jr Date: Tue, 20 Jan 2009 21:46:57 -0600 Subject: color-words: Support diff.wordregex config option When diff is invoked with --color-words (w/o =regex), use the regular expression the user has configured as diff.wordregex. diff drivers configured via attributes take precedence over the diff.wordregex setting. If the user wants to change them, they have their own configuration variables. 
Signed-off-by: Boyd Stephen Smith Jr Signed-off-by: Junio C Hamano --- diff.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 9fcde963db..ed8b83c68f 100644 --- a/diff.c +++ b/diff.c @@ -23,6 +23,7 @@ static int diff_detect_rename_default; static int diff_rename_limit_default = 200; static int diff_suppress_blank_empty; int diff_use_color_default = -1; +static const char *diff_word_regex_cfg; static const char *external_diff_cmd_cfg; int diff_auto_refresh_index = 1; static int diff_mnemonic_prefix; @@ -92,6 +93,8 @@ int git_diff_ui_config(const char *var, const char *value, void *cb) } if (!strcmp(var, "diff.external")) return git_config_string(&external_diff_cmd_cfg, var, value); + if (!strcmp(var, "diff.wordregex")) + return git_config_string(&diff_word_regex_cfg, var, value); return git_diff_basic_config(var, value, cb); } @@ -1550,6 +1553,8 @@ static void builtin_diff(const char *name_a, o->word_regex = userdiff_word_regex(one); if (!o->word_regex) o->word_regex = userdiff_word_regex(two); + if (!o->word_regex) + o->word_regex = diff_word_regex_cfg; if (o->word_regex) { ecbdata.diff_words->word_regex = (regex_t *) xmalloc(sizeof(regex_t)); -- cgit v1.2.1