diff options
Diffstat (limited to 'src/search.c')
-rw-r--r-- | src/search.c | 129 |
1 files changed, 41 insertions, 88 deletions
diff --git a/src/search.c b/src/search.c index 89a4a5ad68e..96ea41e8f8e 100644 --- a/src/search.c +++ b/src/search.c @@ -24,7 +24,7 @@ Boston, MA 02111-1307, USA. */ #include "syntax.h" #include "category.h" #include "buffer.h" -#include "charset.h" +#include "character.h" #include "region-cache.h" #include "commands.h" #include "blockinput.h" @@ -104,9 +104,8 @@ matcher_overflow () subexpression bounds. POSIX is nonzero if we want full backtracking (POSIX style) for this pattern. 0 means backtrack only enough to get a valid match. - MULTIBYTE is nonzero if we want to handle multibyte characters in - PATTERN. 0 means all multibyte characters are recognized just as - sequences of binary data. */ + MULTIBYTE is nonzero iff a target of match is a multibyte buffer or + string. */ static void compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) @@ -117,51 +116,19 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) int posix; int multibyte; { - unsigned char *raw_pattern; - int raw_pattern_size; char *val; reg_syntax_t old; - /* MULTIBYTE says whether the text to be searched is multibyte. - We must convert PATTERN to match that, or we will not really - find things right. */ - - if (multibyte == STRING_MULTIBYTE (pattern)) - { - raw_pattern = (unsigned char *) SDATA (pattern); - raw_pattern_size = SBYTES (pattern); - } - else if (multibyte) - { - raw_pattern_size = count_size_as_multibyte (SDATA (pattern), - SCHARS (pattern)); - raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1); - copy_text (SDATA (pattern), raw_pattern, - SCHARS (pattern), 0, 1); - } - else - { - /* Converting multibyte to single-byte. - - ??? Perhaps this conversion should be done in a special way - by subtracting nonascii-insert-offset from each non-ASCII char, - so that only the multibyte chars which really correspond to - the chosen single-byte character set can possibly match. */ - raw_pattern_size = SCHARS (pattern); - raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1); - copy_text (SDATA (pattern), raw_pattern, - SBYTES (pattern), 1, 0); - } - cp->regexp = Qnil; cp->buf.translate = (! NILP (translate) ? translate : make_number (0)); cp->posix = posix; - cp->buf.multibyte = multibyte; + cp->buf.multibyte = STRING_MULTIBYTE (pattern); + cp->buf.target_multibyte = multibyte; BLOCK_INPUT; old = re_set_syntax (RE_SYNTAX_EMACS | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); - val = (char *) re_compile_pattern ((char *)raw_pattern, - raw_pattern_size, &cp->buf); + val = (char *) re_compile_pattern ((char *) SDATA (pattern), + SBYTES (pattern), &cp->buf); re_set_syntax (old); UNBLOCK_INPUT; if (val) @@ -222,7 +189,7 @@ compile_pattern (pattern, regp, translate, posix, multibyte) && !NILP (Fstring_equal (cp->regexp, pattern)) && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0))) && cp->posix == posix - && cp->buf.multibyte == multibyte) + && cp->buf.target_multibyte == multibyte) break; /* If we're at the end of the cache, compile into the nil cell @@ -1140,7 +1107,12 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, unsigned char *patbuf; int multibyte = !NILP (current_buffer->enable_multibyte_characters); unsigned char *base_pat = SDATA (string); - int charset_base = -1; + /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F) + otherwise. Characters of the same high bits have the same + sequence of bytes but last. To do the BM search, all + characters in STRING must have the same high bits (including + their case translations). */ + int char_high_bits = -1; int boyer_moore_ok = 1; /* MULTIBYTE says whether the text to be searched is multibyte. @@ -1181,16 +1153,15 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, /* Copy and optionally translate the pattern. */ len = raw_pattern_size; len_byte = raw_pattern_size_byte; - patbuf = (unsigned char *) alloca (len_byte); + patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH); pat = patbuf; base_pat = raw_pattern; if (multibyte) { while (--len >= 0) { - unsigned char str[MAX_MULTIBYTE_LENGTH]; int c, translated, inverse; - int in_charlen, charlen; + int in_charlen; /* If we got here and the RE flag is set, it's because we're dealing with a regexp known to be trivial, so the backslash @@ -1206,23 +1177,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, /* Translate the character, if requested. */ TRANSLATE (translated, trt, c); - /* If translation changed the byte-length, go back - to the original character. */ - charlen = CHAR_STRING (translated, str); - if (in_charlen != charlen) - { - translated = c; - charlen = CHAR_STRING (c, str); - } - - /* If we are searching for something strange, - an invalid multibyte code, don't use boyer-moore. */ - if (! ASCII_BYTE_P (translated) - && (charlen == 1 /* 8bit code */ - || charlen != in_charlen /* invalid multibyte code */ - )) - boyer_moore_ok = 0; - TRANSLATE (inverse, inverse_trt, c); /* Did this char actually get translated? @@ -1231,22 +1185,22 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, { /* Keep track of which character set row contains the characters that need translation. */ - int charset_base_code = c & ~CHAR_FIELD3_MASK; - int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK; + int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F); + int c1 = inverse != c ? inverse : translated; + int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F); - if (charset_base_code != inverse_charset_base) + if (this_high_bit != trt_high_bit) boyer_moore_ok = 0; - else if (charset_base == -1) - charset_base = charset_base_code; - else if (charset_base != charset_base_code) + else if (char_high_bits == -1) + char_high_bits = this_high_bit; + else if (char_high_bits != this_high_bit) /* If two different rows appear, needing translation, then we cannot use boyer_moore search. */ boyer_moore_ok = 0; } /* Store this character into the translated pattern. */ - bcopy (str, pat, charlen); - pat += charlen; + CHAR_STRING_ADVANCE (translated, pat); base_pat += in_charlen; len_byte -= in_charlen; } @@ -1254,7 +1208,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, else { /* Unibyte buffer. */ - charset_base = 0; + char_high_bits = 0; while (--len >= 0) { int c, translated; @@ -1280,7 +1234,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, if (boyer_moore_ok) return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, pos, pos_byte, lim, lim_byte, - charset_base); + char_high_bits); else return simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte); @@ -1513,7 +1467,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) static int boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, - pos, pos_byte, lim, lim_byte, charset_base) + pos, pos_byte, lim, lim_byte, char_high_bits) int n; unsigned char *base_pat; int len, len_byte; @@ -1521,7 +1475,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, Lisp_Object inverse_trt; int pos, pos_byte; int lim, lim_byte; - int charset_base; + int char_high_bits; { int direction = ((n > 0) ? 1 : -1); register int dirlen; @@ -1622,7 +1576,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, while (! CHAR_HEAD_P (*charstart)) charstart--; untranslated = STRING_CHAR (charstart, ptr - charstart + 1); - if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) + if (char_high_bits + == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F)) { TRANSLATE (ch, trt, untranslated); if (! CHAR_HEAD_P (*ptr)) @@ -1646,8 +1601,9 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, this_translated = 0; } - if (ch > 0400) - j = ((unsigned char) ch) | 0200; + if (this_translated + && ch >= 0200) + j = (ch & 0x3F) | 0200; else j = (unsigned char) ch; @@ -1664,8 +1620,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, while (1) { TRANSLATE (ch, inverse_trt, ch); - if (ch > 0400) - j = ((unsigned char) ch) | 0200; + if (ch > 0200) + j = (ch & 0x3F) | 0200; else j = (unsigned char) ch; @@ -1958,7 +1914,7 @@ wordify (string) { int c; - FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); + FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte); if (SYNTAX (c) != Sword) { @@ -1993,7 +1949,7 @@ wordify (string) int c; int i_byte_orig = i_byte; - FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); + FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte); if (SYNTAX (c) == Sword) { @@ -2277,11 +2233,11 @@ since only regular expressions have distinguished subexpressions. */) { if (NILP (string)) { - c = FETCH_CHAR (pos_byte); + c = FETCH_CHAR_AS_MULTIBYTE (pos_byte); INC_BOTH (pos, pos_byte); } else - FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte); + FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte); if (LOWERCASEP (c)) { @@ -2445,10 +2401,7 @@ since only regular expressions have distinguished subexpressions. */) Lisp_Object rev_tbl; int really_changed = 0; - rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table) - ? Fchar_table_extra_slot (Vnonascii_translation_table, - make_number (0)) - : Qnil); + rev_tbl = Qnil; substed_alloc_size = length * 2 + 100; substed = (unsigned char *) xmalloc (substed_alloc_size + 1); @@ -2491,7 +2444,7 @@ since only regular expressions have distinguished subexpressions. */) { FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, pos, pos_byte); - if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c)) + if (!buf_multibyte && !ASCII_CHAR_P (c)) c = multibyte_char_to_unibyte (c, rev_tbl); } else |