summaryrefslogtreecommitdiff
path: root/src/search.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/search.c')
-rw-r--r--src/search.c129
1 files changed, 41 insertions, 88 deletions
diff --git a/src/search.c b/src/search.c
index 89a4a5ad68e..96ea41e8f8e 100644
--- a/src/search.c
+++ b/src/search.c
@@ -24,7 +24,7 @@ Boston, MA 02111-1307, USA. */
#include "syntax.h"
#include "category.h"
#include "buffer.h"
-#include "charset.h"
+#include "character.h"
#include "region-cache.h"
#include "commands.h"
#include "blockinput.h"
@@ -104,9 +104,8 @@ matcher_overflow ()
subexpression bounds.
POSIX is nonzero if we want full backtracking (POSIX style)
for this pattern. 0 means backtrack only enough to get a valid match.
- MULTIBYTE is nonzero if we want to handle multibyte characters in
- PATTERN. 0 means all multibyte characters are recognized just as
- sequences of binary data. */
+ MULTIBYTE is nonzero iff a target of match is a multibyte buffer or
+ string. */
static void
compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
@@ -117,51 +116,19 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
int posix;
int multibyte;
{
- unsigned char *raw_pattern;
- int raw_pattern_size;
char *val;
reg_syntax_t old;
- /* MULTIBYTE says whether the text to be searched is multibyte.
- We must convert PATTERN to match that, or we will not really
- find things right. */
-
- if (multibyte == STRING_MULTIBYTE (pattern))
- {
- raw_pattern = (unsigned char *) SDATA (pattern);
- raw_pattern_size = SBYTES (pattern);
- }
- else if (multibyte)
- {
- raw_pattern_size = count_size_as_multibyte (SDATA (pattern),
- SCHARS (pattern));
- raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
- copy_text (SDATA (pattern), raw_pattern,
- SCHARS (pattern), 0, 1);
- }
- else
- {
- /* Converting multibyte to single-byte.
-
- ??? Perhaps this conversion should be done in a special way
- by subtracting nonascii-insert-offset from each non-ASCII char,
- so that only the multibyte chars which really correspond to
- the chosen single-byte character set can possibly match. */
- raw_pattern_size = SCHARS (pattern);
- raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
- copy_text (SDATA (pattern), raw_pattern,
- SBYTES (pattern), 1, 0);
- }
-
cp->regexp = Qnil;
cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
cp->posix = posix;
- cp->buf.multibyte = multibyte;
+ cp->buf.multibyte = STRING_MULTIBYTE (pattern);
+ cp->buf.target_multibyte = multibyte;
BLOCK_INPUT;
old = re_set_syntax (RE_SYNTAX_EMACS
| (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
- val = (char *) re_compile_pattern ((char *)raw_pattern,
- raw_pattern_size, &cp->buf);
+ val = (char *) re_compile_pattern ((char *) SDATA (pattern),
+ SBYTES (pattern), &cp->buf);
re_set_syntax (old);
UNBLOCK_INPUT;
if (val)
@@ -222,7 +189,7 @@ compile_pattern (pattern, regp, translate, posix, multibyte)
&& !NILP (Fstring_equal (cp->regexp, pattern))
&& EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
&& cp->posix == posix
- && cp->buf.multibyte == multibyte)
+ && cp->buf.target_multibyte == multibyte)
break;
/* If we're at the end of the cache, compile into the nil cell
@@ -1140,7 +1107,12 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
unsigned char *patbuf;
int multibyte = !NILP (current_buffer->enable_multibyte_characters);
unsigned char *base_pat = SDATA (string);
- int charset_base = -1;
+ /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F)
+ otherwise. Characters of the same high bits have the same
+ sequence of bytes but last. To do the BM search, all
+ characters in STRING must have the same high bits (including
+ their case translations). */
+ int char_high_bits = -1;
int boyer_moore_ok = 1;
/* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1181,16 +1153,15 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
/* Copy and optionally translate the pattern. */
len = raw_pattern_size;
len_byte = raw_pattern_size_byte;
- patbuf = (unsigned char *) alloca (len_byte);
+ patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
pat = patbuf;
base_pat = raw_pattern;
if (multibyte)
{
while (--len >= 0)
{
- unsigned char str[MAX_MULTIBYTE_LENGTH];
int c, translated, inverse;
- int in_charlen, charlen;
+ int in_charlen;
/* If we got here and the RE flag is set, it's because we're
dealing with a regexp known to be trivial, so the backslash
@@ -1206,23 +1177,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
/* Translate the character, if requested. */
TRANSLATE (translated, trt, c);
- /* If translation changed the byte-length, go back
- to the original character. */
- charlen = CHAR_STRING (translated, str);
- if (in_charlen != charlen)
- {
- translated = c;
- charlen = CHAR_STRING (c, str);
- }
-
- /* If we are searching for something strange,
- an invalid multibyte code, don't use boyer-moore. */
- if (! ASCII_BYTE_P (translated)
- && (charlen == 1 /* 8bit code */
- || charlen != in_charlen /* invalid multibyte code */
- ))
- boyer_moore_ok = 0;
-
TRANSLATE (inverse, inverse_trt, c);
/* Did this char actually get translated?
@@ -1231,22 +1185,22 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
{
/* Keep track of which character set row
contains the characters that need translation. */
- int charset_base_code = c & ~CHAR_FIELD3_MASK;
- int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
+ int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F);
+ int c1 = inverse != c ? inverse : translated;
+ int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F);
- if (charset_base_code != inverse_charset_base)
+ if (this_high_bit != trt_high_bit)
boyer_moore_ok = 0;
- else if (charset_base == -1)
- charset_base = charset_base_code;
- else if (charset_base != charset_base_code)
+ else if (char_high_bits == -1)
+ char_high_bits = this_high_bit;
+ else if (char_high_bits != this_high_bit)
/* If two different rows appear, needing translation,
then we cannot use boyer_moore search. */
boyer_moore_ok = 0;
}
/* Store this character into the translated pattern. */
- bcopy (str, pat, charlen);
- pat += charlen;
+ CHAR_STRING_ADVANCE (translated, pat);
base_pat += in_charlen;
len_byte -= in_charlen;
}
@@ -1254,7 +1208,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
else
{
/* Unibyte buffer. */
- charset_base = 0;
+ char_high_bits = 0;
while (--len >= 0)
{
int c, translated;
@@ -1280,7 +1234,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
if (boyer_moore_ok)
return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
pos, pos_byte, lim, lim_byte,
- charset_base);
+ char_high_bits);
else
return simple_search (n, pat, len, len_byte, trt,
pos, pos_byte, lim, lim_byte);
@@ -1513,7 +1467,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
static int
boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
- pos, pos_byte, lim, lim_byte, charset_base)
+ pos, pos_byte, lim, lim_byte, char_high_bits)
int n;
unsigned char *base_pat;
int len, len_byte;
@@ -1521,7 +1475,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
Lisp_Object inverse_trt;
int pos, pos_byte;
int lim, lim_byte;
- int charset_base;
+ int char_high_bits;
{
int direction = ((n > 0) ? 1 : -1);
register int dirlen;
@@ -1622,7 +1576,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
while (! CHAR_HEAD_P (*charstart))
charstart--;
untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
- if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
+ if (char_high_bits
+ == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F))
{
TRANSLATE (ch, trt, untranslated);
if (! CHAR_HEAD_P (*ptr))
@@ -1646,8 +1601,9 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
this_translated = 0;
}
- if (ch > 0400)
- j = ((unsigned char) ch) | 0200;
+ if (this_translated
+ && ch >= 0200)
+ j = (ch & 0x3F) | 0200;
else
j = (unsigned char) ch;
@@ -1664,8 +1620,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
while (1)
{
TRANSLATE (ch, inverse_trt, ch);
- if (ch > 0400)
- j = ((unsigned char) ch) | 0200;
+ if (ch > 0200)
+ j = (ch & 0x3F) | 0200;
else
j = (unsigned char) ch;
@@ -1958,7 +1914,7 @@ wordify (string)
{
int c;
- FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte);
+ FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
if (SYNTAX (c) != Sword)
{
@@ -1993,7 +1949,7 @@ wordify (string)
int c;
int i_byte_orig = i_byte;
- FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte);
+ FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
if (SYNTAX (c) == Sword)
{
@@ -2277,11 +2233,11 @@ since only regular expressions have distinguished subexpressions. */)
{
if (NILP (string))
{
- c = FETCH_CHAR (pos_byte);
+ c = FETCH_CHAR_AS_MULTIBYTE (pos_byte);
INC_BOTH (pos, pos_byte);
}
else
- FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte);
+ FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte);
if (LOWERCASEP (c))
{
@@ -2445,10 +2401,7 @@ since only regular expressions have distinguished subexpressions. */)
Lisp_Object rev_tbl;
int really_changed = 0;
- rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table)
- ? Fchar_table_extra_slot (Vnonascii_translation_table,
- make_number (0))
- : Qnil);
+ rev_tbl = Qnil;
substed_alloc_size = length * 2 + 100;
substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
@@ -2491,7 +2444,7 @@ since only regular expressions have distinguished subexpressions. */)
{
FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext,
pos, pos_byte);
- if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c))
+ if (!buf_multibyte && !ASCII_CHAR_P (c))
c = multibyte_char_to_unibyte (c, rev_tbl);
}
else