diff options
author | Karl Williamson <khw@cpan.org> | 2015-02-17 15:03:32 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-02-19 22:55:01 -0700 |
commit | 64935bc6975bb01af403817752e88d6540c8711d (patch) | |
tree | 6a1619ac46d1501a5b296a2b69d9af0b98db8c58 | |
parent | 0e0b935601a8b7a2c56653412a94a36f986bc34f (diff) | |
download | perl-64935bc6975bb01af403817752e88d6540c8711d.tar.gz |
Add qr/\b{gcb}/
A function implements seeing if the space between any two characters is
a grapheme cluster break. Afer I wrote this, I realized that an array
lookup might be a better implementation, but the deadline for v5.22 was
too close to change it. I did see that my gcc optimized it down to
an array lookup.
This makes the implementation of \X go from being complicated to
trivial.
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | embedvar.h | 1 | ||||
-rw-r--r-- | intrpvar.h | 1 | ||||
-rw-r--r-- | lib/unicore/mktables | 47 | ||||
-rw-r--r-- | perl.c | 3 | ||||
-rw-r--r-- | perl.h | 1 | ||||
-rw-r--r-- | pod/perlcheat.pod | 2 | ||||
-rw-r--r-- | pod/perldebguts.pod | 26 | ||||
-rw-r--r-- | pod/perldelta.pod | 9 | ||||
-rw-r--r-- | pod/perldiag.pod | 24 | ||||
-rw-r--r-- | pod/perlre.pod | 12 | ||||
-rw-r--r-- | pod/perlrebackslash.pod | 39 | ||||
-rw-r--r-- | pod/perlreref.pod | 2 | ||||
-rw-r--r-- | pod/perlunicode.pod | 8 | ||||
-rw-r--r-- | proto.h | 3 | ||||
-rw-r--r-- | regcomp.c | 96 | ||||
-rw-r--r-- | regcomp.h | 5 | ||||
-rw-r--r-- | regcomp.sym | 16 | ||||
-rw-r--r-- | regexec.c | 500 | ||||
-rw-r--r-- | regnodes.h | 16 | ||||
-rw-r--r-- | sv.c | 1 | ||||
-rw-r--r-- | t/lib/warnings/regexec | 15 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 17 | ||||
-rw-r--r-- | utf8.c | 1 |
25 files changed, 587 insertions, 260 deletions
@@ -2276,6 +2276,7 @@ Es |void |to_utf8_substr |NN regexp * prog Es |bool |to_byte_substr |NN regexp * prog ERsn |I32 |reg_check_named_buff_matched |NN const regexp *rex \ |NN const regnode *scan +EsnR |bool |isGCB |const PL_GCB_enum before|const PL_GCB_enum after # ifdef DEBUGGING Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\ |NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8 @@ -1054,6 +1054,7 @@ #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) #define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b) #define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b) +#define isGCB S_isGCB #define reg_check_named_buff_matched S_reg_check_named_buff_matched #define regcppop(a,b) S_regcppop(aTHX_ a,b) #define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c) diff --git a/embedvar.h b/embedvar.h index da3c331634..dde2340c73 100644 --- a/embedvar.h +++ b/embedvar.h @@ -53,6 +53,7 @@ #define PL_DBtrace (vTHX->IDBtrace) #define PL_Dir (vTHX->IDir) #define PL_Env (vTHX->IEnv) +#define PL_GCB_invlist (vTHX->IGCB_invlist) #define PL_HasMultiCharFold (vTHX->IHasMultiCharFold) #define PL_InBitmap (vTHX->IInBitmap) #define PL_LIO (vTHX->ILIO) diff --git a/intrpvar.h b/intrpvar.h index b88f6dfb0b..dc44b31cd7 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -610,6 +610,7 @@ PERLVAR(I, utf8_charname_continue, SV *) PERLVARA(I, utf8_swash_ptrs, POSIX_SWASH_COUNT, SV *) PERLVARA(I, Posix_ptrs, POSIX_CC_COUNT, SV *) PERLVARA(I, XPosix_ptrs, POSIX_CC_COUNT, SV *) +PERLVAR(I, GCB_invlist, SV *) PERLVAR(I, last_swash_hv, HV *) PERLVAR(I, last_swash_tmps, U8 *) diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 2da7bb3416..511ad020ee 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -18762,6 +18762,7 @@ sub _test_break($$) { my @should_match = map { eval "\"$_\"" } @should_display; # If a string can be represented in both non-ut8 and utf8, test both cases + my $display_upgrade = ""; UPGRADE: for my $to_upgrade (0 .. 1) { @@ -18771,8 +18772,54 @@ sub _test_break($$) { next UPGRADE if utf8::is_utf8($string); utf8::upgrade($string); + $display_upgrade = " (utf8-upgraded)"; + } + + # The /l modifier has C after it to indicate the locale to try + my @modifiers = qw(a aa d lC u i); + push @modifiers, "l$utf8_locale" if defined $utf8_locale; + + # Test for each of the regex modifiers. + for my $modifier (@modifiers) { + my $display_locale = ""; + + # For /l, set the locale to what it says to. + if ($modifier =~ / ^ l (.*) /x) { + my $locale = $1; + $display_locale = "(locale = $locale)"; + use Config; + if (defined $Config{d_setlocale}) { + eval { require POSIX; import POSIX 'locale_h'; }; + if (defined &POSIX::LC_CTYPE) { + POSIX::setlocale(&POSIX::LC_CTYPE, $locale); + } + } + $modifier = 'l'; + } + + no warnings qw(locale regexp surrogate); + my $pattern = "(?$modifier:$break_pattern)"; + + # Actually do the test + my $matched = $string =~ qr/$pattern/; + print "not " unless $matched; + + # Fancy display of test results + $matched = ($matched) ? "matched" : "failed to match"; + print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n"; + + # Repeat with the first \B{} in the pattern. This makes sure the + # code in regexec.c:find_byclass() for \B gets executed + if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { + my $B_pattern = "$1$2"; + $matched = $string =~ qr/$B_pattern/; + print "not " unless $matched; + print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n"; + } } + next if $break_type ne 'gcb'; + # Finally, do the \X match. my @matches = $string =~ /(\X)/g; @@ -33,7 +33,6 @@ #include "perl.h" #include "patchlevel.h" /* for local_patches */ #include "XSUB.h" -#include "charclass_invlists.h" #ifdef NETWARE #include "nwutil.h" @@ -391,6 +390,7 @@ perl_construct(pTHXx) PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist); PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); + PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist); ENTER; } @@ -1060,6 +1060,7 @@ perl_destruct(pTHXx) SvREFCNT_dec(PL_XPosix_ptrs[i]); PL_XPosix_ptrs[i] = NULL; } + PL_GCB_invlist = NULL; if (!specialWARN(PL_compiling.cop_warnings)) PerlMemShared_free(PL_compiling.cop_warnings); @@ -2685,6 +2685,7 @@ typedef struct padname PADNAME; #endif #include "handy.h" +#include "charclass_invlists.h" #if defined(USE_LARGE_FILES) && !defined(NO_64_BIT_RAWIO) # if LSEEKSIZE == 8 && !defined(USE_64_BIT_RAWIO) diff --git a/pod/perlcheat.pod b/pod/perlcheat.pod index f288692a87..6e4e919ff5 100644 --- a/pod/perlcheat.pod +++ b/pod/perlcheat.pod @@ -46,7 +46,7 @@ already be overwhelming. , => /a ASCII /aa safe {3,7} repeat in range list ops /l locale /d dual | alternation not /u Unicode [] character class - and /e evaluate /ee rpts \b word boundary + and /e evaluate /ee rpts \b boundary or xor /g global \z string end /o compile pat once () capture DEBUG (?:p) no capture diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 57fa1f42ab..591e69bd12 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -573,19 +573,23 @@ will be lost. # Word Boundary Opcodes: BOUND no Match "" at any word boundary using native - charset rules for non-utf8 - BOUNDL no Match "" at any locale word boundary - BOUNDU no Match "" at any word boundary using Unicode - rules - BOUNDA no Match "" at any word boundary using ASCII - rules + charset rules for non-utf8, otherwise + Unicode rules + BOUNDL no Match "" at any boundary of a given type + using locale rules + BOUNDU no Match "" at any boundary of a given type + using Unicode rules + BOUNDA no Match "" at any boundary of a given type + using ASCII rules NBOUND no Match "" at any word non-boundary using - native charset rules for non-utf8 - NBOUNDL no Match "" at any locale word non-boundary - NBOUNDU no Match "" at any word non-boundary using + native charset rules for non-utf8, otherwise Unicode rules - NBOUNDA no Match "" at any word non-boundary using - ASCII rules + NBOUNDL no Match "" at any boundary of a given type + using locale rules + NBOUNDU no Match "" at any boundary of a given type + using using Unicode rules + NBOUNDA no Match "" at any boundary of a given type + using using ASCII rules # [Special] alternatives: REG_ANY no Match any one character (except newline). diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 5a80e745d9..5db41e2bdc 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -25,7 +25,14 @@ XXX New core language features go here. Summarize user-visible core language enhancements. Particularly prominent performance optimisations could go here, but most should go in the L</Performance Enhancements> section. -[ List each enhancement as a =head2 entry ] +=head2 qr/\b{gcb}/ is now handled in regular expressions + +C<gcb> stands for Grapheme Cluster Boundary. It is a Unicode property +that finds the boundary between sequences of characters that look like a +single character to a native speaker of a language. Perl has long had +the ability to deal with these through the C<\X> regular escape +sequence. Now, there is an alternative way of handling these. See +L<perlrebackslash/\b{}, \b, \B{}, \B> for details. =head1 Security diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 877b992270..7db5b5414d 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2894,6 +2894,12 @@ with 'useperlio'. (F) Your machine doesn't implement the sockatmark() functionality, neither as a system call nor an ioctl call (SIOCATMARK). +=item '%s' is an unknown bound type in regex; marked by <-- HERE in m/%s/ + +(F) You used C<\b{...}> or C<\B{...}> and the C<...> is not known to +Perl. The current valid ones are given in +L<perlrebackslash/\b{}, \b, \B{}, \B>. + =item "%s" is more clearly written simply as "%s" in regex; marked by <-- HERE in m/%s/ (W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>) @@ -6638,6 +6644,15 @@ is deprecated. See L<perlvar/"$[">. form if you wish to use an empty line as the terminator of the here-document. +=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale + +(W locale) You are matching a regular expression using locale rules, +and a Unicode boundary is being matched, but the locale is not a Unicode +one. This doesn't make sense. Perl will continue, assuming a Unicode +(UTF-8) locale, but the results could well be wrong except if the locale +happens to be ISO-8859-1 (Latin1) where this message is spurious and can +be ignored. + =item Use of chdir('') or chdir(undef) as chdir() deprecated (D deprecated) chdir() with no arguments is documented to change to @@ -6859,6 +6874,15 @@ a range. For these, what should happen isn't clear at all. In these circumstances, Perl discards all but the first character of the returned sequence, which is not likely what you want. +=item Using /u for '%s' instead of /%s in regex; marked by <-- HERE in m/%s/ + +(W regexp) You used a Unicode boundary (C<\b{...}> or C<\B{...}>) in a +portion of a regular expression where the character set modifiers C</a> +or C</aa> are in effect. These two modifiers indicate an ASCII +interpretation, and this doesn't make sense for a Unicode definiton. +The generated regular expression will compile so that the boundary uses +all of Unicode. No other portion of the regular expression is affected. + =item Using !~ with %s doesn't make sense (F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is diff --git a/pod/perlre.pod b/pod/perlre.pod index 4231e99591..90858b1157 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -388,6 +388,10 @@ the pattern uses a Unicode property (C<\p{...}> or C<\P{...}>); or =item 6 +the pattern uses a Unicode break (C<\b{...}> or C<\B{...}>); or + +=item 7 + the pattern uses L</C<(?[ ])>> =back @@ -770,6 +774,8 @@ X<regexp, zero-width assertion> X<regular expression, zero-width assertion> X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G> + \b{} Match at Unicode boundary of specified type + \B{} Match where corresponding \b{} doesn't match \b Match a word boundary \B Match except at a word boundary \A Match only at beginning of string @@ -778,6 +784,12 @@ X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G> \G Match only at pos() (e.g. at the end-of-match position of prior m//g) +A Unicode boundary (C<\b{}>), available starting in v5.22, is a spot +between two characters, or before the first character in the string, or +after the final character in the string where certain criteria defined +by Unicode are met. See L<perlrebackslash/\b{}, \b, \B{}, \B> for +details. + A word boundary (C<\b>) is a spot between two characters that has a C<\w> on one side of it and a C<\W> on the other side of it (in either order), counting the imaginary characters off the diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod index 230e76dea8..ea460cb82c 100644 --- a/pod/perlrebackslash.pod +++ b/pod/perlrebackslash.pod @@ -66,8 +66,8 @@ as C<Not in [].> \1 Absolute backreference. Not in []. \a Alarm or bell. \A Beginning of string. Not in []. - \b Word/non-word boundary. (Backspace in []). - \B Not a word/non-word boundary. Not in []. + \b{}, \b Boundary. (\b is a backspace in []). + \B{}, \B Not a boundary. \cX Control-X. \C Single octet, even under UTF-8. Not in []. (Deprecated) @@ -134,7 +134,8 @@ description. (For EBCDIC platforms, see L<perlebcdic/OPERATOR DIFFERENCES>.) =item [1] C<\b> is the backspace character only inside a character class. Outside a -character class, C<\b> is a word/non-word boundary. +character class, C<\b> alone is a word-character/non-word-character +boundary, and C<\b{}> is some other type of boundary. =item [2] @@ -525,10 +526,21 @@ or the beginning of that string if there was no previous match. Mnemonic: I<G>lobal. -=item \b, \B +=item \b{}, \b, \B{}, \B -C<\b> matches at any place between a word and a non-word character; C<\B> -matches at any place between characters where C<\b> doesn't match. C<\b> +C<\b{...}>, available starting in v5.22, matches a boundary (between two +characters, or before the first character of the string, or after the +final character of the string) based on the Unicode rules for the +boundary type specified inside the braces. The currently known boundary +types are given a few paragraphs below. C<\B{...}> matches at any place +between characters where C<\b{...}> of the same type doesn't match. + +C<\b> when not immediately followed by a C<"{"> matches at any place +between a word (something matched by C<\w>) and a non-word character +(C<\W>); C<\B> when not immediately followed by a C<"{"> matches at any +place between characters where C<\b> doesn't match. + +C<\b> and C<\B> assume there's a non-word character before the beginning and after the end of the source string; so C<\b> will match at the beginning (or end) of the source string if the source string begins (or ends) with a word @@ -537,13 +549,22 @@ character. Otherwise, C<\B> will match. Do not use something like C<\b=head\d\b> and expect it to match the beginning of a line. It can't, because for there to be a boundary before the non-word "=", there must be a word character immediately previous. -All boundary determinations look for word characters alone, not for -non-words characters nor for string ends. It may help to understand how +All plain C<\b> and C<\B> boundary determinations look for word +characters alone, not for +non-word characters nor for string ends. It may help to understand how <\b> and <\B> work by equating them as follows: \b really means (?:(?<=\w)(?!\w)|(?<!\w)(?=\w)) \B really means (?:(?<=\w)(?=\w)|(?<!\w)(?!\w)) +In contrast, C<\b{...}> always matches at the beginning and end of the +line (and C<\B{...}> never does). The only boundary type currently +"Grapheme Cluster Boundary". (Actually Perl always uses the improved +"extended" grapheme cluster"). These are explained below under C<\X>. +In fact, C<\X> is another way to get the same functionality. It is +equivalent to C</.+?\b{gcb}/>. Use whichever is most convenient for +your situation. + Mnemonic: I<b>oundary. =back @@ -650,6 +671,8 @@ were a single character. The match is greedy and non-backtracking, so that the cluster is never broken up into smaller components. +See also L<C<\b{gcb}>|/\b{}, \b, \B{}, \B>. + Mnemonic: eI<X>tended Unicode character. =back diff --git a/pod/perlreref.pod b/pod/perlreref.pod index 7ae8f6cfdf..bc4bef76f4 100644 --- a/pod/perlreref.pod +++ b/pod/perlreref.pod @@ -201,6 +201,8 @@ All are zero-width assertions. ^ Match string start (or line, if /m is used) $ Match string end (or line, if /m is used) or before newline + \b{} Match boundary of type specified within the braces + \B{} Match wherever \b{} doesn't match \b Match word boundary (between \w and \W) \B Match except at word boundary (between \w and \w or \W and \W) \A Match string start (regardless of /m) diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 0482d92596..ee99198e2d 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -1100,7 +1100,8 @@ Level 2 - Extended Unicode Support [10] see UAX#15 "Unicode Normalization Forms" [11] have Unicode::Normalize but not integrated to regexes - [12] have \X but we don't have a "Grapheme Cluster Mode" + [12] have \X and \b{gcb} but we don't have a "Grapheme Cluster + Mode" [14] see UAX#29, Word Boundaries [15] This is covered in Chapter 3.13 (in Unicode 6.0) @@ -1575,8 +1576,9 @@ regular expressions outside the scope. =item * -Matching any of several properties in regular expressions, namely C<\b>, -C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes +Matching any of several properties in regular expressions, namely +C<\b> (without braces), C<\B> (without braces), C<\s>, C<\S>, C<\w>, +C<\W>, and all the Posix character classes I<except> C<[[:ascii:]]>. Starting in Perl 5.14.0, regular expressions compiled within the scope of C<unicode_strings> use character semantics @@ -7432,6 +7432,9 @@ STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) #define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \ assert(character) +STATIC bool S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after) + __attribute__warn_unused_result__; + STATIC I32 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) __attribute__warn_unused_result__ __attribute__nonnull__(1) @@ -87,7 +87,6 @@ EXTERN_C const struct regexp_engine my_reg_engine; #endif #include "dquote_static.c" -#include "charclass_invlists.h" #include "inline_invlist.c" #include "unicode_constants.h" @@ -11772,27 +11771,90 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) invert = 1; /* FALLTHROUGH */ case 'b': + { + regex_charset charset = get_regex_charset(RExC_flags); + RExC_seen_zerolen++; RExC_seen |= REG_LOOKBEHIND_SEEN; - op = BOUND + get_regex_charset(RExC_flags); - if (op > BOUNDA) { /* /aa is same as /a */ - op = BOUNDA; - } - else if (op == BOUNDL) { - RExC_contains_locale = 1; - } + op = BOUND + charset; - if (invert) { - op += NBOUND - BOUND; + if (op == BOUNDL) { + RExC_contains_locale = 1; } ret = reg_node(pRExC_state, op); *flagp |= SIMPLE; - if ((U8) *(RExC_parse + 1) == '{') { - /* diag_listed_as: Use "%s" instead of "%s" */ - vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse); + if (*(RExC_parse + 1) != '{') { + FLAGS(ret) = TRADITIONAL_BOUND; + if (PASS2 && op > BOUNDA) { /* /aa is same as /a */ + OP(ret) = BOUNDA; + } + } + else { + STRLEN length; + char name = *RExC_parse; + char * endbrace; + RExC_parse += 2; + endbrace = strchr(RExC_parse, '}'); + + if (! endbrace) { + vFAIL2("Missing right brace on \\%c{}", name); + } + /* XXX Need to decide whether to take spaces or not. Should be + * consistent with \p{}, but that currently is SPACE, which + * means vertical too, which seems wrong + * while (isBLANK(*RExC_parse)) { + RExC_parse++; + }*/ + if (endbrace == RExC_parse) { + RExC_parse++; /* After the '}' */ + vFAIL2("Empty \\%c{}", name); + } + length = endbrace - RExC_parse; + /*while (isBLANK(*(RExC_parse + length - 1))) { + length--; + }*/ + switch (*RExC_parse) { + case 'g': + if (length != 1 + && (length != 3 || strnNE(RExC_parse + 1, "cb", 2))) + { + goto bad_bound_type; + } + FLAGS(ret) = GCB_BOUND; + break; + default: + bad_bound_type: + RExC_parse = endbrace; + vFAIL2utf8f( + "'%"UTF8f"' is an unknown bound type", + UTF8fARG(UTF, length, endbrace - length)); + NOT_REACHED; /*NOTREACHED*/ + } + RExC_parse = endbrace; + RExC_uni_semantics = 1; + + if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */ + OP(ret) = BOUNDU; + length += 4; + + /* Don't have to worry about UTF-8, in this message because + * to get here the contents of the \b must be ASCII */ + ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ + "Using /u for '%.*s' instead of /%s", + (unsigned) length, + endbrace - length + 1, + (charset == REGEX_ASCII_RESTRICTED_CHARSET) + ? ASCII_RESTRICT_PAT_MODS + : ASCII_MORE_RESTRICT_PAT_MODS); + } } + + if (PASS2 && invert) { + OP(ret) += NBOUND - BOUND; + } goto finish_meta_pat; + } case 'D': invert = 1; @@ -16735,6 +16797,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } + else if (k == BOUND || k == NBOUND) { + /* Must be synced with order of 'bound_type' in regcomp.h */ + const char * const bounds[] = { + "", /* Traditional */ + "{gcb}" + }; + sv_catpv(sv, bounds[FLAGS(o)]); + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); else if (OP(o) == SBOL) @@ -993,6 +993,11 @@ re.pm, especially to the documentation. #endif /* DEBUG RELATED DEFINES */ +typedef enum { + TRADITIONAL_BOUND = _CC_WORDCHAR, + GCB_BOUND +} bound_type; + /* * Local variables: * c-indentation-style: bsd diff --git a/regcomp.sym b/regcomp.sym index c20c5aaad1..7daa241dba 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -43,15 +43,15 @@ GPOS GPOS, no ; Matches where last m//g left off. # in regcomp.c uses the enum value of the modifier as an offset from the /d # version. The complements must come after the non-complements. # BOUND, POSIX and their complements are affected, as well as EXACTF. -BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8 -BOUNDL BOUND, no ; Match "" at any locale word boundary -BOUNDU BOUND, no ; Match "" at any word boundary using Unicode rules -BOUNDA BOUND, no ; Match "" at any word boundary using ASCII rules +BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules +BOUNDL BOUND, no ; Match "" at any boundary of a given type using locale rules +BOUNDU BOUND, no ; Match "" at any boundary of a given type using Unicode rules +BOUNDA BOUND, no ; Match "" at any boundary of a given type using ASCII rules # All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones -NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8 -NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary -NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode rules -NBOUNDA NBOUND, no ; Match "" at any word non-boundary using ASCII rules +NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules +NBOUNDL NBOUND, no ; Match "" at any boundary of a given type using locale rules +NBOUNDU NBOUND, no ; Match "" at any boundary of a given type using using Unicode rules +NBOUNDA NBOUND, no ; Match "" at any boundary of a given type using using ASCII rules #* [Special] alternatives: REG_ANY REG_ANY, no 0 S ; Match any one character (except newline). @@ -37,6 +37,9 @@ #include "re_top.h" #endif +#define B_ON_NON_UTF8_LOCALE_IS_WRONG \ + "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" + /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -191,18 +194,6 @@ static const char* const non_utf8_target_but_utf8_required PL_XPosix_ptrs[_CC_WORDCHAR], \ LATIN_CAPITAL_LETTER_SHARP_S_UTF8); -#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ - STMT_START { \ - LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ - "_X_regular_begin", \ - NULL, \ - LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ - LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ - "_X_extend", \ - NULL, \ - COMBINING_GRAVE_ACCENT_UTF8); \ - } STMT_END - #define PLACEHOLDER /* Something for the preprocessor to grab onto */ /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ @@ -262,16 +253,6 @@ static const char* const non_utf8_target_but_utf8_required } \ } STMT_END -/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. - * These are for the pre-composed Hangul syllables, which are all in a - * contiguous block and arranged there in such a way so as to facilitate - * alorithmic determination of their characteristics. As such, they don't need - * a swash, but can be determined by simple arithmetic. Almost all are - * GCB=LVT, but every 28th one is a GCB=LV */ -#define SBASE 0xAC00 /* Start of block */ -#define SCount 11172 /* Length of block */ -#define TCount 28 - #define SLAB_FIRST(s) (&(s)->states[0]) #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) @@ -1728,6 +1709,33 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) +/* Takes a pointer to an inversion list, a pointer to its corresponding + * inversion map, and a code point, and returns the code point's value + * according to the two arrays. It assumes that all code points have a value. + * This is used as the base macro for macros for particular properties */ +#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \ + invmap[_invlist_search(invlist, cp)] + +/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead + * of a code point, returning the value for the first code point in the string. + * And it takes the particular macro name that finds the desired value given a + * code point. Merely convert the UTF-8 to code point and call the cp macro */ +#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \ + (__ASSERT_(pos < strend) \ + /* Note assumes is valid UTF-8 */ \ + (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL)))) + +/* Returns the GCB value for the input code point */ +#define getGCB_VAL_CP(cp) \ + _generic_GET_BREAK_VAL_CP( \ + PL_GCB_invlist, \ + Grapheme_Cluster_Break_invmap, \ + (cp)) + +/* Returns the GCB value for the first code point in the UTF-8 encoded string + * bounded by pos and strend */ +#define getGCB_VAL_UTF8(pos, strend) \ + _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) /* We know what class REx starts with. Try to find this position... */ /* if reginfo->intuit, its a dryrun */ @@ -1937,30 +1945,120 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case BOUNDL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (FLAGS(c) != TRADITIONAL_BOUND) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + B_ON_NON_UTF8_LOCALE_IS_WRONG); + goto do_boundu; + } + FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); break; + case NBOUNDL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (FLAGS(c) != TRADITIONAL_BOUND) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + B_ON_NON_UTF8_LOCALE_IS_WRONG); + goto do_nboundu; + } + FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); break; - case BOUND: + + case BOUND: /* regcomp.c makes sure that this only has the traditional \b + meaning */ + assert(FLAGS(c) == TRADITIONAL_BOUND); + FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); break; - case BOUNDA: + + case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b + meaning */ + assert(FLAGS(c) == TRADITIONAL_BOUND); + FBC_BOUND_A(isWORDCHAR_A); break; - case NBOUND: + + case NBOUND: /* regcomp.c makes sure that this only has the traditional \b + meaning */ + assert(FLAGS(c) == TRADITIONAL_BOUND); + FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); break; - case NBOUNDA: + + case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b + meaning */ + assert(FLAGS(c) == TRADITIONAL_BOUND); + FBC_NBOUND_A(isWORDCHAR_A); break; - case BOUNDU: - FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); - break; + case NBOUNDU: - FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { + FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + } + + do_nboundu: + + to_complement = 1; + /* FALLTHROUGH */ + + case BOUNDU: + do_boundu: + switch((bound_type) FLAGS(c)) { + case TRADITIONAL_BOUND: + FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case GCB_BOUND: + if (s == reginfo->strbeg) { /* GCB always matches at begin and + end */ + if (to_complement ^ cBOOL(reginfo->intuit + || regtry(reginfo, &s))) + { + goto got_it; + } + s += (utf8_target) ? UTF8SKIP(s) : 1; + } + + if (utf8_target) { + PL_GCB_enum before = getGCB_VAL_UTF8( + reghop3((U8*)s, -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s, + (U8*) reginfo->strend); + if (to_complement ^ isGCB(before, after)) { + if (reginfo->intuit || regtry(reginfo, &s)) { + goto got_it; + } + before = after; + } + s += UTF8SKIP(s); + } + } + else { /* Not utf8. Everything is a GCB except between CR and + LF */ + while (s < strend) { + if (to_complement ^ (UCHARAT(s - 1) != '\r' + || UCHARAT(s) != '\n')) + { + if (reginfo->intuit || regtry(reginfo, &s)) { + goto got_it; + } + s++; + } + } + } + + if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) { + goto got_it; + } + break; + } break; + case LNBREAK: REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), is_LNBREAK_latin1_safe(s, strend) @@ -3892,6 +3990,105 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, return TRUE; } +/* This creates a single number by combining two, with 'before' being like the + * 10's digit, but this isn't necessarily base 10; it is base however many + * elements of the enum there are */ +#define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after) + +STATIC bool +S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after) +{ + /* returns a boolean indicating if there is a Grapheme Cluster Boundary + * between the inputs. See http://www.unicode.org/reports/tr29/ */ + + switch (GCBcase(before, after)) { + + /* Break at the start and end of text. + GB1. sot ÷ + GB2. ÷ eot + + Break before and after controls except between CR and LF + GB4. ( Control | CR | LF ) ÷ + GB5. ÷ ( Control | CR | LF ) + + Otherwise, break everywhere. + GB10. Any ÷ Any */ + default: + return TRUE; + + /* Do not break between a CR and LF. + GB3. CR × LF */ + case GCBcase(PL_GCB_CR, PL_GCB_LF): + return FALSE; + + /* Do not break Hangul syllable sequences. + GB6. L × ( L | V | LV | LVT ) */ + case GCBcase(PL_GCB_L, PL_GCB_L): + case GCBcase(PL_GCB_L, PL_GCB_V): + case GCBcase(PL_GCB_L, PL_GCB_LV): + case GCBcase(PL_GCB_L, PL_GCB_LVT): + return FALSE; + + /* GB7. ( LV | V ) × ( V | T ) */ + case GCBcase(PL_GCB_LV, PL_GCB_V): + case GCBcase(PL_GCB_LV, PL_GCB_T): + case GCBcase(PL_GCB_V, PL_GCB_V): + case GCBcase(PL_GCB_V, PL_GCB_T): + return FALSE; + + /* GB8. ( LVT | T) × T */ + case GCBcase(PL_GCB_LVT, PL_GCB_T): + case GCBcase(PL_GCB_T, PL_GCB_T): + return FALSE; + + /* Do not break between regional indicator symbols. + GB8a. Regional_Indicator × Regional_Indicator */ + case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator): + return FALSE; + + /* Do not break before extending characters. + GB9. × Extend */ + case GCBcase(PL_GCB_Other, PL_GCB_Extend): + case GCBcase(PL_GCB_Extend, PL_GCB_Extend): + case GCBcase(PL_GCB_L, PL_GCB_Extend): + case GCBcase(PL_GCB_LV, PL_GCB_Extend): + case GCBcase(PL_GCB_LVT, PL_GCB_Extend): + case GCBcase(PL_GCB_Prepend, PL_GCB_Extend): + case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend): + case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend): + case GCBcase(PL_GCB_T, PL_GCB_Extend): + case GCBcase(PL_GCB_V, PL_GCB_Extend): + return FALSE; + + /* Do not break before SpacingMarks, or after Prepend characters. + GB9a. × SpacingMark */ + case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_L, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_T, PL_GCB_SpacingMark): + case GCBcase(PL_GCB_V, PL_GCB_SpacingMark): + return FALSE; + + /* GB9b. Prepend × */ + case GCBcase(PL_GCB_Prepend, PL_GCB_Other): + case GCBcase(PL_GCB_Prepend, PL_GCB_L): + case GCBcase(PL_GCB_Prepend, PL_GCB_LV): + case GCBcase(PL_GCB_Prepend, PL_GCB_LVT): + case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend): + case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator): + case GCBcase(PL_GCB_Prepend, PL_GCB_T): + case GCBcase(PL_GCB_Prepend, PL_GCB_V): + return FALSE; + } + + NOT_REACHED; +} + /* returns -1 on failure, $+[0] on success */ STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) @@ -3964,6 +4161,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) int to_complement; /* Invert the result? */ _char_class_number classnum; bool is_utf8_pat = reginfo->is_utf8_pat; + bool match = FALSE; + #ifdef DEBUGGING GET_RE_DEBUG_FLAGS_DECL; @@ -4623,13 +4822,21 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; } - /* XXX At that point regcomp.c would no longer * have to set the FLAGS fields of these */ case NBOUNDL: /* /\B/l */ to_complement = 1; /* FALLTHROUGH */ case BOUNDL: /* /\b/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if (FLAGS(scan) != TRADITIONAL_BOUND) { + if (! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + B_ON_NON_UTF8_LOCALE_IS_WRONG); + } + goto boundu; + } + if (utf8_target) { if (locinput == reginfo->strbeg) ln = isWORDCHAR_LC('\n'); @@ -4696,9 +4903,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* FALLTHROUGH */ case BOUNDU: /* /\b/u */ + + boundu: if (utf8_target) { - bound_utf8: + bound_utf8: + switch((bound_type) FLAGS(scan)) { + case TRADITIONAL_BOUND: ln = (locinput == reginfo->strbeg) ? isWORDCHAR_L1('\n') : isWORDCHAR_utf8(reghop3((U8*)locinput, -1, @@ -4706,18 +4917,55 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) n = (NEXTCHR_IS_EOS) ? isWORDCHAR_L1('\n') : isWORDCHAR_utf8((U8*)locinput); + + match = ln != n; + break; + case GCB_BOUND: + if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { + match = TRUE; /* GCB always matches at begin and + end */ + } + else { + /* Find the gcb values of previous and current + * chars, then see if is a break point */ + match = isGCB(getGCB_VAL_UTF8( + reghop3((U8*)locinput, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend), + getGCB_VAL_UTF8((U8*) locinput, + (U8*) reginfo->strend)); + } + break; + } } - else { + else { /* Not utf8 target */ + switch((bound_type) FLAGS(scan)) { + case TRADITIONAL_BOUND: ln = (locinput == reginfo->strbeg) ? isWORDCHAR_L1('\n') : isWORDCHAR_L1(UCHARAT(locinput - 1)); n = (NEXTCHR_IS_EOS) ? isWORDCHAR_L1('\n') : isWORDCHAR_L1(nextchr); + match = ln != n; + break; + case GCB_BOUND: + if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { + match = TRUE; /* GCB always matches at begin and + end */ + } + else { /* Only CR-LF combo isn't a GCB in 0-255 + range */ + match = UCHARAT(locinput - 1) != '\r' + || UCHARAT(locinput) != '\n'; + } + break; + } } - if (to_complement ^ (ln == n)) { + if (to_complement ^ ! match) { sayNO; } break; @@ -4921,38 +5169,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ - /* From http://www.unicode.org/reports/tr29 (5.2 version). An - extended Grapheme Cluster is: - - CR LF - | Prepend* Begin Extend* - | . - - Begin is: ( Special_Begin | ! Control ) - Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) - Extend is: ( Grapheme_Extend | Spacing_Mark ) - Control is: [ GCB_Control | CR | LF ] - Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) - - If we create a 'Regular_Begin' = Begin - Special_Begin, then - we can rewrite - - Begin is ( Regular_Begin + Special Begin ) - - It turns out that 98.4% of all Unicode code points match - Regular_Begin. Doing it this way eliminates a table match in - the previous implementation for almost all Unicode code points. - - There is a subtlety with Prepend* which showed up in testing. - Note that the Begin, and only the Begin is required in: - | Prepend* Begin Extend* - Also, Begin contains '! Control'. A Prepend must be a - '! Control', which means it must also be a Begin. What it - comes down to is that if we match Prepend* and then find no - suitable Begin afterwards, that if we backtrack the last - Prepend, that one will be a suitable Begin. - */ - if (NEXTCHR_IS_EOS) sayNO; if (! utf8_target) { @@ -4970,147 +5186,27 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else { - /* Utf8: See if is ( CR LF ); already know that locinput < - * reginfo->strend, so locinput+1 is in bounds */ - if ( nextchr == '\r' && locinput+1 < reginfo->strend - && UCHARAT(locinput + 1) == '\n') - { - locinput += 2; - } - else { - STRLEN len; - - /* In case have to backtrack to beginning, then match '.' */ - char *starting = locinput; - - /* In case have to backtrack the last prepend */ - char *previous_prepend = NULL; + /* Get the gcb type for the current character */ + PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput, + (U8*) reginfo->strend); - LOAD_UTF8_CHARCLASS_GCB(); - - /* Match (prepend)* */ - while (locinput < reginfo->strend - && (len = is_GCB_Prepend_utf8(locinput))) - { - previous_prepend = locinput; - locinput += len; - } - - /* As noted above, if we matched a prepend character, but - * the next thing won't match, back off the last prepend we - * matched, as it is guaranteed to match the begin */ - if (previous_prepend - && (locinput >= reginfo->strend - || (! swash_fetch(PL_utf8_X_regular_begin, - (U8*)locinput, utf8_target) - && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) - ) - { - locinput = previous_prepend; - } - - /* Note that here we know reginfo->strend > locinput, as we - * tested that upon input to this switch case, and if we - * moved locinput forward, we tested the result just above - * and it either passed, or we backed off so that it will - * now pass */ - if (swash_fetch(PL_utf8_X_regular_begin, - (U8*)locinput, utf8_target)) { - locinput += UTF8SKIP(locinput); + /* Then scan through the input until we get to the first + * character whose type is supposed to be a gcb with the + * current character. (There is always a break at the + * end-of-input) */ + locinput += UTF8SKIP(locinput); + while (locinput < reginfo->strend) { + PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput, + (U8*) reginfo->strend); + if (isGCB(prev_gcb, cur_gcb)) { + break; } - else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { - - /* Here did not match the required 'Begin' in the - * second term. So just match the very first - * character, the '.' of the final term of the regex */ - locinput = starting + UTF8SKIP(starting); - goto exit_utf8; - } else { - - /* Here is a special begin. It can be composed of - * several individual characters. One possibility is - * RI+ */ - if ((len = is_GCB_RI_utf8(locinput))) { - locinput += len; - while (locinput < reginfo->strend - && (len = is_GCB_RI_utf8(locinput))) - { - locinput += len; - } - } else if ((len = is_GCB_T_utf8(locinput))) { - /* Another possibility is T+ */ - locinput += len; - while (locinput < reginfo->strend - && (len = is_GCB_T_utf8(locinput))) - { - locinput += len; - } - } else { - /* Here, neither RI+ nor T+; must be some other - * Hangul. That means it is one of the others: L, - * LV, LVT or V, and matches: - * L* (L | LVT T* | V * V* T* | LV V* T*) */ - - /* Match L* */ - while (locinput < reginfo->strend - && (len = is_GCB_L_utf8(locinput))) - { - locinput += len; - } - - /* Here, have exhausted L*. If the next character - * is not an LV, LVT nor V, it means we had to have - * at least one L, so matches L+ in the original - * equation, we have a complete hangul syllable. - * Are done. */ + prev_gcb = cur_gcb; + locinput += UTF8SKIP(locinput); + } - if (locinput < reginfo->strend - && is_GCB_LV_LVT_V_utf8(locinput)) - { - /* Otherwise keep going. Must be LV, LVT or V. - * See if LVT, by first ruling out V, then LV */ - if (! is_GCB_V_utf8(locinput) - /* All but every TCount one is LV */ - && (valid_utf8_to_uvchr((U8 *) locinput, - NULL) - - SBASE) - % TCount != 0) - { - locinput += UTF8SKIP(locinput); - } else { - - /* Must be V or LV. Take it, then match - * V* */ - locinput += UTF8SKIP(locinput); - while (locinput < reginfo->strend - && (len = is_GCB_V_utf8(locinput))) - { - locinput += len; - } - } - /* And any of LV, LVT, or V can be followed - * by T* */ - while (locinput < reginfo->strend - && (len = is_GCB_T_utf8(locinput))) - { - locinput += len; - } - } - } - } - - /* Match any extender */ - while (locinput < reginfo->strend - && swash_fetch(PL_utf8_X_extend, - (U8*)locinput, utf8_target)) - { - locinput += UTF8SKIP(locinput); - } - } - exit_utf8: - if (locinput > reginfo->strend) sayNO; } break; diff --git a/regnodes.h b/regnodes.h index 439fa8d7b1..144d6f63b5 100644 --- a/regnodes.h +++ b/regnodes.h @@ -19,14 +19,14 @@ #define MEOL 5 /* 0x05 Same, assuming multiline: /$/m */ #define EOS 6 /* 0x06 Match "" at end of string: /\z/ */ #define GPOS 7 /* 0x07 Matches where last m//g left off. */ -#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8 */ -#define BOUNDL 9 /* 0x09 Match "" at any locale word boundary */ -#define BOUNDU 10 /* 0x0a Match "" at any word boundary using Unicode rules */ -#define BOUNDA 11 /* 0x0b Match "" at any word boundary using ASCII rules */ -#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8 */ -#define NBOUNDL 13 /* 0x0d Match "" at any locale word non-boundary */ -#define NBOUNDU 14 /* 0x0e Match "" at any word non-boundary using Unicode rules */ -#define NBOUNDA 15 /* 0x0f Match "" at any word non-boundary using ASCII rules */ +#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules */ +#define BOUNDL 9 /* 0x09 Match "" at any boundary of a given type using locale rules */ +#define BOUNDU 10 /* 0x0a Match "" at any boundary of a given type using Unicode rules */ +#define BOUNDA 11 /* 0x0b Match "" at any boundary of a given type using ASCII rules */ +#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules */ +#define NBOUNDL 13 /* 0x0d Match "" at any boundary of a given type using locale rules */ +#define NBOUNDU 14 /* 0x0e Match "" at any boundary of a given type using using Unicode rules */ +#define NBOUNDA 15 /* 0x0f Match "" at any boundary of a given type using using ASCII rules */ #define REG_ANY 16 /* 0x10 Match any one character (except newline). */ #define SANY 17 /* 0x11 Match any one character. */ #define CANY 18 /* 0x12 Match any one byte. */ @@ -14944,6 +14944,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, for (i = 0; i < POSIX_CC_COUNT; i++) { PL_XPosix_ptrs[i] = sv_dup_inc(proto_perl->IXPosix_ptrs[i], param); } + PL_GCB_invlist = sv_dup_inc(proto_perl->IGCB_invlist, param); PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param); PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param); PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param); diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index 0c6a16a5ba..3f15db0e19 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -143,3 +143,18 @@ Wide character (U+100) in pattern match (m//) at - line 10. Wide character (U+100) in pattern match (m//) at - line 11. Wide character (U+100) in pattern match (m//) at - line 12. Wide character (U+100) in pattern match (m//) at - line 12. +######## +# NAME \b{} in non-UTF-8 locale +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); +"a" =~ /\b{gcb}/l; +no warnings 'locale'; +"a" =~ /\b{gcb}/l; +EXPECT +Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8. +Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8. diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index c985c8e1b0..452d982d17 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -142,8 +142,6 @@ my @death = '/(?lil:foo)/' => 'Regexp modifier "l" may not appear twice {#} m/(?lil{#}:foo)/', '/(?aaia:foo)/' => 'Regexp modifier "a" may appear a maximum of twice {#} m/(?aaia{#}:foo)/', '/(?i-l:foo)/' => 'Regexp modifier "l" may not appear after the "-" {#} m/(?i-l{#}:foo)/', -'/a\b{cde/' => 'Use "\b\{" instead of "\b{" {#} m/a\{#}b{cde/', -'/a\B{cde/' => 'Use "\B\{" instead of "\B{" {#} m/a\{#}B{cde/', '/((x)/' => 'Unmatched ( {#} m/({#}(x)/', @@ -188,8 +186,17 @@ my @death = '/[z-a]/' => 'Invalid [] range "z-a" {#} m/[z-a{#}]/', '/\p/' => 'Empty \p{} {#} m/\p{#}/', - '/\P{}/' => 'Empty \P{} {#} m/\P{{#}}/', + +'/a\b{cde/' => 'Missing right brace on \b{} {#} m/a\b{{#}cde/', +'/a\B{cde/' => 'Missing right brace on \B{} {#} m/a\B{{#}cde/', + + '/\b{}/' => 'Empty \b{} {#} m/\b{}{#}/', + '/\B{}/' => 'Empty \B{} {#} m/\B{}{#}/', + + '/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/", + '/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/", + '/(?[[[:word]]])/' => "Unmatched ':' in POSIX class {#} m/(?[[[:word{#}]]])/", '/(?[[:word]])/' => "Unmatched ':' in POSIX class {#} m/(?[[:word{#}]])/", '/(?[[[:digit: ])/' => "Unmatched '[' in POSIX class {#} m/(?[[[:digit:{#} ])/", @@ -417,6 +424,8 @@ my @death_utf8 = mark_as_utf8( '/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/', 'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>, '/\cネ/' => "Character following \"\\c\" must be printable ASCII", + '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/", + '/\B{ネ}/' => "'ネ' is an unknown bound type {#} m/\\B{ネ{#}}/", ); push @death, @death_utf8; @@ -450,6 +459,8 @@ my @death_utf8_only_under_strict = ( my @warning = ( 'm/\b*\x{100}/' => '\b* matches null string many times {#} m/\b*{#}\x{100}/', + '/\b{g}/a' => "Using /u for '\\b{g}' instead of /a {#} m/\\b{g}{#}/", + '/\B{gcb}/a' => "Using /u for '\\B{gcb}' instead of /a {#} m/\\B{gcb}{#}/", 'm/[:blank:]\x{100}/' => 'POSIX syntax [: :] belongs inside character classes {#} m/[:blank:]{#}\x{100}/', 'm/[[:cntrl:]][:^ascii:]\x{100}/' => 'POSIX syntax [: :] belongs inside character classes {#} m/[[:cntrl:]][:^ascii:]{#}\x{100}/', "m'\\y\\x{100}'" => 'Unrecognized escape \y passed through {#} m/\y{#}\x{100}/', @@ -32,7 +32,6 @@ #define PERL_IN_UTF8_C #include "perl.h" #include "inline_invlist.c" -#include "charclass_invlists.h" static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; |