summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-09-19 03:37:19 +0200
committerH.Merijn Brand <h.m.brand@xs4all.nl>2006-09-19 06:56:36 +0000
commitfc8cd66c26827f6c2ee1aa00ab2d3b3c320a4a28 (patch)
treeb426e51c41b332c31c05ec65e7570a4cc620f20c
parenta7ae1e4a956bbd5ffa44d286e0591bf4c0e7c341 (diff)
downloadperl-fc8cd66c26827f6c2ee1aa00ab2d3b3c320a4a28.tar.gz
Re: \N{...} in regular expression [PATCH]
Message-ID: <9b18b3110609181637m796d6c16o1b2741edc5f09eb2@mail.gmail.com> p4raw-id: //depot/perl@28868
-rw-r--r--MANIFEST1
-rw-r--r--embed.fnc1
-rw-r--r--embed.h2
-rw-r--r--pod/perldiag.pod15
-rw-r--r--proto.h3
-rw-r--r--regcomp.c301
-rw-r--r--regexec.c2
-rw-r--r--t/lib/Cname.pm22
-rwxr-xr-xt/op/pat.t139
-rw-r--r--toke.c10
10 files changed, 448 insertions, 48 deletions
diff --git a/MANIFEST b/MANIFEST
index d2a854ee56..7463370a3e 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3177,6 +3177,7 @@ t/io/through.t See if pipe passes data intact
t/io/utf8.t See if file seeking works
t/japh/abigail.t Obscure tests
t/lib/1_compile.t See if the various libraries and extensions compile
+t/lib/Cname.pm Test charnames in regexes (op/pat.t)
t/lib/common.pl Helper for lib/{warnings,feature}.t
t/lib/commonsense.t See if configuration meets basic needs
t/lib/compmod.pl Helper for 1_compile.t
diff --git a/embed.fnc b/embed.fnc
index 082a7c5288..63e9e8f0a2 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1308,6 +1308,7 @@ Es |regnode*|regclass |NN struct RExC_state_t *state|U32 depth
ERsn |I32 |regcurly |NN const char *
Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op
Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth
+Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep
Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd
Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
diff --git a/embed.h b/embed.h
index dbb6ca34cc..fa43f4b12e 100644
--- a/embed.h
+++ b/embed.h
@@ -1312,6 +1312,7 @@
#define regcurly S_regcurly
#define reg_node S_reg_node
#define regpiece S_regpiece
+#define reg_namedseq S_reg_namedseq
#define reginsert S_reginsert
#define regtail S_regtail
#define join_exact S_join_exact
@@ -3500,6 +3501,7 @@
#define regcurly S_regcurly
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
+#define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b)
#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index cda8945232..f3a5eed382 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1334,6 +1334,14 @@ specified in the C<\N{...}> escape. Perhaps you forgot to load the
corresponding C<overload> or C<charnames> pragma? See L<charnames> and
L<overload>.
+=item Constant(%s)%s: %s in regex; marked by <-- HERE in m/%s/
+
+(F) The parser found inconsistencies while attempting to find
+the character name specified in the C<\N{...}> escape. Perhaps you
+forgot to load the corresponding C<charnames> pragma?
+See L<charnames>.
+
+
=item Constant is not %s reference
(F) A constant value (perhaps declared using the C<use constant> pragma)
@@ -1841,6 +1849,13 @@ about 250 characters for simple names, and somewhat more for compound
names (like C<$A::B>). You've exceeded Perl's limits. Future versions
of Perl are likely to eliminate these arbitrary limitations.
+=item Ignoring %s in character class in regex; marked by <-- HERE in m/%s/
+
+(W) Named unicode character escapes (\N{...}) may return multi-char
+or zero length sequences. When such an escape is used in a character class
+its behaviour is not well defined. Check that the correct escape has
+been used, and the correct charname handler is in scope.
+
=item Illegal binary digit %s
(F) You used a digit other than 0 or 1 in a binary number.
diff --git a/proto.h b/proto.h
index 48a66bb920..a957f32e70 100644
--- a/proto.h
+++ b/proto.h
@@ -3565,6 +3565,9 @@ STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp, U32 dep
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
+STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *state, UV *valuep)
+ __attribute__nonnull__(pTHX_1);
+
STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_3);
diff --git a/regcomp.c b/regcomp.c
index db73dfb953..4cf477599b 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -118,6 +118,7 @@ typedef struct RExC_state_t {
I32 seen_zerolen;
I32 seen_evals;
I32 utf8;
+ HV *charnames; /* cache of named sequences */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
@@ -149,6 +150,7 @@ typedef struct RExC_state_t {
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_charnames (pRExC_state->charnames)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -3734,6 +3736,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
RExC_size = 0L;
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
+ RExC_charnames = NULL;
+
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
REGC((U8)REG_MAGIC, (char*)RExC_emit);
@@ -3833,6 +3837,7 @@ reStudy:
copyRExC_state=RExC_state;
}
#endif
+
/* Dig out information for optimizations. */
r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
pm->op_pmflags = RExC_flags;
@@ -4204,6 +4209,8 @@ reStudy:
r->reganch |= ROPT_CANY_SEEN;
Newxz(r->startp, RExC_npar, I32);
Newxz(r->endp, RExC_npar, I32);
+ if (RExC_charnames)
+ SvREFCNT_dec((SV*)(RExC_charnames));
DEBUG_r( RX_DEBUG_on(r) );
DEBUG_DUMP_r({
@@ -4948,6 +4955,274 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
return(ret);
}
+
+/* reg_namedseq(pRExC_state,UVp)
+
+ This is expected to be called by a parser routine that has
+ recognized'\N' and needs to handle the rest. RExC_parse is
+ expected to point at the first char following the N at the time
+ of the call.
+
+ If valuep is non-null then it is assumed that we are parsing inside
+ of a charclass definition and the first codepoint in the resolved
+ string is returned via *valuep and the routine will return NULL.
+ In this mode if a multichar string is returned from the charnames
+ handler a warning will be issued, and only the first char in the
+ sequence will be examined. If the string returned is zero length
+ then the value of *valuep is undefined and NON-NULL will
+ be returned to indicate failure. (This will NOT be a valid pointer
+ to a regnode.)
+
+ If value is null then it is assumed that we are parsing normal text
+ and inserts a new EXACT node into the program containing the resolved
+ string and returns a pointer to the new node. If the string is
+ zerolength a NOTHING node is emitted.
+
+ On success RExC_parse is set to the char following the endbrace.
+ Parsing failures will generate a fatal errorvia vFAIL(...)
+
+ NOTE: We cache all results from the charnames handler locally in
+ the RExC_charnames hash (created on first use) to prevent a charnames
+ handler from playing silly-buggers and returning a short string and
+ then a long string for a given pattern. Since the regexp program
+ size is calculated during an initial parse this would result
+ in a buffer overrun so we cache to prevent the charname result from
+ changing during the course of the parse.
+
+ */
+STATIC regnode *
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
+{
+ char * name; /* start of the content of the name */
+ char * endbrace; /* endbrace following the name */
+ SV *sv_str = NULL;
+ SV *sv_name = NULL;
+ STRLEN len; /* this has various purposes throughout the code */
+ bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
+ regnode *ret = NULL;
+
+ if (*RExC_parse != '{') {
+ vFAIL("Missing braces on \\N{}");
+ }
+ name = RExC_parse+1;
+ endbrace = strchr(RExC_parse, '}');
+ if ( ! endbrace ) {
+ RExC_parse++;
+ vFAIL("Missing right brace on \\N{}");
+ }
+ RExC_parse = endbrace + 1;
+
+
+ /* RExC_parse points at the beginning brace,
+ endbrace points at the last */
+ if ( name[0]=='U' && name[1]=='+' ) {
+ /* its a "unicode hex" notation {U+89AB} */
+ I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+ UV cp;
+ len = endbrace - name - 2;
+ cp = grok_hex(name + 2, &len, &fl, NULL);
+ if ( len != endbrace - name - 2 ) {
+ cp = 0xFFFD;
+ }
+ if (cp > 0xff)
+ RExC_utf8 = 1;
+ if ( valuep ) {
+ *valuep = cp;
+ return NULL;
+ }
+ sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
+ } else {
+ /* fetch the charnames handler for this scope */
+ HV * const table = GvHV(PL_hintgv);
+ SV **cvp= table ?
+ hv_fetchs(table, "charnames", FALSE) :
+ NULL;
+ SV *cv= cvp ? *cvp : NULL;
+ HE *he_str;
+ int count;
+ /* create an SV with the name as argument */
+ sv_name = newSVpvn(name, endbrace - name);
+
+ if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
+ vFAIL2("Constant(\\N{%s}) unknown: "
+ "(possibly a missing \"use charnames ...\")",
+ SvPVX(sv_name));
+ }
+ if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
+ vFAIL2("Constant(\\N{%s}): "
+ "$^H{charnames} is not defined",SvPVX(sv_name));
+ }
+
+
+
+ if (!RExC_charnames) {
+ /* make sure our cache is allocated */
+ RExC_charnames = newHV();
+ }
+ /* see if we have looked this one up before */
+ he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
+ if ( he_str ) {
+ sv_str = HeVAL(he_str);
+ cached = 1;
+ } else {
+ dSP ;
+
+ ENTER ;
+ SAVETMPS ;
+ PUSHMARK(SP) ;
+
+ XPUSHs(sv_name);
+
+ PUTBACK ;
+
+ count= call_sv(cv, G_SCALAR);
+
+ if (count == 1) { /* XXXX is this right? dmq */
+ sv_str = POPs;
+ SvREFCNT_inc_simple_void(sv_str);
+ }
+
+ SPAGAIN ;
+ PUTBACK ;
+ FREETMPS ;
+ LEAVE ;
+
+ if ( !sv_str || !SvOK(sv_str) ) {
+ vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
+ "did not return a defined value",SvPVX(sv_name));
+ }
+ if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
+ cached = 1;
+ }
+ }
+ if (valuep) {
+ char *p = SvPV(sv_str, len);
+ if (len) {
+ STRLEN numlen = 1;
+ if ( SvUTF8(sv_str) ) {
+ *valuep = utf8_to_uvchr(p, &numlen);
+ if (*valuep > 0x7F)
+ RExC_utf8 = 1;
+ /* XXXX
+ We have to turn on utf8 for high bit chars otherwise
+ we get failures with
+
+ "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
+ "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
+
+ This is different from what \x{} would do with the same
+ codepoint, where the condition is > 0xFF.
+ - dmq
+ */
+
+
+ } else {
+ *valuep = (UV)*p;
+ /* warn if we havent used the whole string? */
+ }
+ if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ vWARN2(RExC_parse,
+ "Ignoring excess chars from \\N{%s} in character class",
+ SvPVX(sv_name)
+ );
+ }
+ } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ vWARN2(RExC_parse,
+ "Ignoring zero length \\N{%s} in character class",
+ SvPVX(sv_name)
+ );
+ }
+ if (sv_name)
+ SvREFCNT_dec(sv_name);
+ if (!cached)
+ SvREFCNT_dec(sv_str);
+ return len ? NULL : (regnode *)&len;
+ } else if(SvCUR(sv_str)) {
+
+ char *s;
+ char *p, *pend;
+ STRLEN charlen = 1;
+ char * parse_start = name-3; /* needed for the offsets */
+ GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
+
+ ret = reg_node(pRExC_state,
+ (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ s= STRING(ret);
+
+ if ( RExC_utf8 && !SvUTF8(sv_str) ) {
+ sv_utf8_upgrade(sv_str);
+ } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
+ RExC_utf8= 1;
+ }
+
+ p = SvPV(sv_str, len);
+ pend = p + len;
+ /* len is the length written, charlen is the size the char read */
+ for ( len = 0; p < pend; p += charlen ) {
+ if (UTF) {
+ UV uvc = utf8_to_uvchr(p, &charlen);
+ if (FOLD) {
+ STRLEN foldlen,numlen;
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
+ uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
+ /* Emit all the Unicode characters. */
+
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen)
+ {
+ uvc = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ const STRLEN unilen = reguni(pRExC_state, uvc, s);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ } else {
+ const STRLEN unilen = reguni(pRExC_state, uvc, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
+ } else {
+ len++;
+ REGC(*p, s++);
+ }
+ }
+ if (SIZE_ONLY) {
+ RExC_size += STR_SZ(len);
+ } else {
+ STR_LEN(ret) = len;
+ RExC_emit += STR_SZ(len);
+ }
+ Set_Node_Cur_Length(ret); /* MJD */
+ RExC_parse--;
+ nextchar(pRExC_state);
+ } else {
+ ret = reg_node(pRExC_state,NOTHING);
+ }
+ if (!cached) {
+ SvREFCNT_dec(sv_str);
+ }
+ if (sv_name) {
+ SvREFCNT_dec(sv_name);
+ }
+ return ret;
+
+}
+
+
+
/*
- regatom - the lowest level
*
@@ -5184,6 +5459,14 @@ tryagain:
*flagp |= HASWIDTH|SIMPLE;
}
break;
+ case 'N':
+ /* Handle \N{NAME} here and not below because it can be
+ multicharacter. join_exact() will join them up later on.
+ Also this makes sure that things like /\N{BLAH}+/ and
+ \N{BLAH} being multi char Just Happen. dmq*/
+ ++RExC_parse;
+ ret= reg_namedseq(pRExC_state, NULL);
+ break;
case 'n':
case 'r':
case 't':
@@ -5295,6 +5578,7 @@ tryagain:
case 'D':
case 'p':
case 'P':
+ case 'N':
--p;
goto loopdone;
case 'n':
@@ -5493,7 +5777,7 @@ tryagain:
/* If the encoding pragma is in effect recode the text of
* any EXACT-kind nodes. */
- if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
+ if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
const STRLEN oldlen = STR_LEN(ret);
SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
@@ -5766,6 +6050,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
if (UCHARAT(RExC_parse) == ']')
goto charclassloop;
+parseit:
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
charclassloop:
@@ -5807,6 +6092,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'N': /* Handle \N{NAME} in class */
+ {
+ /* We only pay attention to the first char of
+ multichar strings being returned. I kinda wonder
+ if this makes sense as it does change the behaviour
+ from earlier versions, OTOH that behaviour was broken
+ as well. */
+ UV v; /* value is register so we cant & it /grrr */
+ if (reg_namedseq(pRExC_state, &v)) {
+ goto parseit;
+ }
+ value= v;
+ }
+ break;
case 'p':
case 'P':
{
diff --git a/regexec.c b/regexec.c
index 1a86e4997a..679e31fc36 100644
--- a/regexec.c
+++ b/regexec.c
@@ -527,7 +527,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
DEBUG_OPTIMISE_r({
PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
(int)(end_point - start_point),
- (int)(end_point - start_point),
+ (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
start_point);
});
diff --git a/t/lib/Cname.pm b/t/lib/Cname.pm
new file mode 100644
index 0000000000..d4b8a9ea4d
--- /dev/null
+++ b/t/lib/Cname.pm
@@ -0,0 +1,22 @@
+package Cname;
+our $Evil='A';
+
+sub translator {
+ my $str = shift;
+ if ( $str eq 'EVIL' ) {
+ (my $c=substr("A".$Evil,-1))++;
+ my $r=$Evil;
+ $Evil.=$c;
+ return $r;
+ }
+ if ( $str eq 'EMPTY-STR') {
+ return "";
+ }
+ return $str;
+}
+
+sub import {
+ shift;
+ $^H{charnames} = \&translator;
+}
+1;
diff --git a/t/op/pat.t b/t/op/pat.t
index 4ff133b619..97bad61881 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,8 +6,7 @@
$| = 1;
-# please update note at bottom of file when you change this
-print "1..1232\n";
+# Test counter output is generated by a BEGIN block at bottom of file
BEGIN {
chdir 't' if -d 't';
@@ -1286,7 +1285,7 @@ print "ok 247\n";
{
# bug id 20001008.001
- my $test = 248;
+ $test = 248;
my @x = ("stra\337e 138","stra\337e 138");
for (@x) {
s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
@@ -1376,7 +1375,7 @@ print "ok 247\n";
}
SKIP: {
- my $test = 264; # till 575
+ $test = 264; # till 575
use charnames ":full";
@@ -2032,13 +2031,13 @@ print "ok 683\n" if @a == 9 && "@a" eq "f o o \n $a $b b a r";
}
-my $test = 687;
+$test = 687;
# Force scalar context on the patern match
-sub ok ($$) {
+sub ok ($;$) {
my($ok, $name) = @_;
- printf "%sok %d - %s\n", ($ok ? "" : "not "), $test, $name;
+ printf "%sok %d - %s\n", ($ok ? "" : "not "), $test, $name||'unnamed';
printf "# Failed test at line %d\n", (caller)[2] unless $ok;
@@ -2604,35 +2603,21 @@ print "# some Unicode properties\n";
use charnames ':full';
- print "\N{LATIN SMALL LETTER SHARP S}" =~
- /\N{LATIN SMALL LETTER SHARP S}/ ? "ok 835\n" : "not ok 835\n";
+ $test= 835;
- print "\N{LATIN SMALL LETTER SHARP S}" =~
- /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 836\n" : "not ok 836\n";
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER SHARP S}/);
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER SHARP S}/i);
- print "\N{LATIN SMALL LETTER SHARP S}" =~
- /[\N{LATIN SMALL LETTER SHARP S}]/ ? "ok 837\n" : "not ok 837\n";
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}]/);
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i);
- print "\N{LATIN SMALL LETTER SHARP S}" =~
- /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 838\n" : "not ok 838\n";
+ ok("ss" =~ /\N{LATIN SMALL LETTER SHARP S}/i);
+ ok("SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i);
+ ok("ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i);
+ ok("SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i);
- print "ss" =~
- /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 839\n" : "not ok 839\n";
-
- print "SS" =~
- /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 840\n" : "not ok 840\n";
-
- print "ss" =~
- /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
-
- print "SS" =~
- /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n";
-
- print "\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i ?
- "ok 843\n" : "not ok 843\n";
-
- print "\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i ?
- "ok 844\n" : "not ok 844\n";
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i);
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i);
}
{
@@ -2751,7 +2736,7 @@ print "# some Unicode properties\n";
# check utf8/non-utf8 mixtures
# try to force all float/anchored check combinations
my $c = "\x{100}";
- my $test = 865;
+ $test = 865;
my $subst;
for my $re (
"xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", "xx.*(?=$c)", "(?=$c).*xx",
@@ -2790,7 +2775,7 @@ print "# some Unicode properties\n";
{
print "# qr/.../x\n";
- my $test = 893;
+ $test = 893;
my $R = qr/ A B C # D E/x;
@@ -2806,7 +2791,7 @@ print "# some Unicode properties\n";
{
print "# illegal Unicode properties\n";
- my $test = 896;
+ $test = 896;
print eval { "a" =~ /\pq / } ? "not ok $test\n" : "ok $test\n";
$test++;
@@ -2818,7 +2803,7 @@ print "# some Unicode properties\n";
{
print "# [ID 20020412.005] wrong pmop flags checked when empty pattern\n";
# requires reuse of last successful pattern
- my $test = 898;
+ $test = 898;
$test =~ /\d/;
for (0 .. 1) {
my $match = ?? + 0;
@@ -3039,7 +3024,7 @@ ok("A" =~ /\p{AsciiHexAndDash}/, "'A' is AsciiHexAndDash");
my $ok = $s =~ /(\x{100}{4})/;
my($ord, $len) = (ord $1, length $1);
print +($ok && $ord == 0x100 && $len == 4)
- ? "ok $test\n" : "not ok $test\t# $ok/$ord/$len\n";
+ ? "ok $test\n" : "not ok $test\t# [#18179] $ok/$ord/$len\n";
++$test;
}
@@ -3404,10 +3389,12 @@ ok(("foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i)
-{
+if (!$ENV{PERL_SKIP_PSYCHO_TEST}){
my @normal=qw(these are some normal words);
my $psycho=join "|",@normal,map chr $_,255..20000;
ok(('these'=~/($psycho)/) && $1 eq 'these','Pyscho');
+} else {
+ ok(1,'Skipped Psycho');
}
# [perl #36207] mixed utf8 / latin-1 and case folding
@@ -3533,22 +3520,22 @@ if ($ordA == 193) {
my @chars = ("A".."Z");
my $delim = ",";
my $size = 32771 - 4;
- my $test = '';
+ my $str = '';
# create some random junk. Inefficient, but it works.
for ($i = 0 ; $i < $size ; $i++) {
- $test .= $chars[int(rand(@chars))];
+ $str .= $chars[int(rand(@chars))];
}
- $test .= ($delim x 4);
+ $str .= ($delim x 4);
my $res;
my $matched;
- if ($test =~ s/^(.*?)${delim}{4}//s) {
+ if ($str =~ s/^(.*?)${delim}{4}//s) {
$res = $1;
$matched=1;
}
ok($matched,'pattern matches');
- ok(length($test)==0,"Empty string");
+ ok(length($str)==0,"Empty string");
ok(defined($res) && length($res)==$size,"\$1 is correct size");
}
@@ -3578,9 +3565,73 @@ if ($ordA == 193) {
ok("A@-B" =~ /A@{-}B/x, 'interpolation of @- in /@{-}/x');
}
+{
+ use lib 'lib';
+ use Cname;
+
+ ok('fooB'=~/\N{foo}[\N{B}\N{b}]/,"Passthrough charname");
+ $test=1233; my $handle=make_must_warn('Ignoring excess chars from');
+ $handle->('q(xxWxx) =~ /[\N{WARN}]/');
+ {
+ my $code;
+ my $w="";
+ local $SIG{__WARN__} = sub { $w.=shift };
+ eval($code=<<'EOFTEST') or die "$@\n$code\n";
+ {
+ use warnings;
+
+ #1234
+ ok("\0" !~ /[\N{EMPTY-STR}XY]/,
+ "Zerolength charname in charclass doesnt match \0");
+ 1;
+ }
+EOFTEST
+ ok($w=~/Ignoring zero length/,
+ "Got expected zero length warning");
+ warn $code;
+
+ }
+ $handle= make_must_warn('Ignoring zero length');
+ $handle->('qq(\\0) =~ /[\N{EMPTY-STR}XY]/');
+ ok('AB'=~/(\N{EVIL})/ && $1 eq 'A',"Charname caching $1");
+ ok('ABC'=~/(\N{EVIL})/,"Charname caching $1");
+ ok('xy'=~/x\N{EMPTY-STR}y/, 'Empty string charname produces NOTHING node');
+ ok(''=~/\N{EMPTY-STR}/, 'Empty string charname produces NOTHING node 2');
+
+}
+{
+ print "# MORE LATIN SMALL LETTER SHARP S\n";
+
+ use charnames ':full';
+
+ #see also test #835
+ ok("ss" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i,
+ "unoptimized named sequence in class 1");
+ ok("SS" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i,
+ "unoptimized named sequence in class 2");
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/,
+ "unoptimized named sequence in class 3");
+ ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i,
+ "unoptimized named sequence in class 4");
+
+ ok('aabc' !~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against aabc');
+ ok('a+bc' =~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against a+bc');
+ ok('a+bc' =~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against a+bc');
+
+ ok(' A B'=~/\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/,
+ 'Intermixed named and unicode escapes 1');
+ ok("\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}"=~
+ /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/,
+ 'Intermixed named and unicode escapes 2');
+ ok("\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042} 3"=~
+ /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/,
+ 'Intermixed named and unicode escapes');
+}
# Keep the following test last -- it may crash perl
ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
or print "# Unexpected outcome: should pass or crash perl\n";
-# last test 1231
+# Don't forget to update this!
+BEGIN{print "1..1251\n"};
+
diff --git a/toke.c b/toke.c
index 3fec508a52..b097e39581 100644
--- a/toke.c
+++ b/toke.c
@@ -1793,7 +1793,7 @@ S_scan_const(pTHX_ char *start)
const char * const leaveit = /* set of acceptably-backslashed characters */
(const char *)
(PL_lex_inpat
- ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
+ ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-Nnrtfeaxcz0123456789[{]} \t\n\r\f\v#"
: "");
if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
@@ -2179,6 +2179,7 @@ S_scan_const(pTHX_ char *start)
SV *res;
STRLEN len;
const char *str;
+ SV *type;
if (!e) {
yyerror("Missing right brace on \\N{}");
@@ -2192,12 +2193,17 @@ S_scan_const(pTHX_ char *start)
s += 3;
len = e - s;
uv = grok_hex(s, &len, &flags, NULL);
+ if ( len != e - s ) {
+ uv=0xFFFD;
+ }
s = e + 1;
goto NUM_ESCAPE_INSERT;
}
res = newSVpvn(s + 1, e - s - 1);
+ type = newSVpvn(s - 2,e - s + 3);
res = new_constant( NULL, 0, "charnames",
- res, NULL, "\\N{...}" );
+ res, NULL, SvPVX(type) );
+ SvREFCNT_dec(type);
if (has_utf8)
sv_utf8_upgrade(res);
str = SvPV_const(res,len);