Change meaning of \v, \V, and add \h, \H to match Perl6, add \R to match PCRE and unicode tr18

Message-ID: <9b18b3110704221434g43457742p28cab00289f83639@mail.gmail.com> p4raw-id: //depot/perl@31026
author: Yves Orton <demerphq@gmail.com> 2007-04-23 01:34:55 +0200
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2007-04-23 09:04:31 +0000
commit: e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f (patch)
tree: 6930eda70a9e751e8789ca568d015204ca292552 /regexec.c
parent: 8638e433f90c5046b38f1d145c125bad5dbfdb7e (diff)
download: perl-e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f.tar.gz
1 files changed, 135 insertions, 1 deletions
diff --git a/regexec.c b/regexec.c
index 1eb7ff2859..fa853a475f 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1110,6 +1110,15 @@ REXEC_FBC_SCAN(                                       \
 if ((!reginfo || regtry(reginfo, &s))) \
     goto got_it
 
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
+    if (do_utf8) {                                             \
+	REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
+    }                                                          \
+    else {                                                     \
+	REXEC_FBC_CLASS_SCAN(CoNd);                            \
+    }                                                          \
+    break
+    
 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
     if (do_utf8) {                                             \
 	UtFpReLoAd;                                            \
@@ -1425,6 +1434,31 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		!isDIGIT_LC_utf8((U8*)s),
 		!isDIGIT_LC(*s)
 	    );
+	case LNBREAK:
+	    REXEC_FBC_CSCAN(
+		is_LNBREAK_utf8(s),
+		is_LNBREAK_latin1(s)
+	    );
+	case VERTWS:
+	    REXEC_FBC_CSCAN(
+		is_VERTWS_utf8(s),
+		is_VERTWS_latin1(s)
+	    );
+	case NVERTWS:
+	    REXEC_FBC_CSCAN(
+		!is_VERTWS_utf8(s),
+		!is_VERTWS_latin1(s)
+	    );
+	case HORIZWS:
+	    REXEC_FBC_CSCAN(
+		is_HORIZWS_utf8(s),
+		is_HORIZWS_latin1(s)
+	    );
+	case NHORIZWS:
+	    REXEC_FBC_CSCAN(
+		!is_HORIZWS_utf8(s),
+		!is_HORIZWS_latin1(s)
+	    );	    
 	case AHOCORASICKC:
 	case AHOCORASICK: 
 	    {
@@ -3207,8 +3241,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		      * pack("U0U*", 0xDF) =~ /ss/i,
 		      * the 0xC3 0x9F are the UTF-8
 		      * byte sequence for the U+00DF. */
+
 		     if (!(do_utf8 &&
-			   toLOWER(s[0]) == 's' &&
+		           toLOWER(s[0]) == 's' &&
 			   ln >= 2 &&
 			   toLOWER(s[1]) == 's' &&
 			   (U8)l[0] == 0xC3 &&
@@ -4972,6 +5007,35 @@ NULL
             /* NOTREACHED */
 #undef ST
 
+        case LNBREAK:
+            if ((n=is_LNBREAK(locinput,do_utf8))) {
+                locinput += n;
+                nextchr = UCHARAT(locinput);
+            } else
+                sayNO;
+            break;
+
+#define CASE_CLASS(nAmE)                              \
+        case nAmE:                                    \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                locinput += n;                        \
+                nextchr = UCHARAT(locinput);          \
+            } else                                    \
+                sayNO;                                \
+            break;                                    \
+        case N##nAmE:                                 \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                sayNO;                                \
+            } else {                                  \
+                locinput += UTF8SKIP(locinput);       \
+                nextchr = UCHARAT(locinput);          \
+            }                                         \
+            break
+
+        CASE_CLASS(VERTWS);
+        CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
+
 	default:
 	    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
 			  PTR2UV(scan), OP(scan));
@@ -5382,7 +5446,77 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    while (scan < loceol && !isDIGIT(*scan))
 		scan++;
 	}
+    case LNBREAK:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+		scan += c;
+		hardcount++;
+	    }
+	} else {
+	    /*
+	      LNBREAK can match two latin chars, which is ok,
+	      because we have a null terminated string, but we
+	      have to use hardcount in this situation
+	    */
+	    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+		scan+=c;
+		hardcount++;
+	    }
+	}	
+	break;
+    case HORIZWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+		scan += c;
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && is_HORIZWS_latin1(scan)) 
+		scan++;		
+	}	
 	break;
+    case NHORIZWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+		scan += UTF8SKIP(scan);
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && !is_HORIZWS_latin1(scan))
+		scan++;
+
+	}	
+	break;
+    case VERTWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+		scan += c;
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && is_VERTWS_latin1(scan)) 
+		scan++;
+
+	}	
+	break;
+    case NVERTWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+		scan += UTF8SKIP(scan);
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && !is_VERTWS_latin1(scan)) 
+		scan++;
+          
+	}	
+	break;
+
     default:		/* Called on something of 0 width. */
 	break;		/* So match right here or not at all. */
     }
author	Yves Orton <demerphq@gmail.com>	2007-04-23 01:34:55 +0200
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2007-04-23 09:04:31 +0000
commit	e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f (patch)
tree	6930eda70a9e751e8789ca568d015204ca292552 /regexec.c
parent	8638e433f90c5046b38f1d145c125bad5dbfdb7e (diff)
download	perl-e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f.tar.gz