Add [[:blank:]] as suggested in

Subject: [ID 20000716.024] [=cc=] / [:blank:] Message-Id: <200007170055.RAA23528@fummy.dsl.yahoo.com> (the [=cc=] has already been taken care of by #6439 so the whole bug report can be closed) and make [[:space:]] to be equivalent to isspace(3) (as opposed to \s, which is isSPACE()). The difference is that now [[:space:]] matches the mythical vertical tab, while \s doesn't. p4raw-id: //depot/perl@6703
author: Jeffrey Friedl <jfriedl@regex.info> 2000-07-16 10:55:29 -0700
committer: Jarkko Hietaniemi <jhi@iki.fi> 2000-08-18 21:07:06 +0000
commit: aaa51d5e11b8b0db616a7f939c784733b4cfef87 (patch)
tree: 5cc57e412e80d1006256d7edc7526dd927cbe2bd
parent: 2a4ebaa641b7ba24b2dcfc940bb2b5da27d05b4e (diff)
download: perl-aaa51d5e11b8b0db616a7f939c784733b4cfef87.tar.gz
7 files changed, 133 insertions, 24 deletions
diff --git a/handy.h b/handy.h
index 9e6f223cb5..d82b1c609b 100644
--- a/handy.h
+++ b/handy.h
@@ -296,6 +296,8 @@ Converts the specified character to lowercase.
 #define isALPHA(c)	(isUPPER(c) || isLOWER(c))
 #define isSPACE(c) \
 	((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f')
+#define isPSXSPC(c)	(isSPACE(c) || (c) == '\v')
+#define isBLANK(c)	((c) == ' ' || (c) == '\t')
 #define isDIGIT(c)	((c) >= '0' && (c) <= '9')
 #ifdef EBCDIC
     /* In EBCDIC we do not do locales: therefore() isupper() is fine. */
@@ -382,6 +384,9 @@ Converts the specified character to lowercase.
 #  endif
 #endif /* USE_NEXT_CTYPE */
 
+#define isPSXSPC_LC(c)		(isSPACE_LC(c) || (c) == '\v')
+#define isBLANK_LC(c)		isBLANK(c) /* could be wrong */
+
 #define isALNUM_uni(c)		is_uni_alnum(c)
 #define isIDFIRST_uni(c)	is_uni_idfirst(c)
 #define isALPHA_uni(c)		is_uni_alpha(c)
@@ -400,6 +405,9 @@ Converts the specified character to lowercase.
 #define toTITLE_uni(c)		to_uni_title(c)
 #define toLOWER_uni(c)		to_uni_lower(c)
 
+#define isPSXSPC_uni(c)		(isSPACE_uni(c) ||(c) == '\f')
+#define isBLANK_uni(c)		isBLANK(c) /* could be wrong */
+
 #define isALNUM_LC_uni(c)	(c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c))
 #define isIDFIRST_LC_uni(c)	(c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c))
 #define isALPHA_LC_uni(c)	(c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c))
@@ -416,6 +424,9 @@ Converts the specified character to lowercase.
 #define toTITLE_LC_uni(c)	(c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
 #define toLOWER_LC_uni(c)	(c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
 
+#define isPSXSPC_LC_uni(c)	(isSPACE_LC_uni(c) ||(c) == '\f')
+#define isBLANK_LC_uni(c)	isBLANK(c) /* could be wrong */
+
 #define isALNUM_utf8(p)		is_utf8_alnum(p)
 #define isIDFIRST_utf8(p)	is_utf8_idfirst(p)
 #define isALPHA_utf8(p)		is_utf8_alpha(p)
@@ -434,6 +445,9 @@ Converts the specified character to lowercase.
 #define toTITLE_utf8(p)		to_utf8_title(p)
 #define toLOWER_utf8(p)		to_utf8_lower(p)
 
+#define isPSXSPC_utf8(c)	(isSPACE_utf8(c) ||(c) == '\f')
+#define isBLANK_utf8(c)		isBLANK(c) /* could be wrong */
+
 #define isALNUM_LC_utf8(p)	isALNUM_LC_uni(utf8_to_uv(p, 0))
 #define isIDFIRST_LC_utf8(p)	isIDFIRST_LC_uni(utf8_to_uv(p, 0))
 #define isALPHA_LC_utf8(p)	isALPHA_LC_uni(utf8_to_uv(p, 0))
@@ -450,6 +464,9 @@ Converts the specified character to lowercase.
 #define toTITLE_LC_utf8(p)	toTITLE_LC_uni(utf8_to_uv(p, 0))
 #define toLOWER_LC_utf8(p)	toLOWER_LC_uni(utf8_to_uv(p, 0))
 
+#define isPSXSPC_LC_utf8(c)	(isSPACE_LC_utf8(c) ||(c) == '\f')
+#define isBLANK_LC_utf8(c)	isBLANK(c) /* could be wrong */
+
 #ifdef EBCDIC
 EXT int ebcdic_control (int);
 #  define toCTRL(c)	ebcdic_control(c)
diff --git a/pod/perlre.pod b/pod/perlre.pod
index c964be8b8f..fa4aad25fd 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -199,20 +199,26 @@ equivalents (if available) are as follows:
     alpha
     alnum
     ascii
+    blank		[1]
     cntrl
     digit       \d
     graph
     lower
     print
     punct
-    space       \s
+    space       \s	[2]
     upper
-    word        \w
+    word        \w	[3]
     xdigit
 
+  [1] A GNU extension equivalent to C<[ \t]>, `all horizontal whitespace'.
+  [2] Not I<exactly equivalent> to C<\s> since the C<[[:space:]]> includes
+      also the (very rare) `vertical tabulator', "\ck", chr(11).
+  [3] A Perl extension. 
+
 For example use C<[:upper:]> to match all the uppercase characters.
-Note that the C<[]> are part of the C<[::]> construct, not part of the whole
-character class.  For example:
+Note that the C<[]> are part of the C<[::]> construct, not part of the
+whole character class.  For example:
 
     [01[:alpha:]%]
 
@@ -224,6 +230,7 @@ If the C<utf8> pragma is used, the following equivalences to Unicode
     alpha       IsAlpha
     alnum       IsAlnum
     ascii       IsASCII
+    blank	IsSpace
     cntrl       IsCntrl
     digit       IsDigit
     graph       IsGraph
@@ -238,8 +245,8 @@ If the C<utf8> pragma is used, the following equivalences to Unicode
 For example C<[:lower:]> and C<\p{IsLower}> are equivalent.
 
 If the C<utf8> pragma is not used but the C<locale> pragma is, the
-classes correlate with the isalpha(3) interface (except for `word',
-which is a Perl extension, mirroring C<\w>).
+classes correlate with the usual isalpha(3) interface (except for
+`word' and `blank').
 
 The assumedly non-obviously named classes are:
 
diff --git a/pod/perlretut.pod b/pod/perlretut.pod
index 66f8179ab6..87669e50ab 100644
--- a/pod/perlretut.pod
+++ b/pod/perlretut.pod
@@ -1672,15 +1672,17 @@ i.e., a non-mark followed by one or more marks.
 
 As if all those classes weren't enough, Perl also defines POSIX style
 character classes.  These have the form C<[:name:]>, with C<name> the
-name of the POSIX class.  The POSIX classes are alpha, alnum, ascii,
-cntrl, digit, graph, lower, print, punct, space, upper, word, and
-xdigit.  If C<utf8> is being used, then these classes are defined the
-same as their corresponding perl Unicode classes: C<[:upper:]> is the
-same as C<\p{IsUpper}>, etc.  The POSIX character classes, however,
-don't require using C<utf8>.  The C<[:digit:]>, C<[:word:]>, and
+name of the POSIX class.  The POSIX classes are C<alpha>, C<alnum>,
+C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>,
+C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl
+extension to match C<\w>), and C<blank> (a GNU extension).  If C<utf8>
+is being used, then these classes are defined the same as their
+corresponding perl Unicode classes: C<[:upper:]> is the same as
+C<\p{IsUpper}>, etc.  The POSIX character classes, however, don't
+require using C<utf8>.  The C<[:digit:]>, C<[:word:]>, and
 C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s>
-character classes.  To negate a POSIX class, put a C<^> in front of the
-name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
+character classes.  To negate a POSIX class, put a C<^> in front of
+the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
 C<utf8>, C<\P{IsDigit}>.  The Unicode and POSIX character classes can
 be used just like C<\d>, both inside and outside of character classes:
 
diff --git a/regcomp.c b/regcomp.c
index f0b7c5cd18..d2195b0ca1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -443,7 +443,7 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
 {
     int value;
 
-    for (value = 0; value < ANYOF_MAX; value += 2)
+    for (value = 0; value <= ANYOF_MAX; value += 2)
 	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 	    return 1;
     for (value = 0; value < 256; ++value)
@@ -3004,6 +3004,11 @@ S_regpposixcc(pTHX_ I32 value)
 			    namedclass =
 				complement ? ANYOF_NASCII : ANYOF_ASCII;
 			break;
+		    case 'b':
+			if (strnEQ(posixcc, "blank", 5))
+			    namedclass =
+				complement ? ANYOF_NBLANK : ANYOF_BLANK;
+			break;
 		    case 'c':
 			if (strnEQ(posixcc, "cntrl", 5))
 			    namedclass =
@@ -3035,7 +3040,7 @@ S_regpposixcc(pTHX_ I32 value)
 		    case 's':
 			if (strnEQ(posixcc, "space", 5))
 			    namedclass =
-				complement ? ANYOF_NSPACE : ANYOF_SPACE;
+				complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
 			break;
 		    case 'u':
 			if (strnEQ(posixcc, "upper", 5))
@@ -3160,7 +3165,7 @@ S_regclass(pTHX)
 	else if (value == '\\') {
 	    value = UCHARAT(PL_regcomp_parse++);
 	    /* Some compilers cannot handle switching on 64-bit integer
-	     * values, therefore value cannot be an UV. --jhi */
+	     * values, therefore the 'value' cannot be an UV. --jhi */
 	    switch (value) {
 	    case 'w':	namedclass = ANYOF_ALNUM;	break;
 	    case 'W':	namedclass = ANYOF_NALNUM;	break;
@@ -3339,6 +3344,24 @@ S_regclass(pTHX)
 #endif /* EBCDIC */
 		    }
 		    break;
+		case ANYOF_BLANK:
+		    if (LOC)
+			ANYOF_CLASS_SET(ret, ANYOF_BLANK);
+		    else {
+			for (value = 0; value < 256; value++)
+			    if (isBLANK(value))
+				ANYOF_BITMAP_SET(ret, value);
+		    }
+		    break;
+		case ANYOF_NBLANK:
+		    if (LOC)
+			ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
+		    else {
+			for (value = 0; value < 256; value++)
+			    if (!isBLANK(value))
+				ANYOF_BITMAP_SET(ret, value);
+		    }
+		    break;
 		case ANYOF_CNTRL:
 		    if (LOC)
 			ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
@@ -3412,6 +3435,24 @@ S_regclass(pTHX)
 				ANYOF_BITMAP_SET(ret, value);
 		    }
 		    break;
+		case ANYOF_PSXSPC:
+		    if (LOC)
+			ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
+		    else {
+			for (value = 0; value < 256; value++)
+			    if (isPSXSPC(value))
+				ANYOF_BITMAP_SET(ret, value);
+		    }
+		    break;
+		case ANYOF_NPSXSPC:
+		    if (LOC)
+			ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
+		    else {
+			for (value = 0; value < 256; value++)
+			    if (!isPSXSPC(value))
+				ANYOF_BITMAP_SET(ret, value);
+		    }
+		    break;
 		case ANYOF_PUNCT:
 		    if (LOC)
 			ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
@@ -3739,8 +3780,12 @@ S_regclassutf8(pTHX)
 		case ANYOF_NPUNCT:
 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");	break;
 		case ANYOF_SPACE:
+		case ANYOF_PSXSPC:
+		case ANYOF_BLANK:
 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");	break;
 		case ANYOF_NSPACE:
+		case ANYOF_NPSXSPC:
+		case ANYOF_NBLANK:
 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");	break;
 		case ANYOF_UPPER:
 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");	break;
@@ -4193,7 +4238,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
     else if (k == ANYOF) {
 	int i, rangestart = -1;
 	const char * const out[] = {	/* Should be syncronized with
-					   a table in regcomp.h */
+					   ANYOF_ #xdefines in regcomp.h */
 	    "\\w",
 	    "\\W",
 	    "\\s",
@@ -4217,9 +4262,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 	    "[:punct:]",
 	    "[:^punct:]",
 	    "[:upper:]",
-	    "[:!upper:]",
+	    "[:^upper:]",
 	    "[:xdigit:]",
-	    "[:^xdigit:]"
+	    "[:^xdigit:]",
+	    "[:space:]",
+	    "[:^space:]",
+	    "[:blank:]",
+	    "[:^blank:]"
 	};
 
 	if (o->flags & ANYOF_LOCALE)
diff --git a/regcomp.h b/regcomp.h
index 34c5c251be..e30e67f30f 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -194,7 +194,7 @@ struct regnode_charclass_class {
 
 #define ANYOF_ALNUM	 0	/* \w, utf8::IsWord, isALNUM() */
 #define ANYOF_NALNUM	 1
-#define ANYOF_SPACE	 2
+#define ANYOF_SPACE	 2	/* \s */
 #define ANYOF_NSPACE	 3
 #define ANYOF_DIGIT	 4
 #define ANYOF_NDIGIT	 5
@@ -218,8 +218,12 @@ struct regnode_charclass_class {
 #define ANYOF_NUPPER	23
 #define ANYOF_XDIGIT	24
 #define ANYOF_NXDIGIT	25
+#define ANYOF_PSXSPC	26	/* POSIX space: \s plus the vertical tab */
+#define ANYOF_NPSXSPC	27
+#define ANYOF_BLANK	28
+#define ANYOF_NBLANK	29	/* GNU extension: space and tab */
 
-#define ANYOF_MAX	31
+#define ANYOF_MAX	32
 
 /* Backward source code compatibility. */
 
diff --git a/regexec.c b/regexec.c
index cbc8c199b5..2004cc4cdf 100644
--- a/regexec.c
+++ b/regexec.c
@@ -3625,7 +3625,11 @@ S_reginclass(pTHX_ register regnode *p, register I32 c)
 	    (ANYOF_CLASS_TEST(p, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
 	    (ANYOF_CLASS_TEST(p, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
 	    (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
-	    (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
+	    (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
+	    (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
+	    (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
+	    (ANYOF_CLASS_TEST(p, ANYOF_BLANK)   &&  isBLANK(c))    ||
+	    (ANYOF_CLASS_TEST(p, ANYOF_NBLANK)  && !isBLANK(c))
 	    ) /* How's that for a conditional? */
 	{
 	    match = TRUE;
diff --git a/t/op/pat.t b/t/op/pat.t
index 91c4b7d58f..2ba6d934d2 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
 # the format supported by op/regexp.t.  If you want to add a test
 # that does fit that format, add it to op/re_tests, not here.
 
-print "1..220\n";
+print "1..223\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -1058,3 +1058,29 @@ $w = 0;
 }
 print $w ? "not " : "", "ok $test\n";
 $test++;
+
+my %space = ( spc   => " ",
+	      tab   => "\t",
+	      cr    => "\r",
+	      lf    => "\n",
+	      ff    => "\f",
+# The vertical tabulator seems miraculously be 12 both in ASCII and EBCDIC.
+	      vt    => chr(11),
+	      false => "space" );
+
+my @space0 = sort grep { $space{$_} =~ /\s/ }          keys %space;
+my @space1 = sort grep { $space{$_} =~ /[[:space:]]/ } keys %space;
+my @space2 = sort grep { $space{$_} =~ /[[:blank:]]/ } keys %space;
+
+print "not " unless "@space0" eq "cr ff lf spc tab";
+print "ok $test\n";
+$test++;
+
+print "not " unless "@space1" eq "cr ff lf spc tab vt";
+print "ok $test\n";
+$test++;
+
+print "not " unless "@space2" eq "spc tab";
+print "ok $test\n";
+$test++;
+
author	Jeffrey Friedl <jfriedl@regex.info>	2000-07-16 10:55:29 -0700
committer	Jarkko Hietaniemi <jhi@iki.fi>	2000-08-18 21:07:06 +0000
commit	aaa51d5e11b8b0db616a7f939c784733b4cfef87 (patch)
tree	5cc57e412e80d1006256d7edc7526dd927cbe2bd
parent	2a4ebaa641b7ba24b2dcfc940bb2b5da27d05b4e (diff)
download	perl-aaa51d5e11b8b0db616a7f939c784733b4cfef87.tar.gz