summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeffrey Friedl <jfriedl@regex.info>2000-07-16 10:55:29 -0700
committerJarkko Hietaniemi <jhi@iki.fi>2000-08-18 21:07:06 +0000
commitaaa51d5e11b8b0db616a7f939c784733b4cfef87 (patch)
tree5cc57e412e80d1006256d7edc7526dd927cbe2bd
parent2a4ebaa641b7ba24b2dcfc940bb2b5da27d05b4e (diff)
downloadperl-aaa51d5e11b8b0db616a7f939c784733b4cfef87.tar.gz
Add [[:blank:]] as suggested in
Subject: [ID 20000716.024] [=cc=] / [:blank:] Message-Id: <200007170055.RAA23528@fummy.dsl.yahoo.com> (the [=cc=] has already been taken care of by #6439 so the whole bug report can be closed) and make [[:space:]] to be equivalent to isspace(3) (as opposed to \s, which is isSPACE()). The difference is that now [[:space:]] matches the mythical vertical tab, while \s doesn't. p4raw-id: //depot/perl@6703
-rw-r--r--handy.h17
-rw-r--r--pod/perlre.pod19
-rw-r--r--pod/perlretut.pod18
-rw-r--r--regcomp.c61
-rw-r--r--regcomp.h8
-rw-r--r--regexec.c6
-rwxr-xr-xt/op/pat.t28
7 files changed, 133 insertions, 24 deletions
diff --git a/handy.h b/handy.h
index 9e6f223cb5..d82b1c609b 100644
--- a/handy.h
+++ b/handy.h
@@ -296,6 +296,8 @@ Converts the specified character to lowercase.
#define isALPHA(c) (isUPPER(c) || isLOWER(c))
#define isSPACE(c) \
((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f')
+#define isPSXSPC(c) (isSPACE(c) || (c) == '\v')
+#define isBLANK(c) ((c) == ' ' || (c) == '\t')
#define isDIGIT(c) ((c) >= '0' && (c) <= '9')
#ifdef EBCDIC
/* In EBCDIC we do not do locales: therefore() isupper() is fine. */
@@ -382,6 +384,9 @@ Converts the specified character to lowercase.
# endif
#endif /* USE_NEXT_CTYPE */
+#define isPSXSPC_LC(c) (isSPACE_LC(c) || (c) == '\v')
+#define isBLANK_LC(c) isBLANK(c) /* could be wrong */
+
#define isALNUM_uni(c) is_uni_alnum(c)
#define isIDFIRST_uni(c) is_uni_idfirst(c)
#define isALPHA_uni(c) is_uni_alpha(c)
@@ -400,6 +405,9 @@ Converts the specified character to lowercase.
#define toTITLE_uni(c) to_uni_title(c)
#define toLOWER_uni(c) to_uni_lower(c)
+#define isPSXSPC_uni(c) (isSPACE_uni(c) ||(c) == '\f')
+#define isBLANK_uni(c) isBLANK(c) /* could be wrong */
+
#define isALNUM_LC_uni(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c))
#define isIDFIRST_LC_uni(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c))
#define isALPHA_LC_uni(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c))
@@ -416,6 +424,9 @@ Converts the specified character to lowercase.
#define toTITLE_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
#define toLOWER_LC_uni(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
+#define isPSXSPC_LC_uni(c) (isSPACE_LC_uni(c) ||(c) == '\f')
+#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */
+
#define isALNUM_utf8(p) is_utf8_alnum(p)
#define isIDFIRST_utf8(p) is_utf8_idfirst(p)
#define isALPHA_utf8(p) is_utf8_alpha(p)
@@ -434,6 +445,9 @@ Converts the specified character to lowercase.
#define toTITLE_utf8(p) to_utf8_title(p)
#define toLOWER_utf8(p) to_utf8_lower(p)
+#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f')
+#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
+
#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, 0))
#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, 0))
#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, 0))
@@ -450,6 +464,9 @@ Converts the specified character to lowercase.
#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0))
#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0))
+#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
+#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
+
#ifdef EBCDIC
EXT int ebcdic_control (int);
# define toCTRL(c) ebcdic_control(c)
diff --git a/pod/perlre.pod b/pod/perlre.pod
index c964be8b8f..fa4aad25fd 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -199,20 +199,26 @@ equivalents (if available) are as follows:
alpha
alnum
ascii
+ blank [1]
cntrl
digit \d
graph
lower
print
punct
- space \s
+ space \s [2]
upper
- word \w
+ word \w [3]
xdigit
+ [1] A GNU extension equivalent to C<[ \t]>, `all horizontal whitespace'.
+ [2] Not I<exactly equivalent> to C<\s> since the C<[[:space:]]> includes
+ also the (very rare) `vertical tabulator', "\ck", chr(11).
+ [3] A Perl extension.
+
For example use C<[:upper:]> to match all the uppercase characters.
-Note that the C<[]> are part of the C<[::]> construct, not part of the whole
-character class. For example:
+Note that the C<[]> are part of the C<[::]> construct, not part of the
+whole character class. For example:
[01[:alpha:]%]
@@ -224,6 +230,7 @@ If the C<utf8> pragma is used, the following equivalences to Unicode
alpha IsAlpha
alnum IsAlnum
ascii IsASCII
+ blank IsSpace
cntrl IsCntrl
digit IsDigit
graph IsGraph
@@ -238,8 +245,8 @@ If the C<utf8> pragma is used, the following equivalences to Unicode
For example C<[:lower:]> and C<\p{IsLower}> are equivalent.
If the C<utf8> pragma is not used but the C<locale> pragma is, the
-classes correlate with the isalpha(3) interface (except for `word',
-which is a Perl extension, mirroring C<\w>).
+classes correlate with the usual isalpha(3) interface (except for
+`word' and `blank').
The assumedly non-obviously named classes are:
diff --git a/pod/perlretut.pod b/pod/perlretut.pod
index 66f8179ab6..87669e50ab 100644
--- a/pod/perlretut.pod
+++ b/pod/perlretut.pod
@@ -1672,15 +1672,17 @@ i.e., a non-mark followed by one or more marks.
As if all those classes weren't enough, Perl also defines POSIX style
character classes. These have the form C<[:name:]>, with C<name> the
-name of the POSIX class. The POSIX classes are alpha, alnum, ascii,
-cntrl, digit, graph, lower, print, punct, space, upper, word, and
-xdigit. If C<utf8> is being used, then these classes are defined the
-same as their corresponding perl Unicode classes: C<[:upper:]> is the
-same as C<\p{IsUpper}>, etc. The POSIX character classes, however,
-don't require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and
+name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>,
+C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>,
+C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl
+extension to match C<\w>), and C<blank> (a GNU extension). If C<utf8>
+is being used, then these classes are defined the same as their
+corresponding perl Unicode classes: C<[:upper:]> is the same as
+C<\p{IsUpper}>, etc. The POSIX character classes, however, don't
+require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and
C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s>
-character classes. To negate a POSIX class, put a C<^> in front of the
-name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
+character classes. To negate a POSIX class, put a C<^> in front of
+the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
C<utf8>, C<\P{IsDigit}>. The Unicode and POSIX character classes can
be used just like C<\d>, both inside and outside of character classes:
diff --git a/regcomp.c b/regcomp.c
index f0b7c5cd18..d2195b0ca1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -443,7 +443,7 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
{
int value;
- for (value = 0; value < ANYOF_MAX; value += 2)
+ for (value = 0; value <= ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
for (value = 0; value < 256; ++value)
@@ -3004,6 +3004,11 @@ S_regpposixcc(pTHX_ I32 value)
namedclass =
complement ? ANYOF_NASCII : ANYOF_ASCII;
break;
+ case 'b':
+ if (strnEQ(posixcc, "blank", 5))
+ namedclass =
+ complement ? ANYOF_NBLANK : ANYOF_BLANK;
+ break;
case 'c':
if (strnEQ(posixcc, "cntrl", 5))
namedclass =
@@ -3035,7 +3040,7 @@ S_regpposixcc(pTHX_ I32 value)
case 's':
if (strnEQ(posixcc, "space", 5))
namedclass =
- complement ? ANYOF_NSPACE : ANYOF_SPACE;
+ complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
break;
case 'u':
if (strnEQ(posixcc, "upper", 5))
@@ -3160,7 +3165,7 @@ S_regclass(pTHX)
else if (value == '\\') {
value = UCHARAT(PL_regcomp_parse++);
/* Some compilers cannot handle switching on 64-bit integer
- * values, therefore value cannot be an UV. --jhi */
+ * values, therefore the 'value' cannot be an UV. --jhi */
switch (value) {
case 'w': namedclass = ANYOF_ALNUM; break;
case 'W': namedclass = ANYOF_NALNUM; break;
@@ -3339,6 +3344,24 @@ S_regclass(pTHX)
#endif /* EBCDIC */
}
break;
+ case ANYOF_BLANK:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_BLANK);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isBLANK(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ break;
+ case ANYOF_NBLANK:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isBLANK(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ break;
case ANYOF_CNTRL:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
@@ -3412,6 +3435,24 @@ S_regclass(pTHX)
ANYOF_BITMAP_SET(ret, value);
}
break;
+ case ANYOF_PSXSPC:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isPSXSPC(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ break;
+ case ANYOF_NPSXSPC:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isPSXSPC(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ break;
case ANYOF_PUNCT:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
@@ -3739,8 +3780,12 @@ S_regclassutf8(pTHX)
case ANYOF_NPUNCT:
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
case ANYOF_SPACE:
+ case ANYOF_PSXSPC:
+ case ANYOF_BLANK:
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
case ANYOF_NSPACE:
+ case ANYOF_NPSXSPC:
+ case ANYOF_NBLANK:
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
case ANYOF_UPPER:
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
@@ -4193,7 +4238,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
else if (k == ANYOF) {
int i, rangestart = -1;
const char * const out[] = { /* Should be syncronized with
- a table in regcomp.h */
+ ANYOF_ #xdefines in regcomp.h */
"\\w",
"\\W",
"\\s",
@@ -4217,9 +4262,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
"[:punct:]",
"[:^punct:]",
"[:upper:]",
- "[:!upper:]",
+ "[:^upper:]",
"[:xdigit:]",
- "[:^xdigit:]"
+ "[:^xdigit:]",
+ "[:space:]",
+ "[:^space:]",
+ "[:blank:]",
+ "[:^blank:]"
};
if (o->flags & ANYOF_LOCALE)
diff --git a/regcomp.h b/regcomp.h
index 34c5c251be..e30e67f30f 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -194,7 +194,7 @@ struct regnode_charclass_class {
#define ANYOF_ALNUM 0 /* \w, utf8::IsWord, isALNUM() */
#define ANYOF_NALNUM 1
-#define ANYOF_SPACE 2
+#define ANYOF_SPACE 2 /* \s */
#define ANYOF_NSPACE 3
#define ANYOF_DIGIT 4
#define ANYOF_NDIGIT 5
@@ -218,8 +218,12 @@ struct regnode_charclass_class {
#define ANYOF_NUPPER 23
#define ANYOF_XDIGIT 24
#define ANYOF_NXDIGIT 25
+#define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */
+#define ANYOF_NPSXSPC 27
+#define ANYOF_BLANK 28
+#define ANYOF_NBLANK 29 /* GNU extension: space and tab */
-#define ANYOF_MAX 31
+#define ANYOF_MAX 32
/* Backward source code compatibility. */
diff --git a/regexec.c b/regexec.c
index cbc8c199b5..2004cc4cdf 100644
--- a/regexec.c
+++ b/regexec.c
@@ -3625,7 +3625,11 @@ S_reginclass(pTHX_ register regnode *p, register I32 c)
(ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
(ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
(ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
+ (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c))
) /* How's that for a conditional? */
{
match = TRUE;
diff --git a/t/op/pat.t b/t/op/pat.t
index 91c4b7d58f..2ba6d934d2 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
# the format supported by op/regexp.t. If you want to add a test
# that does fit that format, add it to op/re_tests, not here.
-print "1..220\n";
+print "1..223\n";
BEGIN {
chdir 't' if -d 't';
@@ -1058,3 +1058,29 @@ $w = 0;
}
print $w ? "not " : "", "ok $test\n";
$test++;
+
+my %space = ( spc => " ",
+ tab => "\t",
+ cr => "\r",
+ lf => "\n",
+ ff => "\f",
+# The vertical tabulator seems miraculously be 12 both in ASCII and EBCDIC.
+ vt => chr(11),
+ false => "space" );
+
+my @space0 = sort grep { $space{$_} =~ /\s/ } keys %space;
+my @space1 = sort grep { $space{$_} =~ /[[:space:]]/ } keys %space;
+my @space2 = sort grep { $space{$_} =~ /[[:blank:]]/ } keys %space;
+
+print "not " unless "@space0" eq "cr ff lf spc tab";
+print "ok $test\n";
+$test++;
+
+print "not " unless "@space1" eq "cr ff lf spc tab vt";
+print "ok $test\n";
+$test++;
+
+print "not " unless "@space2" eq "spc tab";
+print "ok $test\n";
+$test++;
+