summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/utf8_heavy.pl49
-rw-r--r--pod/perlre.pod4
-rw-r--r--pod/perlunicode.pod80
-rwxr-xr-xt/op/pat.t51
4 files changed, 167 insertions, 17 deletions
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 70bd018f81..29d4ac2c7d 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -81,6 +81,25 @@ sub SWASHNEW {
last GETFILE;
}
+ ##
+ ## It could be a user-defined property.
+ ##
+
+ if ($type =~ /^I[ns](\w+)$/) {
+ my @caller = caller(1);
+
+ if (defined $caller[0]) {
+ my $prop = $caller[0] . "::" . $type;
+
+ if (exists &{$prop}) {
+ no strict 'refs';
+
+ $list = &{$prop};
+ last GETFILE;
+ }
+ }
+ }
+
##
## Last attempt -- see if it's a "To" name (e.g. "ToLower")
##
@@ -99,21 +118,25 @@ sub SWASHNEW {
return $type;
}
- print "found it (file='$file')\n" if DEBUG;
+ if (defined $file) {
+ print "found it (file='$file')\n" if DEBUG;
+
+ ##
+ ## If we reach here, it was due to a 'last GETFILE' above
+ ## (exception: user-defined properties), so we
+ ## have a filename, so now we load it if we haven't already.
+ ## If we have, return the cached results. The cache key is the
+ ## file to load.
+ ##
+ if ($Cache{$file} and ref($Cache{$file}) eq $class)
+ {
+ print "Returning cached '$file' for \\p{$type}\n" if DEBUG;
+ return $Cache{$class, $file};
+ }
- ##
- ## If we reach here, it was due to a 'last GETFILE' above, so we
- ## have a filename, so now we load it if we haven't already.
- ## If we have, return the cached results. The cache key is the
- ## file to load.
- ##
- if ($Cache{$file} and ref($Cache{$file}) eq $class)
- {
- print "Returning cached '$file' for \\p{$type}\n" if DEBUG;
- return $Cache{$class, $file};
- }
+ $list = do $file;
+ }
- $list = do $file;
$ListSorted = 1; ## we know that these lists are sorted
}
diff --git a/pod/perlre.pod b/pod/perlre.pod
index f2ce3ffff1..c0d4e8955b 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -198,8 +198,8 @@ C<\d>, and C<\D> within character classes, but if you try to use them
as endpoints of a range, that's not a range, the "-" is understood
literally. If Unicode is in effect, C<\s> matches also "\x{85}",
"\x{2028}, and "\x{2029}", see L<perlunicode> for more details about
-C<\pP>, C<\PP>, and C<\X>, and L<perluniintro> about Unicode in
-general.
+C<\pP>, C<\PP>, and C<\X>, and L<perluniintro> about Unicode in general.
+You can define your own C<\p> and C<\P> propreties, see L<perlunicode>.
The POSIX character class syntax
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index af79344402..46080430a7 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -615,6 +615,86 @@ And finally, C<scalar reverse()> reverses by character rather than by byte.
=back
+=head2 Defining your own character properties
+
+You can define your own character properties by defining subroutines
+that have names beginning with "In" or "Is". The subroutines must be
+visible in the package that uses the properties. The user-defined
+properties can be used in the regular expression C<\p> and C<\P>
+constructs.
+
+The subroutines must return a specially formatted string: one or more
+newline-separated lines. Each line must be one of the following:
+
+=over 4
+
+=item *
+
+Two hexadecimal numbers separated by a tabulator denoting a range
+of Unicode codepoints.
+
+=item *
+
+An existing character property prefixed by "+utf8::" to include
+all the characters in that property.
+
+=item *
+
+An existing character property prefixed by "-utf8::" to exclude
+all the characters in that property.
+
+=item *
+
+An existing character property prefixed by "!utf8::" to include
+all except the characters in that property.
+
+=back
+
+For example, to define a property that covers both the Japanese
+syllabaries (hiragana and katakana), you can define
+
+ sub InKana {
+ return <<'END';
+ 3040 309F
+ 30A0 30FF
+ END
+ }
+
+Imagine that the here-doc end marker is at the beginning of the line,
+and that the hexadecimal numbers are separated by a tabulator.
+Now you can use C<\p{InKana}> and C<\P{IsKana}>.
+
+You could also have used the existing block property names:
+
+ sub InKana {
+ return <<'END';
+ +utf8::InHiragana
+ +utf8::InKatakana
+ END
+ }
+
+Suppose you wanted to match only the allocated characters,
+not the by raw block ranges: in other words, you want to remove
+the non-characters:
+
+ sub InKana {
+ return <<'END';
+ +utf8::InHiragana
+ +utf8::InKatakana
+ -utf8::IsCn
+ END
+ }
+
+The negation is useful for defining (surprise!) negated classes.
+
+ sub InNotKana {
+ return <<'END';
+ !utf8::InHiragana
+ -utf8::InKatakana
+ +utf8::IsCn
+ END
+ }
+
=head2 Character encodings for input and output
See L<Encode>.
diff --git a/t/op/pat.t b/t/op/pat.t
index 853c59ccd2..905204bed1 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
$| = 1;
-print "1..900\n";
+print "1..908\n";
BEGIN {
chdir 't' if -d 't';
@@ -2800,7 +2800,7 @@ print "# some Unicode properties\n";
}
{
- # [ID 20020412.005] wrong pmop flags checked when empty pattern
+ print "# [ID 20020412.005] wrong pmop flags checked when empty pattern\n";
# requires reuse of last successful pattern
my $test = 898;
$test =~ /\d/;
@@ -2823,3 +2823,50 @@ print "# some Unicode properties\n";
}
++$test;
}
+
+print "# user-defined character properties\n";
+
+sub InKana1 {
+ return <<'END';
+3040 309F
+30A0 30FF
+END
+}
+
+sub InKana2 {
+ return <<'END';
++utf8::InHiragana
++utf8::InKatakana
+END
+}
+
+sub InKana3 {
+ return <<'END';
++utf8::InHiragana
++utf8::InKatakana
+-utf8::IsCn
+END
+}
+
+sub InNotKana {
+ return <<'END';
+!utf8::InHiragana
+-utf8::InKatakana
++utf8::IsCn
+END
+}
+
+$test = 901;
+
+print "\x{3040}" =~ /\p{InKana1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+print "\x{303F}" =~ /\P{InKana1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+
+print "\x{3040}" =~ /\p{InKana2}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+print "\x{303F}" =~ /\P{InKana2}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+
+print "\x{3041}" =~ /\p{InKana3}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+print "\x{3040}" =~ /\P{InKana3}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+
+print "\x{3040}" =~ /\p{InNotKana}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+print "\x{3041}" =~ /\P{InNotKana}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+