diff options
-rw-r--r-- | lib/utf8_heavy.pl | 49 | ||||
-rw-r--r-- | pod/perlre.pod | 4 | ||||
-rw-r--r-- | pod/perlunicode.pod | 80 | ||||
-rwxr-xr-x | t/op/pat.t | 51 |
4 files changed, 167 insertions, 17 deletions
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 70bd018f81..29d4ac2c7d 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -81,6 +81,25 @@ sub SWASHNEW { last GETFILE; } + ## + ## It could be a user-defined property. + ## + + if ($type =~ /^I[ns](\w+)$/) { + my @caller = caller(1); + + if (defined $caller[0]) { + my $prop = $caller[0] . "::" . $type; + + if (exists &{$prop}) { + no strict 'refs'; + + $list = &{$prop}; + last GETFILE; + } + } + } + ## ## Last attempt -- see if it's a "To" name (e.g. "ToLower") ## @@ -99,21 +118,25 @@ sub SWASHNEW { return $type; } - print "found it (file='$file')\n" if DEBUG; + if (defined $file) { + print "found it (file='$file')\n" if DEBUG; + + ## + ## If we reach here, it was due to a 'last GETFILE' above + ## (exception: user-defined properties), so we + ## have a filename, so now we load it if we haven't already. + ## If we have, return the cached results. The cache key is the + ## file to load. + ## + if ($Cache{$file} and ref($Cache{$file}) eq $class) + { + print "Returning cached '$file' for \\p{$type}\n" if DEBUG; + return $Cache{$class, $file}; + } - ## - ## If we reach here, it was due to a 'last GETFILE' above, so we - ## have a filename, so now we load it if we haven't already. - ## If we have, return the cached results. The cache key is the - ## file to load. - ## - if ($Cache{$file} and ref($Cache{$file}) eq $class) - { - print "Returning cached '$file' for \\p{$type}\n" if DEBUG; - return $Cache{$class, $file}; - } + $list = do $file; + } - $list = do $file; $ListSorted = 1; ## we know that these lists are sorted } diff --git a/pod/perlre.pod b/pod/perlre.pod index f2ce3ffff1..c0d4e8955b 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -198,8 +198,8 @@ C<\d>, and C<\D> within character classes, but if you try to use them as endpoints of a range, that's not a range, the "-" is understood literally. If Unicode is in effect, C<\s> matches also "\x{85}", "\x{2028}, and "\x{2029}", see L<perlunicode> for more details about -C<\pP>, C<\PP>, and C<\X>, and L<perluniintro> about Unicode in -general. +C<\pP>, C<\PP>, and C<\X>, and L<perluniintro> about Unicode in general. +You can define your own C<\p> and C<\P> propreties, see L<perlunicode>. The POSIX character class syntax diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index af79344402..46080430a7 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -615,6 +615,86 @@ And finally, C<scalar reverse()> reverses by character rather than by byte. =back +=head2 Defining your own character properties + +You can define your own character properties by defining subroutines +that have names beginning with "In" or "Is". The subroutines must be +visible in the package that uses the properties. The user-defined +properties can be used in the regular expression C<\p> and C<\P> +constructs. + +The subroutines must return a specially formatted string: one or more +newline-separated lines. Each line must be one of the following: + +=over 4 + +=item * + +Two hexadecimal numbers separated by a tabulator denoting a range +of Unicode codepoints. + +=item * + +An existing character property prefixed by "+utf8::" to include +all the characters in that property. + +=item * + +An existing character property prefixed by "-utf8::" to exclude +all the characters in that property. + +=item * + +An existing character property prefixed by "!utf8::" to include +all except the characters in that property. + +=back + +For example, to define a property that covers both the Japanese +syllabaries (hiragana and katakana), you can define + + sub InKana { + return <<'END'; + 3040 309F + 30A0 30FF + END + } + +Imagine that the here-doc end marker is at the beginning of the line, +and that the hexadecimal numbers are separated by a tabulator. +Now you can use C<\p{InKana}> and C<\P{IsKana}>. + +You could also have used the existing block property names: + + sub InKana { + return <<'END'; + +utf8::InHiragana + +utf8::InKatakana + END + } + +Suppose you wanted to match only the allocated characters, +not the by raw block ranges: in other words, you want to remove +the non-characters: + + sub InKana { + return <<'END'; + +utf8::InHiragana + +utf8::InKatakana + -utf8::IsCn + END + } + +The negation is useful for defining (surprise!) negated classes. + + sub InNotKana { + return <<'END'; + !utf8::InHiragana + -utf8::InKatakana + +utf8::IsCn + END + } + =head2 Character encodings for input and output See L<Encode>. diff --git a/t/op/pat.t b/t/op/pat.t index 853c59ccd2..905204bed1 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..900\n"; +print "1..908\n"; BEGIN { chdir 't' if -d 't'; @@ -2800,7 +2800,7 @@ print "# some Unicode properties\n"; } { - # [ID 20020412.005] wrong pmop flags checked when empty pattern + print "# [ID 20020412.005] wrong pmop flags checked when empty pattern\n"; # requires reuse of last successful pattern my $test = 898; $test =~ /\d/; @@ -2823,3 +2823,50 @@ print "# some Unicode properties\n"; } ++$test; } + +print "# user-defined character properties\n"; + +sub InKana1 { + return <<'END'; +3040 309F +30A0 30FF +END +} + +sub InKana2 { + return <<'END'; ++utf8::InHiragana ++utf8::InKatakana +END +} + +sub InKana3 { + return <<'END'; ++utf8::InHiragana ++utf8::InKatakana +-utf8::IsCn +END +} + +sub InNotKana { + return <<'END'; +!utf8::InHiragana +-utf8::InKatakana ++utf8::IsCn +END +} + +$test = 901; + +print "\x{3040}" =~ /\p{InKana1}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{303F}" =~ /\P{InKana1}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +print "\x{3040}" =~ /\p{InKana2}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{303F}" =~ /\P{InKana2}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +print "\x{3041}" =~ /\p{InKana3}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{3040}" =~ /\P{InKana3}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +print "\x{3040}" =~ /\p{InNotKana}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{3041}" =~ /\P{InNotKana}/ ? "ok $test\n" : "not ok $test\n"; $test++; + |