diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2004-03-03 11:37:21 +0200 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2004-03-07 22:30:45 +0000 |
commit | 08952adc863bc4a67f8bd3d4c7da553683565739 (patch) | |
tree | bc8b5f1850b509c5e3f5bf280d76ca105aea9b20 | |
parent | 8bd77110f5da47faabf6ba3a6e7ab9fd075701a8 (diff) | |
download | perl-08952adc863bc4a67f8bd3d4c7da553683565739.tar.gz |
Integrate:
[ 22427]
Speed up the unicode case mappings (for /i, lc, uc, etc).
Subject: [PATCH] [perl #24826]
Message-Id: <9B5CBF96-6CE5-11D8-83B0-00039362CB92@iki.fi>
[ 22430]
Clarify the difference between utf8::downgrade/upgrade
and utf8::encode/decode (patch by Jarkko).
[ 22439]
ensure utf8::encode() normalises its arg
[ 22444]
A small perluniintro clarification by Jarkko.
[ 22463]
Add a readonly check to Perl_sv_utf8_upgrade_flags, a regression test
in utf8.t, and fix 3 bugs it exposed in utfhash.t
p4raw-link: @22463 on //depot/perl: 5fec3b1d36062f79cb996123dc191025c139d617
p4raw-link: @22444 on //depot/perl: c0c50798904ed219d069da8d28f789f4cc6e4fb2
p4raw-link: @22439 on //depot/perl: 42b1db35bd6485f66426fa704408dc673fb999f1
p4raw-link: @22430 on //depot/perl: 2c9359a248d51da75ec39822c411d2e97fe5c631
p4raw-link: @22427 on //depot/perl: b08cf34e02b6d68e83a46f7566341e6914ff1a2e
p4raw-id: //depot/maint-5.8/perl@22464
p4raw-integrated: from //depot/perl@22461 'copy in' t/uni/case.pl
(@15684..) lib/unicore/To/Fold.pl lib/unicore/To/Lower.pl
lib/unicore/To/Title.pl lib/unicore/To/Upper.pl (@19391..)
lib/unicore/mktables (@19686..) lib/utf8.t (@19778..)
t/op/utfhash.t (@20288..) lib/utf8.pm (@20878..) utf8.c
(@21297..)
p4raw-integrated: from //depot/perl@22439 'edit in' sv.c (@22353..)
p4raw-integrated: from //depot/perl@22430 'ignore' pod/perluniintro.pod
(@21198..)
-rw-r--r-- | lib/unicore/To/Fold.pl | 207 | ||||
-rw-r--r-- | lib/unicore/To/Lower.pl | 57 | ||||
-rw-r--r-- | lib/unicore/To/Title.pl | 151 | ||||
-rw-r--r-- | lib/unicore/To/Upper.pl | 205 | ||||
-rw-r--r-- | lib/unicore/mktables | 24 | ||||
-rw-r--r-- | lib/utf8.pm | 23 | ||||
-rw-r--r-- | lib/utf8.t | 8 | ||||
-rw-r--r-- | pod/perluniintro.pod | 6 | ||||
-rw-r--r-- | sv.c | 4 | ||||
-rw-r--r-- | t/op/utfhash.t | 9 | ||||
-rw-r--r-- | t/uni/case.pl | 16 | ||||
-rw-r--r-- | utf8.c | 22 |
12 files changed, 380 insertions, 352 deletions
diff --git a/lib/unicore/To/Fold.pl b/lib/unicore/To/Fold.pl index ee608336f8..33c5d466be 100644 --- a/lib/unicore/To/Fold.pl +++ b/lib/unicore/To/Fold.pl @@ -3,111 +3,112 @@ # Any changes made here will be lost! +# The ke UTF-8 _bytes_, the value UTF-8 (speed hack) %utf8::ToSpecFold = ( -'00DF' => "\x{0073}\x{0073}", -'0130' => "\x{0069}\x{0307}", -'0149' => "\x{02BC}\x{006E}", -'01F0' => "\x{006A}\x{030C}", -'0390' => "\x{03B9}\x{0308}\x{0301}", -'03B0' => "\x{03C5}\x{0308}\x{0301}", -'0587' => "\x{0565}\x{0582}", -'1E96' => "\x{0068}\x{0331}", -'1E97' => "\x{0074}\x{0308}", -'1E98' => "\x{0077}\x{030A}", -'1E99' => "\x{0079}\x{030A}", -'1E9A' => "\x{0061}\x{02BE}", -'1F50' => "\x{03C5}\x{0313}", -'1F52' => "\x{03C5}\x{0313}\x{0300}", -'1F54' => "\x{03C5}\x{0313}\x{0301}", -'1F56' => "\x{03C5}\x{0313}\x{0342}", -'1F80' => "\x{1F00}\x{03B9}", -'1F81' => "\x{1F01}\x{03B9}", -'1F82' => "\x{1F02}\x{03B9}", -'1F83' => "\x{1F03}\x{03B9}", -'1F84' => "\x{1F04}\x{03B9}", -'1F85' => "\x{1F05}\x{03B9}", -'1F86' => "\x{1F06}\x{03B9}", -'1F87' => "\x{1F07}\x{03B9}", -'1F88' => "\x{1F00}\x{03B9}", -'1F89' => "\x{1F01}\x{03B9}", -'1F8A' => "\x{1F02}\x{03B9}", -'1F8B' => "\x{1F03}\x{03B9}", -'1F8C' => "\x{1F04}\x{03B9}", -'1F8D' => "\x{1F05}\x{03B9}", -'1F8E' => "\x{1F06}\x{03B9}", -'1F8F' => "\x{1F07}\x{03B9}", -'1F90' => "\x{1F20}\x{03B9}", -'1F91' => "\x{1F21}\x{03B9}", -'1F92' => "\x{1F22}\x{03B9}", -'1F93' => "\x{1F23}\x{03B9}", -'1F94' => "\x{1F24}\x{03B9}", -'1F95' => "\x{1F25}\x{03B9}", -'1F96' => "\x{1F26}\x{03B9}", -'1F97' => "\x{1F27}\x{03B9}", -'1F98' => "\x{1F20}\x{03B9}", -'1F99' => "\x{1F21}\x{03B9}", -'1F9A' => "\x{1F22}\x{03B9}", -'1F9B' => "\x{1F23}\x{03B9}", -'1F9C' => "\x{1F24}\x{03B9}", -'1F9D' => "\x{1F25}\x{03B9}", -'1F9E' => "\x{1F26}\x{03B9}", -'1F9F' => "\x{1F27}\x{03B9}", -'1FA0' => "\x{1F60}\x{03B9}", -'1FA1' => "\x{1F61}\x{03B9}", -'1FA2' => "\x{1F62}\x{03B9}", -'1FA3' => "\x{1F63}\x{03B9}", -'1FA4' => "\x{1F64}\x{03B9}", -'1FA5' => 
"\x{1F65}\x{03B9}", -'1FA6' => "\x{1F66}\x{03B9}", -'1FA7' => "\x{1F67}\x{03B9}", -'1FA8' => "\x{1F60}\x{03B9}", -'1FA9' => "\x{1F61}\x{03B9}", -'1FAA' => "\x{1F62}\x{03B9}", -'1FAB' => "\x{1F63}\x{03B9}", -'1FAC' => "\x{1F64}\x{03B9}", -'1FAD' => "\x{1F65}\x{03B9}", -'1FAE' => "\x{1F66}\x{03B9}", -'1FAF' => "\x{1F67}\x{03B9}", -'1FB2' => "\x{1F70}\x{03B9}", -'1FB3' => "\x{03B1}\x{03B9}", -'1FB4' => "\x{03AC}\x{03B9}", -'1FB6' => "\x{03B1}\x{0342}", -'1FB7' => "\x{03B1}\x{0342}\x{03B9}", -'1FBC' => "\x{03B1}\x{03B9}", -'1FC2' => "\x{1F74}\x{03B9}", -'1FC3' => "\x{03B7}\x{03B9}", -'1FC4' => "\x{03AE}\x{03B9}", -'1FC6' => "\x{03B7}\x{0342}", -'1FC7' => "\x{03B7}\x{0342}\x{03B9}", -'1FCC' => "\x{03B7}\x{03B9}", -'1FD2' => "\x{03B9}\x{0308}\x{0300}", -'1FD3' => "\x{03B9}\x{0308}\x{0301}", -'1FD6' => "\x{03B9}\x{0342}", -'1FD7' => "\x{03B9}\x{0308}\x{0342}", -'1FE2' => "\x{03C5}\x{0308}\x{0300}", -'1FE3' => "\x{03C5}\x{0308}\x{0301}", -'1FE4' => "\x{03C1}\x{0313}", -'1FE6' => "\x{03C5}\x{0342}", -'1FE7' => "\x{03C5}\x{0308}\x{0342}", -'1FF2' => "\x{1F7C}\x{03B9}", -'1FF3' => "\x{03C9}\x{03B9}", -'1FF4' => "\x{03CE}\x{03B9}", -'1FF6' => "\x{03C9}\x{0342}", -'1FF7' => "\x{03C9}\x{0342}\x{03B9}", -'1FFC' => "\x{03C9}\x{03B9}", -'FB00' => "\x{0066}\x{0066}", -'FB01' => "\x{0066}\x{0069}", -'FB02' => "\x{0066}\x{006C}", -'FB03' => "\x{0066}\x{0066}\x{0069}", -'FB04' => "\x{0066}\x{0066}\x{006C}", -'FB05' => "\x{0073}\x{0074}", -'FB06' => "\x{0073}\x{0074}", -'FB13' => "\x{0574}\x{0576}", -'FB14' => "\x{0574}\x{0565}", -'FB15' => "\x{0574}\x{056B}", -'FB16' => "\x{057E}\x{0576}", -'FB17' => "\x{0574}\x{056D}", +"\xC3\x9F" => "\x{0073}\x{0073}", +"\xC4\xB0" => "\x{0069}\x{0307}", +"\xC5\x89" => "\x{02BC}\x{006E}", +"\xC7\xB0" => "\x{006A}\x{030C}", +"\xCE\x90" => "\x{03B9}\x{0308}\x{0301}", +"\xCE\xB0" => "\x{03C5}\x{0308}\x{0301}", +"\xD6\x87" => "\x{0565}\x{0582}", +"\xE1\xBA\x96" => "\x{0068}\x{0331}", +"\xE1\xBA\x97" => "\x{0074}\x{0308}", +"\xE1\xBA\x98" => 
"\x{0077}\x{030A}", +"\xE1\xBA\x99" => "\x{0079}\x{030A}", +"\xE1\xBA\x9A" => "\x{0061}\x{02BE}", +"\xE1\xBD\x90" => "\x{03C5}\x{0313}", +"\xE1\xBD\x92" => "\x{03C5}\x{0313}\x{0300}", +"\xE1\xBD\x94" => "\x{03C5}\x{0313}\x{0301}", +"\xE1\xBD\x96" => "\x{03C5}\x{0313}\x{0342}", +"\xE1\xBE\x80" => "\x{1F00}\x{03B9}", +"\xE1\xBE\x81" => "\x{1F01}\x{03B9}", +"\xE1\xBE\x82" => "\x{1F02}\x{03B9}", +"\xE1\xBE\x83" => "\x{1F03}\x{03B9}", +"\xE1\xBE\x84" => "\x{1F04}\x{03B9}", +"\xE1\xBE\x85" => "\x{1F05}\x{03B9}", +"\xE1\xBE\x86" => "\x{1F06}\x{03B9}", +"\xE1\xBE\x87" => "\x{1F07}\x{03B9}", +"\xE1\xBE\x88" => "\x{1F00}\x{03B9}", +"\xE1\xBE\x89" => "\x{1F01}\x{03B9}", +"\xE1\xBE\x8A" => "\x{1F02}\x{03B9}", +"\xE1\xBE\x8B" => "\x{1F03}\x{03B9}", +"\xE1\xBE\x8C" => "\x{1F04}\x{03B9}", +"\xE1\xBE\x8D" => "\x{1F05}\x{03B9}", +"\xE1\xBE\x8E" => "\x{1F06}\x{03B9}", +"\xE1\xBE\x8F" => "\x{1F07}\x{03B9}", +"\xE1\xBE\x90" => "\x{1F20}\x{03B9}", +"\xE1\xBE\x91" => "\x{1F21}\x{03B9}", +"\xE1\xBE\x92" => "\x{1F22}\x{03B9}", +"\xE1\xBE\x93" => "\x{1F23}\x{03B9}", +"\xE1\xBE\x94" => "\x{1F24}\x{03B9}", +"\xE1\xBE\x95" => "\x{1F25}\x{03B9}", +"\xE1\xBE\x96" => "\x{1F26}\x{03B9}", +"\xE1\xBE\x97" => "\x{1F27}\x{03B9}", +"\xE1\xBE\x98" => "\x{1F20}\x{03B9}", +"\xE1\xBE\x99" => "\x{1F21}\x{03B9}", +"\xE1\xBE\x9A" => "\x{1F22}\x{03B9}", +"\xE1\xBE\x9B" => "\x{1F23}\x{03B9}", +"\xE1\xBE\x9C" => "\x{1F24}\x{03B9}", +"\xE1\xBE\x9D" => "\x{1F25}\x{03B9}", +"\xE1\xBE\x9E" => "\x{1F26}\x{03B9}", +"\xE1\xBE\x9F" => "\x{1F27}\x{03B9}", +"\xE1\xBE\xA0" => "\x{1F60}\x{03B9}", +"\xE1\xBE\xA1" => "\x{1F61}\x{03B9}", +"\xE1\xBE\xA2" => "\x{1F62}\x{03B9}", +"\xE1\xBE\xA3" => "\x{1F63}\x{03B9}", +"\xE1\xBE\xA4" => "\x{1F64}\x{03B9}", +"\xE1\xBE\xA5" => "\x{1F65}\x{03B9}", +"\xE1\xBE\xA6" => "\x{1F66}\x{03B9}", +"\xE1\xBE\xA7" => "\x{1F67}\x{03B9}", +"\xE1\xBE\xA8" => "\x{1F60}\x{03B9}", +"\xE1\xBE\xA9" => "\x{1F61}\x{03B9}", +"\xE1\xBE\xAA" => "\x{1F62}\x{03B9}", +"\xE1\xBE\xAB" => "\x{1F63}\x{03B9}", 
+"\xE1\xBE\xAC" => "\x{1F64}\x{03B9}", +"\xE1\xBE\xAD" => "\x{1F65}\x{03B9}", +"\xE1\xBE\xAE" => "\x{1F66}\x{03B9}", +"\xE1\xBE\xAF" => "\x{1F67}\x{03B9}", +"\xE1\xBE\xB2" => "\x{1F70}\x{03B9}", +"\xE1\xBE\xB3" => "\x{03B1}\x{03B9}", +"\xE1\xBE\xB4" => "\x{03AC}\x{03B9}", +"\xE1\xBE\xB6" => "\x{03B1}\x{0342}", +"\xE1\xBE\xB7" => "\x{03B1}\x{0342}\x{03B9}", +"\xE1\xBE\xBC" => "\x{03B1}\x{03B9}", +"\xE1\xBF\x82" => "\x{1F74}\x{03B9}", +"\xE1\xBF\x83" => "\x{03B7}\x{03B9}", +"\xE1\xBF\x84" => "\x{03AE}\x{03B9}", +"\xE1\xBF\x86" => "\x{03B7}\x{0342}", +"\xE1\xBF\x87" => "\x{03B7}\x{0342}\x{03B9}", +"\xE1\xBF\x8C" => "\x{03B7}\x{03B9}", +"\xE1\xBF\x92" => "\x{03B9}\x{0308}\x{0300}", +"\xE1\xBF\x93" => "\x{03B9}\x{0308}\x{0301}", +"\xE1\xBF\x96" => "\x{03B9}\x{0342}", +"\xE1\xBF\x97" => "\x{03B9}\x{0308}\x{0342}", +"\xE1\xBF\xA2" => "\x{03C5}\x{0308}\x{0300}", +"\xE1\xBF\xA3" => "\x{03C5}\x{0308}\x{0301}", +"\xE1\xBF\xA4" => "\x{03C1}\x{0313}", +"\xE1\xBF\xA6" => "\x{03C5}\x{0342}", +"\xE1\xBF\xA7" => "\x{03C5}\x{0308}\x{0342}", +"\xE1\xBF\xB2" => "\x{1F7C}\x{03B9}", +"\xE1\xBF\xB3" => "\x{03C9}\x{03B9}", +"\xE1\xBF\xB4" => "\x{03CE}\x{03B9}", +"\xE1\xBF\xB6" => "\x{03C9}\x{0342}", +"\xE1\xBF\xB7" => "\x{03C9}\x{0342}\x{03B9}", +"\xE1\xBF\xBC" => "\x{03C9}\x{03B9}", +"\xEF\xAC\x80" => "\x{0066}\x{0066}", +"\xEF\xAC\x81" => "\x{0066}\x{0069}", +"\xEF\xAC\x82" => "\x{0066}\x{006C}", +"\xEF\xAC\x83" => "\x{0066}\x{0066}\x{0069}", +"\xEF\xAC\x84" => "\x{0066}\x{0066}\x{006C}", +"\xEF\xAC\x85" => "\x{0073}\x{0074}", +"\xEF\xAC\x86" => "\x{0073}\x{0074}", +"\xEF\xAC\x93" => "\x{0574}\x{0576}", +"\xEF\xAC\x94" => "\x{0574}\x{0565}", +"\xEF\xAC\x95" => "\x{0574}\x{056B}", +"\xEF\xAC\x96" => "\x{057E}\x{0576}", +"\xEF\xAC\x97" => "\x{0574}\x{056D}", ); return <<'END'; diff --git a/lib/unicore/To/Lower.pl b/lib/unicore/To/Lower.pl index 8bf064ad80..7e5adfeede 100644 --- a/lib/unicore/To/Lower.pl +++ b/lib/unicore/To/Lower.pl @@ -3,36 +3,37 @@ # Any changes made here will be lost! 
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack) %utf8::ToSpecLower = ( -'0130' => "\x{0069}\x{0307}", -'1F88' => "\x{1F80}", -'1F89' => "\x{1F81}", -'1F8A' => "\x{1F82}", -'1F8B' => "\x{1F83}", -'1F8C' => "\x{1F84}", -'1F8D' => "\x{1F85}", -'1F8E' => "\x{1F86}", -'1F8F' => "\x{1F87}", -'1F98' => "\x{1F90}", -'1F99' => "\x{1F91}", -'1F9A' => "\x{1F92}", -'1F9B' => "\x{1F93}", -'1F9C' => "\x{1F94}", -'1F9D' => "\x{1F95}", -'1F9E' => "\x{1F96}", -'1F9F' => "\x{1F97}", -'1FA8' => "\x{1FA0}", -'1FA9' => "\x{1FA1}", -'1FAA' => "\x{1FA2}", -'1FAB' => "\x{1FA3}", -'1FAC' => "\x{1FA4}", -'1FAD' => "\x{1FA5}", -'1FAE' => "\x{1FA6}", -'1FAF' => "\x{1FA7}", -'1FBC' => "\x{1FB3}", -'1FCC' => "\x{1FC3}", -'1FFC' => "\x{1FF3}", +"\xC4\xB0" => "\x{0069}\x{0307}", +"\xE1\xBE\x88" => "\x{1F80}", +"\xE1\xBE\x89" => "\x{1F81}", +"\xE1\xBE\x8A" => "\x{1F82}", +"\xE1\xBE\x8B" => "\x{1F83}", +"\xE1\xBE\x8C" => "\x{1F84}", +"\xE1\xBE\x8D" => "\x{1F85}", +"\xE1\xBE\x8E" => "\x{1F86}", +"\xE1\xBE\x8F" => "\x{1F87}", +"\xE1\xBE\x98" => "\x{1F90}", +"\xE1\xBE\x99" => "\x{1F91}", +"\xE1\xBE\x9A" => "\x{1F92}", +"\xE1\xBE\x9B" => "\x{1F93}", +"\xE1\xBE\x9C" => "\x{1F94}", +"\xE1\xBE\x9D" => "\x{1F95}", +"\xE1\xBE\x9E" => "\x{1F96}", +"\xE1\xBE\x9F" => "\x{1F97}", +"\xE1\xBE\xA8" => "\x{1FA0}", +"\xE1\xBE\xA9" => "\x{1FA1}", +"\xE1\xBE\xAA" => "\x{1FA2}", +"\xE1\xBE\xAB" => "\x{1FA3}", +"\xE1\xBE\xAC" => "\x{1FA4}", +"\xE1\xBE\xAD" => "\x{1FA5}", +"\xE1\xBE\xAE" => "\x{1FA6}", +"\xE1\xBE\xAF" => "\x{1FA7}", +"\xE1\xBE\xBC" => "\x{1FB3}", +"\xE1\xBF\x8C" => "\x{1FC3}", +"\xE1\xBF\xBC" => "\x{1FF3}", ); return <<'END'; diff --git a/lib/unicore/To/Title.pl b/lib/unicore/To/Title.pl index cfeccabcdc..2223f7bafc 100644 --- a/lib/unicore/To/Title.pl +++ b/lib/unicore/To/Title.pl @@ -3,83 +3,84 @@ # Any changes made here will be lost! 
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack) %utf8::ToSpecTitle = ( -'00DF' => "\x{0053}\x{0073}", -'0149' => "\x{02BC}\x{004E}", -'01F0' => "\x{004A}\x{030C}", -'0390' => "\x{0399}\x{0308}\x{0301}", -'03B0' => "\x{03A5}\x{0308}\x{0301}", -'0587' => "\x{0535}\x{0582}", -'1E96' => "\x{0048}\x{0331}", -'1E97' => "\x{0054}\x{0308}", -'1E98' => "\x{0057}\x{030A}", -'1E99' => "\x{0059}\x{030A}", -'1E9A' => "\x{0041}\x{02BE}", -'1F50' => "\x{03A5}\x{0313}", -'1F52' => "\x{03A5}\x{0313}\x{0300}", -'1F54' => "\x{03A5}\x{0313}\x{0301}", -'1F56' => "\x{03A5}\x{0313}\x{0342}", -'1F80' => "\x{1F88}", -'1F81' => "\x{1F89}", -'1F82' => "\x{1F8A}", -'1F83' => "\x{1F8B}", -'1F84' => "\x{1F8C}", -'1F85' => "\x{1F8D}", -'1F86' => "\x{1F8E}", -'1F87' => "\x{1F8F}", -'1F90' => "\x{1F98}", -'1F91' => "\x{1F99}", -'1F92' => "\x{1F9A}", -'1F93' => "\x{1F9B}", -'1F94' => "\x{1F9C}", -'1F95' => "\x{1F9D}", -'1F96' => "\x{1F9E}", -'1F97' => "\x{1F9F}", -'1FA0' => "\x{1FA8}", -'1FA1' => "\x{1FA9}", -'1FA2' => "\x{1FAA}", -'1FA3' => "\x{1FAB}", -'1FA4' => "\x{1FAC}", -'1FA5' => "\x{1FAD}", -'1FA6' => "\x{1FAE}", -'1FA7' => "\x{1FAF}", -'1FB2' => "\x{1FBA}\x{0345}", -'1FB3' => "\x{1FBC}", -'1FB4' => "\x{0386}\x{0345}", -'1FB6' => "\x{0391}\x{0342}", -'1FB7' => "\x{0391}\x{0342}\x{0345}", -'1FC2' => "\x{1FCA}\x{0345}", -'1FC3' => "\x{1FCC}", -'1FC4' => "\x{0389}\x{0345}", -'1FC6' => "\x{0397}\x{0342}", -'1FC7' => "\x{0397}\x{0342}\x{0345}", -'1FD2' => "\x{0399}\x{0308}\x{0300}", -'1FD3' => "\x{0399}\x{0308}\x{0301}", -'1FD6' => "\x{0399}\x{0342}", -'1FD7' => "\x{0399}\x{0308}\x{0342}", -'1FE2' => "\x{03A5}\x{0308}\x{0300}", -'1FE3' => "\x{03A5}\x{0308}\x{0301}", -'1FE4' => "\x{03A1}\x{0313}", -'1FE6' => "\x{03A5}\x{0342}", -'1FE7' => "\x{03A5}\x{0308}\x{0342}", -'1FF2' => "\x{1FFA}\x{0345}", -'1FF3' => "\x{1FFC}", -'1FF4' => "\x{038F}\x{0345}", -'1FF6' => "\x{03A9}\x{0342}", -'1FF7' => "\x{03A9}\x{0342}\x{0345}", -'FB00' => "\x{0046}\x{0066}", -'FB01' => "\x{0046}\x{0069}", -'FB02' => 
"\x{0046}\x{006C}", -'FB03' => "\x{0046}\x{0066}\x{0069}", -'FB04' => "\x{0046}\x{0066}\x{006C}", -'FB05' => "\x{0053}\x{0074}", -'FB06' => "\x{0053}\x{0074}", -'FB13' => "\x{0544}\x{0576}", -'FB14' => "\x{0544}\x{0565}", -'FB15' => "\x{0544}\x{056B}", -'FB16' => "\x{054E}\x{0576}", -'FB17' => "\x{0544}\x{056D}", +"\xC3\x9F" => "\x{0053}\x{0073}", +"\xC5\x89" => "\x{02BC}\x{004E}", +"\xC7\xB0" => "\x{004A}\x{030C}", +"\xCE\x90" => "\x{0399}\x{0308}\x{0301}", +"\xCE\xB0" => "\x{03A5}\x{0308}\x{0301}", +"\xD6\x87" => "\x{0535}\x{0582}", +"\xE1\xBA\x96" => "\x{0048}\x{0331}", +"\xE1\xBA\x97" => "\x{0054}\x{0308}", +"\xE1\xBA\x98" => "\x{0057}\x{030A}", +"\xE1\xBA\x99" => "\x{0059}\x{030A}", +"\xE1\xBA\x9A" => "\x{0041}\x{02BE}", +"\xE1\xBD\x90" => "\x{03A5}\x{0313}", +"\xE1\xBD\x92" => "\x{03A5}\x{0313}\x{0300}", +"\xE1\xBD\x94" => "\x{03A5}\x{0313}\x{0301}", +"\xE1\xBD\x96" => "\x{03A5}\x{0313}\x{0342}", +"\xE1\xBE\x80" => "\x{1F88}", +"\xE1\xBE\x81" => "\x{1F89}", +"\xE1\xBE\x82" => "\x{1F8A}", +"\xE1\xBE\x83" => "\x{1F8B}", +"\xE1\xBE\x84" => "\x{1F8C}", +"\xE1\xBE\x85" => "\x{1F8D}", +"\xE1\xBE\x86" => "\x{1F8E}", +"\xE1\xBE\x87" => "\x{1F8F}", +"\xE1\xBE\x90" => "\x{1F98}", +"\xE1\xBE\x91" => "\x{1F99}", +"\xE1\xBE\x92" => "\x{1F9A}", +"\xE1\xBE\x93" => "\x{1F9B}", +"\xE1\xBE\x94" => "\x{1F9C}", +"\xE1\xBE\x95" => "\x{1F9D}", +"\xE1\xBE\x96" => "\x{1F9E}", +"\xE1\xBE\x97" => "\x{1F9F}", +"\xE1\xBE\xA0" => "\x{1FA8}", +"\xE1\xBE\xA1" => "\x{1FA9}", +"\xE1\xBE\xA2" => "\x{1FAA}", +"\xE1\xBE\xA3" => "\x{1FAB}", +"\xE1\xBE\xA4" => "\x{1FAC}", +"\xE1\xBE\xA5" => "\x{1FAD}", +"\xE1\xBE\xA6" => "\x{1FAE}", +"\xE1\xBE\xA7" => "\x{1FAF}", +"\xE1\xBE\xB2" => "\x{1FBA}\x{0345}", +"\xE1\xBE\xB3" => "\x{1FBC}", +"\xE1\xBE\xB4" => "\x{0386}\x{0345}", +"\xE1\xBE\xB6" => "\x{0391}\x{0342}", +"\xE1\xBE\xB7" => "\x{0391}\x{0342}\x{0345}", +"\xE1\xBF\x82" => "\x{1FCA}\x{0345}", +"\xE1\xBF\x83" => "\x{1FCC}", +"\xE1\xBF\x84" => "\x{0389}\x{0345}", +"\xE1\xBF\x86" => 
"\x{0397}\x{0342}", +"\xE1\xBF\x87" => "\x{0397}\x{0342}\x{0345}", +"\xE1\xBF\x92" => "\x{0399}\x{0308}\x{0300}", +"\xE1\xBF\x93" => "\x{0399}\x{0308}\x{0301}", +"\xE1\xBF\x96" => "\x{0399}\x{0342}", +"\xE1\xBF\x97" => "\x{0399}\x{0308}\x{0342}", +"\xE1\xBF\xA2" => "\x{03A5}\x{0308}\x{0300}", +"\xE1\xBF\xA3" => "\x{03A5}\x{0308}\x{0301}", +"\xE1\xBF\xA4" => "\x{03A1}\x{0313}", +"\xE1\xBF\xA6" => "\x{03A5}\x{0342}", +"\xE1\xBF\xA7" => "\x{03A5}\x{0308}\x{0342}", +"\xE1\xBF\xB2" => "\x{1FFA}\x{0345}", +"\xE1\xBF\xB3" => "\x{1FFC}", +"\xE1\xBF\xB4" => "\x{038F}\x{0345}", +"\xE1\xBF\xB6" => "\x{03A9}\x{0342}", +"\xE1\xBF\xB7" => "\x{03A9}\x{0342}\x{0345}", +"\xEF\xAC\x80" => "\x{0046}\x{0066}", +"\xEF\xAC\x81" => "\x{0046}\x{0069}", +"\xEF\xAC\x82" => "\x{0046}\x{006C}", +"\xEF\xAC\x83" => "\x{0046}\x{0066}\x{0069}", +"\xEF\xAC\x84" => "\x{0046}\x{0066}\x{006C}", +"\xEF\xAC\x85" => "\x{0053}\x{0074}", +"\xEF\xAC\x86" => "\x{0053}\x{0074}", +"\xEF\xAC\x93" => "\x{0544}\x{0576}", +"\xEF\xAC\x94" => "\x{0544}\x{0565}", +"\xEF\xAC\x95" => "\x{0544}\x{056B}", +"\xEF\xAC\x96" => "\x{054E}\x{0576}", +"\xEF\xAC\x97" => "\x{0544}\x{056D}", ); return <<'END'; diff --git a/lib/unicore/To/Upper.pl b/lib/unicore/To/Upper.pl index 1a42789016..6c8fd30577 100644 --- a/lib/unicore/To/Upper.pl +++ b/lib/unicore/To/Upper.pl @@ -3,110 +3,111 @@ # Any changes made here will be lost! 
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack) %utf8::ToSpecUpper = ( -'00DF' => "\x{0053}\x{0053}", -'0149' => "\x{02BC}\x{004E}", -'01F0' => "\x{004A}\x{030C}", -'0390' => "\x{0399}\x{0308}\x{0301}", -'03B0' => "\x{03A5}\x{0308}\x{0301}", -'0587' => "\x{0535}\x{0552}", -'1E96' => "\x{0048}\x{0331}", -'1E97' => "\x{0054}\x{0308}", -'1E98' => "\x{0057}\x{030A}", -'1E99' => "\x{0059}\x{030A}", -'1E9A' => "\x{0041}\x{02BE}", -'1F50' => "\x{03A5}\x{0313}", -'1F52' => "\x{03A5}\x{0313}\x{0300}", -'1F54' => "\x{03A5}\x{0313}\x{0301}", -'1F56' => "\x{03A5}\x{0313}\x{0342}", -'1F80' => "\x{1F08}\x{0399}", -'1F81' => "\x{1F09}\x{0399}", -'1F82' => "\x{1F0A}\x{0399}", -'1F83' => "\x{1F0B}\x{0399}", -'1F84' => "\x{1F0C}\x{0399}", -'1F85' => "\x{1F0D}\x{0399}", -'1F86' => "\x{1F0E}\x{0399}", -'1F87' => "\x{1F0F}\x{0399}", -'1F88' => "\x{1F08}\x{0399}", -'1F89' => "\x{1F09}\x{0399}", -'1F8A' => "\x{1F0A}\x{0399}", -'1F8B' => "\x{1F0B}\x{0399}", -'1F8C' => "\x{1F0C}\x{0399}", -'1F8D' => "\x{1F0D}\x{0399}", -'1F8E' => "\x{1F0E}\x{0399}", -'1F8F' => "\x{1F0F}\x{0399}", -'1F90' => "\x{1F28}\x{0399}", -'1F91' => "\x{1F29}\x{0399}", -'1F92' => "\x{1F2A}\x{0399}", -'1F93' => "\x{1F2B}\x{0399}", -'1F94' => "\x{1F2C}\x{0399}", -'1F95' => "\x{1F2D}\x{0399}", -'1F96' => "\x{1F2E}\x{0399}", -'1F97' => "\x{1F2F}\x{0399}", -'1F98' => "\x{1F28}\x{0399}", -'1F99' => "\x{1F29}\x{0399}", -'1F9A' => "\x{1F2A}\x{0399}", -'1F9B' => "\x{1F2B}\x{0399}", -'1F9C' => "\x{1F2C}\x{0399}", -'1F9D' => "\x{1F2D}\x{0399}", -'1F9E' => "\x{1F2E}\x{0399}", -'1F9F' => "\x{1F2F}\x{0399}", -'1FA0' => "\x{1F68}\x{0399}", -'1FA1' => "\x{1F69}\x{0399}", -'1FA2' => "\x{1F6A}\x{0399}", -'1FA3' => "\x{1F6B}\x{0399}", -'1FA4' => "\x{1F6C}\x{0399}", -'1FA5' => "\x{1F6D}\x{0399}", -'1FA6' => "\x{1F6E}\x{0399}", -'1FA7' => "\x{1F6F}\x{0399}", -'1FA8' => "\x{1F68}\x{0399}", -'1FA9' => "\x{1F69}\x{0399}", -'1FAA' => "\x{1F6A}\x{0399}", -'1FAB' => "\x{1F6B}\x{0399}", -'1FAC' => "\x{1F6C}\x{0399}", -'1FAD' => 
"\x{1F6D}\x{0399}", -'1FAE' => "\x{1F6E}\x{0399}", -'1FAF' => "\x{1F6F}\x{0399}", -'1FB2' => "\x{1FBA}\x{0399}", -'1FB3' => "\x{0391}\x{0399}", -'1FB4' => "\x{0386}\x{0399}", -'1FB6' => "\x{0391}\x{0342}", -'1FB7' => "\x{0391}\x{0342}\x{0399}", -'1FBC' => "\x{0391}\x{0399}", -'1FC2' => "\x{1FCA}\x{0399}", -'1FC3' => "\x{0397}\x{0399}", -'1FC4' => "\x{0389}\x{0399}", -'1FC6' => "\x{0397}\x{0342}", -'1FC7' => "\x{0397}\x{0342}\x{0399}", -'1FCC' => "\x{0397}\x{0399}", -'1FD2' => "\x{0399}\x{0308}\x{0300}", -'1FD3' => "\x{0399}\x{0308}\x{0301}", -'1FD6' => "\x{0399}\x{0342}", -'1FD7' => "\x{0399}\x{0308}\x{0342}", -'1FE2' => "\x{03A5}\x{0308}\x{0300}", -'1FE3' => "\x{03A5}\x{0308}\x{0301}", -'1FE4' => "\x{03A1}\x{0313}", -'1FE6' => "\x{03A5}\x{0342}", -'1FE7' => "\x{03A5}\x{0308}\x{0342}", -'1FF2' => "\x{1FFA}\x{0399}", -'1FF3' => "\x{03A9}\x{0399}", -'1FF4' => "\x{038F}\x{0399}", -'1FF6' => "\x{03A9}\x{0342}", -'1FF7' => "\x{03A9}\x{0342}\x{0399}", -'1FFC' => "\x{03A9}\x{0399}", -'FB00' => "\x{0046}\x{0046}", -'FB01' => "\x{0046}\x{0049}", -'FB02' => "\x{0046}\x{004C}", -'FB03' => "\x{0046}\x{0046}\x{0049}", -'FB04' => "\x{0046}\x{0046}\x{004C}", -'FB05' => "\x{0053}\x{0054}", -'FB06' => "\x{0053}\x{0054}", -'FB13' => "\x{0544}\x{0546}", -'FB14' => "\x{0544}\x{0535}", -'FB15' => "\x{0544}\x{053B}", -'FB16' => "\x{054E}\x{0546}", -'FB17' => "\x{0544}\x{053D}", +"\xC3\x9F" => "\x{0053}\x{0053}", +"\xC5\x89" => "\x{02BC}\x{004E}", +"\xC7\xB0" => "\x{004A}\x{030C}", +"\xCE\x90" => "\x{0399}\x{0308}\x{0301}", +"\xCE\xB0" => "\x{03A5}\x{0308}\x{0301}", +"\xD6\x87" => "\x{0535}\x{0552}", +"\xE1\xBA\x96" => "\x{0048}\x{0331}", +"\xE1\xBA\x97" => "\x{0054}\x{0308}", +"\xE1\xBA\x98" => "\x{0057}\x{030A}", +"\xE1\xBA\x99" => "\x{0059}\x{030A}", +"\xE1\xBA\x9A" => "\x{0041}\x{02BE}", +"\xE1\xBD\x90" => "\x{03A5}\x{0313}", +"\xE1\xBD\x92" => "\x{03A5}\x{0313}\x{0300}", +"\xE1\xBD\x94" => "\x{03A5}\x{0313}\x{0301}", +"\xE1\xBD\x96" => "\x{03A5}\x{0313}\x{0342}", +"\xE1\xBE\x80" => 
"\x{1F08}\x{0399}", +"\xE1\xBE\x81" => "\x{1F09}\x{0399}", +"\xE1\xBE\x82" => "\x{1F0A}\x{0399}", +"\xE1\xBE\x83" => "\x{1F0B}\x{0399}", +"\xE1\xBE\x84" => "\x{1F0C}\x{0399}", +"\xE1\xBE\x85" => "\x{1F0D}\x{0399}", +"\xE1\xBE\x86" => "\x{1F0E}\x{0399}", +"\xE1\xBE\x87" => "\x{1F0F}\x{0399}", +"\xE1\xBE\x88" => "\x{1F08}\x{0399}", +"\xE1\xBE\x89" => "\x{1F09}\x{0399}", +"\xE1\xBE\x8A" => "\x{1F0A}\x{0399}", +"\xE1\xBE\x8B" => "\x{1F0B}\x{0399}", +"\xE1\xBE\x8C" => "\x{1F0C}\x{0399}", +"\xE1\xBE\x8D" => "\x{1F0D}\x{0399}", +"\xE1\xBE\x8E" => "\x{1F0E}\x{0399}", +"\xE1\xBE\x8F" => "\x{1F0F}\x{0399}", +"\xE1\xBE\x90" => "\x{1F28}\x{0399}", +"\xE1\xBE\x91" => "\x{1F29}\x{0399}", +"\xE1\xBE\x92" => "\x{1F2A}\x{0399}", +"\xE1\xBE\x93" => "\x{1F2B}\x{0399}", +"\xE1\xBE\x94" => "\x{1F2C}\x{0399}", +"\xE1\xBE\x95" => "\x{1F2D}\x{0399}", +"\xE1\xBE\x96" => "\x{1F2E}\x{0399}", +"\xE1\xBE\x97" => "\x{1F2F}\x{0399}", +"\xE1\xBE\x98" => "\x{1F28}\x{0399}", +"\xE1\xBE\x99" => "\x{1F29}\x{0399}", +"\xE1\xBE\x9A" => "\x{1F2A}\x{0399}", +"\xE1\xBE\x9B" => "\x{1F2B}\x{0399}", +"\xE1\xBE\x9C" => "\x{1F2C}\x{0399}", +"\xE1\xBE\x9D" => "\x{1F2D}\x{0399}", +"\xE1\xBE\x9E" => "\x{1F2E}\x{0399}", +"\xE1\xBE\x9F" => "\x{1F2F}\x{0399}", +"\xE1\xBE\xA0" => "\x{1F68}\x{0399}", +"\xE1\xBE\xA1" => "\x{1F69}\x{0399}", +"\xE1\xBE\xA2" => "\x{1F6A}\x{0399}", +"\xE1\xBE\xA3" => "\x{1F6B}\x{0399}", +"\xE1\xBE\xA4" => "\x{1F6C}\x{0399}", +"\xE1\xBE\xA5" => "\x{1F6D}\x{0399}", +"\xE1\xBE\xA6" => "\x{1F6E}\x{0399}", +"\xE1\xBE\xA7" => "\x{1F6F}\x{0399}", +"\xE1\xBE\xA8" => "\x{1F68}\x{0399}", +"\xE1\xBE\xA9" => "\x{1F69}\x{0399}", +"\xE1\xBE\xAA" => "\x{1F6A}\x{0399}", +"\xE1\xBE\xAB" => "\x{1F6B}\x{0399}", +"\xE1\xBE\xAC" => "\x{1F6C}\x{0399}", +"\xE1\xBE\xAD" => "\x{1F6D}\x{0399}", +"\xE1\xBE\xAE" => "\x{1F6E}\x{0399}", +"\xE1\xBE\xAF" => "\x{1F6F}\x{0399}", +"\xE1\xBE\xB2" => "\x{1FBA}\x{0399}", +"\xE1\xBE\xB3" => "\x{0391}\x{0399}", +"\xE1\xBE\xB4" => "\x{0386}\x{0399}", +"\xE1\xBE\xB6" => 
"\x{0391}\x{0342}", +"\xE1\xBE\xB7" => "\x{0391}\x{0342}\x{0399}", +"\xE1\xBE\xBC" => "\x{0391}\x{0399}", +"\xE1\xBF\x82" => "\x{1FCA}\x{0399}", +"\xE1\xBF\x83" => "\x{0397}\x{0399}", +"\xE1\xBF\x84" => "\x{0389}\x{0399}", +"\xE1\xBF\x86" => "\x{0397}\x{0342}", +"\xE1\xBF\x87" => "\x{0397}\x{0342}\x{0399}", +"\xE1\xBF\x8C" => "\x{0397}\x{0399}", +"\xE1\xBF\x92" => "\x{0399}\x{0308}\x{0300}", +"\xE1\xBF\x93" => "\x{0399}\x{0308}\x{0301}", +"\xE1\xBF\x96" => "\x{0399}\x{0342}", +"\xE1\xBF\x97" => "\x{0399}\x{0308}\x{0342}", +"\xE1\xBF\xA2" => "\x{03A5}\x{0308}\x{0300}", +"\xE1\xBF\xA3" => "\x{03A5}\x{0308}\x{0301}", +"\xE1\xBF\xA4" => "\x{03A1}\x{0313}", +"\xE1\xBF\xA6" => "\x{03A5}\x{0342}", +"\xE1\xBF\xA7" => "\x{03A5}\x{0308}\x{0342}", +"\xE1\xBF\xB2" => "\x{1FFA}\x{0399}", +"\xE1\xBF\xB3" => "\x{03A9}\x{0399}", +"\xE1\xBF\xB4" => "\x{038F}\x{0399}", +"\xE1\xBF\xB6" => "\x{03A9}\x{0342}", +"\xE1\xBF\xB7" => "\x{03A9}\x{0342}\x{0399}", +"\xE1\xBF\xBC" => "\x{03A9}\x{0399}", +"\xEF\xAC\x80" => "\x{0046}\x{0046}", +"\xEF\xAC\x81" => "\x{0046}\x{0049}", +"\xEF\xAC\x82" => "\x{0046}\x{004C}", +"\xEF\xAC\x83" => "\x{0046}\x{0046}\x{0049}", +"\xEF\xAC\x84" => "\x{0046}\x{0046}\x{004C}", +"\xEF\xAC\x85" => "\x{0053}\x{0054}", +"\xEF\xAC\x86" => "\x{0053}\x{0054}", +"\xEF\xAC\x93" => "\x{0544}\x{0546}", +"\xEF\xAC\x94" => "\x{0544}\x{0535}", +"\xEF\xAC\x95" => "\x{0544}\x{053B}", +"\xEF\xAC\x96" => "\x{054E}\x{0546}", +"\xEF\xAC\x97" => "\x{0544}\x{053D}", ); return <<'END'; diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 5fdac52dc6..09b8175cd7 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -1658,16 +1658,18 @@ sub SpecialCasing_txt() { my $NormalCase = do "To/$case.pl" || die "$0: $@\n"; - my @OUT = ( - $HEADER, "\n", - "%utf8::ToSpec$case =\n(\n", - ); + my @OUT = + ( + $HEADER, "\n", + "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n", + "%utf8::ToSpec$case =\n(\n", + ); for my $prop (sort { $a->[0] <=> $b->[0] } 
@{$CaseInfo{$case}}) { my ($ix, $code, $to) = @$prop; my $tostr = join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; - push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix; + push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix))); # Remove any single-character mappings for # the same character since we are going for # the special casing rules. @@ -1719,14 +1721,16 @@ sub CaseFolding_txt() # my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; - my @OUT = ( - $HEADER, "\n", - "%utf8::ToSpecFold =\n(\n", - ); + my @OUT = + ( + $HEADER, "\n", + "# The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n", + "%utf8::ToSpecFold =\n(\n", + ); for my $code (sort { $a <=> $b } keys %Fold) { my $foldstr = join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; - push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code; + push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code))); } push @OUT, ( ");\n\n", diff --git a/lib/utf8.pm b/lib/utf8.pm index f5eebe7fba..ea99dd9f86 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -31,9 +31,11 @@ utf8 - Perl pragma to enable/disable UTF-8 (or UTF-EBCDIC) in source code use utf8; no utf8; + # Convert a Perl scalar to/from UTF-8. $num_octets = utf8::upgrade($string); $success = utf8::downgrade($string[, FAIL_OK]); + # Change the native bytes of a Perl scalar to/from UTF-8 bytes. utf8::encode($string); utf8::decode($string); @@ -133,18 +135,23 @@ pragma. =item * utf8::encode($string) -Converts (in-place) I<$string> from logical characters to octet -sequence representing it in Perl's I<UTF-X> encoding. Returns -nothing. Same as Encode::encode_utf8(). Note that this should not be -used to convert a legacy byte encoding to Unicode: use Encode for -that. +Converts in-place the octets of the I<$string> to the octet sequence +in Perl's I<UTF-X> encoding. Returns nothing. 
B<Note that this does +not change the "type" of I<$string> to UTF-8>, and that this handles +only ISO 8859-1 (or EBCDIC) as the source character set. Therefore +this should not be used to convert a legacy 8-bit encoding to Unicode: +use Encode::decode() for that. In the very limited case of wanting to +handle just ISO 8859-1 (or EBCDIC), you could use utf8::upgrade(). =item * utf8::decode($string) Attempts to convert I<$string> in-place from Perl's I<UTF-X> encoding -into logical characters. Returns nothing. Same as Encode::decode_utf8(). -Note that this should not be used to convert Unicode back to a legacy -byte encoding: use Encode for that. +into octets. Returns nothing. B<Note that this does not change the +"type" of <$string> from UTF-8>, and that this handles only ISO 8859-1 +(or EBCDIC) as the destination character set. Therefore this should +not be used to convert Unicode back to a legacy 8-bit encoding: +use Encode::encode() for that. In the very limited case of wanting +to handle just ISO 8859-1 (or EBCDIC), you could use utf8::downgrade(). =item * $flag = utf8::is_utf8(STRING) diff --git a/lib/utf8.t b/lib/utf8.t index 33cd5966af..90035e56b3 100644 --- a/lib/utf8.t +++ b/lib/utf8.t @@ -37,7 +37,7 @@ no utf8; # Ironic, no? # # -plan tests => 143; +plan tests => 144; { # bug id 20001009.001 @@ -409,3 +409,9 @@ SKIP: { ok( utf8::is_utf8($b), " utf8::is_utf8 beyond"); # $b stays in UTF-8. ok( utf8::is_utf8($c), " utf8::is_utf8 unicode"); } + +{ + eval {utf8::encode("£")}; + like($@, qr/^Modification of a read-only value attempted/, + "utf8::encode should refuse to touch read-only values"); +} diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index 71d0e57cca..803df80759 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -265,7 +265,7 @@ C<substr()> will work on the Unicode characters; regular expressions will work on the Unicode characters (see L<perlunicode> and L<perlretut>). 
Note that Perl considers combining character sequences to be -characters, so for example +separate characters, so for example use charnames ':full'; print length("\N{LATIN CAPITAL LETTER A}\N{COMBINING ACUTE ACCENT}"), "\n"; @@ -299,8 +299,8 @@ If that variable isn't set, the encoding pragma will fail. The C<Encode> module knows about many encodings and has interfaces for doing conversions between those encodings: - use Encode 'from_to'; - from_to($data, "iso-8859-3", "utf-8"); # from legacy to utf-8 + use Encode 'decode'; + $data = decode("iso-8859-3", $data); # convert from legacy to utf-8 =head2 Unicode I/O @@ -3454,6 +3454,10 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) sv_force_normal(sv); } + if (SvREADONLY(sv)) { + Perl_croak(aTHX_ PL_no_modify); + } + if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING)) sv_recode_to_utf8(sv, PL_encoding); else { /* Assume Latin-1/EBCDIC */ diff --git a/t/op/utfhash.t b/t/op/utfhash.t index 9e0196b6b8..33909c0cbc 100644 --- a/t/op/utfhash.t +++ b/t/op/utfhash.t @@ -32,8 +32,9 @@ is($hashu{"\xff"},0xFF); is($hashu{"\x7f"},0x7F); # Now try same thing with variables forced into various forms. 
-foreach my $a ("\x7f","\xff") +foreach ("\x7f","\xff") { + my $a = $_; # Force a copy utf8::upgrade($a); is($hash8{$a},ord($a)); is($hashu{$a},ord($a)); @@ -56,8 +57,9 @@ $hash8{chr(0x1ff)} = 0x1ff; # Check we have not got an spurious extra keys is(join('',sort { ord $a <=> ord $b } keys %hash8),"\x7f\xff\x{1ff}"); -foreach my $a ("\x7f","\xff","\x{1ff}") +foreach ("\x7f","\xff","\x{1ff}") { + my $a = $_; utf8::upgrade($a); is($hash8{$a},ord($a)); my $b = $a.chr(100); @@ -69,8 +71,9 @@ foreach my $a ("\x7f","\xff","\x{1ff}") is(delete $hashu{chr(0x1ff)},0x1ff); is(join('',sort keys %hashu),"\x7f\xff"); -foreach my $a ("\x7f","\xff") +foreach ("\x7f","\xff") { + my $a = $_; utf8::upgrade($a); is($hashu{$a},ord($a)); utf8::downgrade($a); diff --git a/t/uni/case.pl b/t/uni/case.pl index b6df5a8089..0402be402f 100644 --- a/t/uni/case.pl +++ b/t/uni/case.pl @@ -20,15 +20,15 @@ sub casetest { my %seen; for my $i (sort keys %simple) { - $seen{hex $i}++; + $seen{$i}++; } print "# ", scalar keys %simple, " simple mappings\n"; my $both; for my $i (sort keys %$spec) { - if (++$seen{hex $i} == 2) { - warn "$base: $i seen twice\n"; + if (++$seen{$i} == 2) { + warn sprintf "$base: $i seen twice\n"; $both++; } } @@ -52,7 +52,7 @@ sub casetest { my $test = 1; - for my $i (sort { hex $a <=> hex $b } keys %simple) { + for my $i (sort keys %simple) { my $w = $simple{$i}; my $c = pack "U0U", hex $i; my $d = $func->($c); @@ -62,9 +62,11 @@ sub casetest { $test++; } - for my $i (sort { hex $a <=> hex $b } keys %$spec) { + for my $i (sort keys %$spec) { my $w = unidump($spec->{$i}); - my $c = pack "U0U", hex $i; + my $u = unpack "U0U", $i; + my $h = sprintf "%04X", $u; + my $c = chr($u); $c .= chr(0x100); chop $c; my $d = $func->($c); my $e = unidump($d); if (ord "A" == 193) { # EBCDIC @@ -116,7 +118,7 @@ sub casetest { # just undo our remapping. } print $w eq $e ? 
- "ok $test # $i -> $w\n" : "not ok $test # $i -> $e ($w)\n"; + "ok $test # $i -> $w\n" : "not ok $test # $h -> $e ($w)\n"; $test++; } @@ -1401,21 +1401,19 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma if (!*swashp) /* load on-demand */ *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); - if (special) { + /* The 0xDF is the only special casing Unicode code point below 0x100. */ + if (special && (uv1 == 0xDF || uv1 > 0xFF)) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ HV *hv; - SV *keysv; - HE *he; - SV *val; - - if ((hv = get_hv(special, FALSE)) && - (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) && - (he = hv_fetch_ent(hv, keysv, FALSE, 0)) && - (val = HeVAL(he))) { - char *s; + SV **svp; + + if ((hv = get_hv(special, FALSE)) && + (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && + (*svp)) { + char *s; - s = SvPV(val, len); + s = SvPV(*svp, len); if (len == 1) len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp; else { @@ -1426,7 +1424,7 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma U8 *t = (U8*)s, *tend = t + len, *d; d = tmpbuf; - if (SvUTF8(val)) { + if (SvUTF8(*svp)) { STRLEN tlen = 0; while (t < tend) { |