summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2004-03-03 11:37:21 +0200
committerNicholas Clark <nick@ccl4.org>2004-03-07 22:30:45 +0000
commit08952adc863bc4a67f8bd3d4c7da553683565739 (patch)
treebc8b5f1850b509c5e3f5bf280d76ca105aea9b20
parent8bd77110f5da47faabf6ba3a6e7ab9fd075701a8 (diff)
downloadperl-08952adc863bc4a67f8bd3d4c7da553683565739.tar.gz
Integrate:
[ 22427] Speed up the unicode case mappings (for /i, lc, uc, etc). Subject: [PATCH] [perl #24826] Message-Id: <9B5CBF96-6CE5-11D8-83B0-00039362CB92@iki.fi> [ 22430] Clarify the difference between utf8::downgrade/upgrade and utf8::encode/decode (patch by Jarkko). [ 22439] ensure utf8::encode() normalises its arg [ 22444] A small perluniintro clarification by Jarkko. [ 22463] Add a readonly check to Perl_sv_utf8_upgrade_flags, a regression test in utf8.t, and fix 3 bugs it exposed in utfhash.t p4raw-link: @22463 on //depot/perl: 5fec3b1d36062f79cb996123dc191025c139d617 p4raw-link: @22444 on //depot/perl: c0c50798904ed219d069da8d28f789f4cc6e4fb2 p4raw-link: @22439 on //depot/perl: 42b1db35bd6485f66426fa704408dc673fb999f1 p4raw-link: @22430 on //depot/perl: 2c9359a248d51da75ec39822c411d2e97fe5c631 p4raw-link: @22427 on //depot/perl: b08cf34e02b6d68e83a46f7566341e6914ff1a2e p4raw-id: //depot/maint-5.8/perl@22464 p4raw-integrated: from //depot/perl@22461 'copy in' t/uni/case.pl (@15684..) lib/unicore/To/Fold.pl lib/unicore/To/Lower.pl lib/unicore/To/Title.pl lib/unicore/To/Upper.pl (@19391..) lib/unicore/mktables (@19686..) lib/utf8.t (@19778..) t/op/utfhash.t (@20288..) lib/utf8.pm (@20878..) utf8.c (@21297..) p4raw-integrated: from //depot/perl@22439 'edit in' sv.c (@22353..) p4raw-integrated: from //depot/perl@22430 'ignore' pod/perluniintro.pod (@21198..)
-rw-r--r--lib/unicore/To/Fold.pl207
-rw-r--r--lib/unicore/To/Lower.pl57
-rw-r--r--lib/unicore/To/Title.pl151
-rw-r--r--lib/unicore/To/Upper.pl205
-rw-r--r--lib/unicore/mktables24
-rw-r--r--lib/utf8.pm23
-rw-r--r--lib/utf8.t8
-rw-r--r--pod/perluniintro.pod6
-rw-r--r--sv.c4
-rw-r--r--t/op/utfhash.t9
-rw-r--r--t/uni/case.pl16
-rw-r--r--utf8.c22
12 files changed, 380 insertions, 352 deletions
diff --git a/lib/unicore/To/Fold.pl b/lib/unicore/To/Fold.pl
index ee608336f8..33c5d466be 100644
--- a/lib/unicore/To/Fold.pl
+++ b/lib/unicore/To/Fold.pl
@@ -3,111 +3,112 @@
# Any changes made here will be lost!
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack)
%utf8::ToSpecFold =
(
-'00DF' => "\x{0073}\x{0073}",
-'0130' => "\x{0069}\x{0307}",
-'0149' => "\x{02BC}\x{006E}",
-'01F0' => "\x{006A}\x{030C}",
-'0390' => "\x{03B9}\x{0308}\x{0301}",
-'03B0' => "\x{03C5}\x{0308}\x{0301}",
-'0587' => "\x{0565}\x{0582}",
-'1E96' => "\x{0068}\x{0331}",
-'1E97' => "\x{0074}\x{0308}",
-'1E98' => "\x{0077}\x{030A}",
-'1E99' => "\x{0079}\x{030A}",
-'1E9A' => "\x{0061}\x{02BE}",
-'1F50' => "\x{03C5}\x{0313}",
-'1F52' => "\x{03C5}\x{0313}\x{0300}",
-'1F54' => "\x{03C5}\x{0313}\x{0301}",
-'1F56' => "\x{03C5}\x{0313}\x{0342}",
-'1F80' => "\x{1F00}\x{03B9}",
-'1F81' => "\x{1F01}\x{03B9}",
-'1F82' => "\x{1F02}\x{03B9}",
-'1F83' => "\x{1F03}\x{03B9}",
-'1F84' => "\x{1F04}\x{03B9}",
-'1F85' => "\x{1F05}\x{03B9}",
-'1F86' => "\x{1F06}\x{03B9}",
-'1F87' => "\x{1F07}\x{03B9}",
-'1F88' => "\x{1F00}\x{03B9}",
-'1F89' => "\x{1F01}\x{03B9}",
-'1F8A' => "\x{1F02}\x{03B9}",
-'1F8B' => "\x{1F03}\x{03B9}",
-'1F8C' => "\x{1F04}\x{03B9}",
-'1F8D' => "\x{1F05}\x{03B9}",
-'1F8E' => "\x{1F06}\x{03B9}",
-'1F8F' => "\x{1F07}\x{03B9}",
-'1F90' => "\x{1F20}\x{03B9}",
-'1F91' => "\x{1F21}\x{03B9}",
-'1F92' => "\x{1F22}\x{03B9}",
-'1F93' => "\x{1F23}\x{03B9}",
-'1F94' => "\x{1F24}\x{03B9}",
-'1F95' => "\x{1F25}\x{03B9}",
-'1F96' => "\x{1F26}\x{03B9}",
-'1F97' => "\x{1F27}\x{03B9}",
-'1F98' => "\x{1F20}\x{03B9}",
-'1F99' => "\x{1F21}\x{03B9}",
-'1F9A' => "\x{1F22}\x{03B9}",
-'1F9B' => "\x{1F23}\x{03B9}",
-'1F9C' => "\x{1F24}\x{03B9}",
-'1F9D' => "\x{1F25}\x{03B9}",
-'1F9E' => "\x{1F26}\x{03B9}",
-'1F9F' => "\x{1F27}\x{03B9}",
-'1FA0' => "\x{1F60}\x{03B9}",
-'1FA1' => "\x{1F61}\x{03B9}",
-'1FA2' => "\x{1F62}\x{03B9}",
-'1FA3' => "\x{1F63}\x{03B9}",
-'1FA4' => "\x{1F64}\x{03B9}",
-'1FA5' => "\x{1F65}\x{03B9}",
-'1FA6' => "\x{1F66}\x{03B9}",
-'1FA7' => "\x{1F67}\x{03B9}",
-'1FA8' => "\x{1F60}\x{03B9}",
-'1FA9' => "\x{1F61}\x{03B9}",
-'1FAA' => "\x{1F62}\x{03B9}",
-'1FAB' => "\x{1F63}\x{03B9}",
-'1FAC' => "\x{1F64}\x{03B9}",
-'1FAD' => "\x{1F65}\x{03B9}",
-'1FAE' => "\x{1F66}\x{03B9}",
-'1FAF' => "\x{1F67}\x{03B9}",
-'1FB2' => "\x{1F70}\x{03B9}",
-'1FB3' => "\x{03B1}\x{03B9}",
-'1FB4' => "\x{03AC}\x{03B9}",
-'1FB6' => "\x{03B1}\x{0342}",
-'1FB7' => "\x{03B1}\x{0342}\x{03B9}",
-'1FBC' => "\x{03B1}\x{03B9}",
-'1FC2' => "\x{1F74}\x{03B9}",
-'1FC3' => "\x{03B7}\x{03B9}",
-'1FC4' => "\x{03AE}\x{03B9}",
-'1FC6' => "\x{03B7}\x{0342}",
-'1FC7' => "\x{03B7}\x{0342}\x{03B9}",
-'1FCC' => "\x{03B7}\x{03B9}",
-'1FD2' => "\x{03B9}\x{0308}\x{0300}",
-'1FD3' => "\x{03B9}\x{0308}\x{0301}",
-'1FD6' => "\x{03B9}\x{0342}",
-'1FD7' => "\x{03B9}\x{0308}\x{0342}",
-'1FE2' => "\x{03C5}\x{0308}\x{0300}",
-'1FE3' => "\x{03C5}\x{0308}\x{0301}",
-'1FE4' => "\x{03C1}\x{0313}",
-'1FE6' => "\x{03C5}\x{0342}",
-'1FE7' => "\x{03C5}\x{0308}\x{0342}",
-'1FF2' => "\x{1F7C}\x{03B9}",
-'1FF3' => "\x{03C9}\x{03B9}",
-'1FF4' => "\x{03CE}\x{03B9}",
-'1FF6' => "\x{03C9}\x{0342}",
-'1FF7' => "\x{03C9}\x{0342}\x{03B9}",
-'1FFC' => "\x{03C9}\x{03B9}",
-'FB00' => "\x{0066}\x{0066}",
-'FB01' => "\x{0066}\x{0069}",
-'FB02' => "\x{0066}\x{006C}",
-'FB03' => "\x{0066}\x{0066}\x{0069}",
-'FB04' => "\x{0066}\x{0066}\x{006C}",
-'FB05' => "\x{0073}\x{0074}",
-'FB06' => "\x{0073}\x{0074}",
-'FB13' => "\x{0574}\x{0576}",
-'FB14' => "\x{0574}\x{0565}",
-'FB15' => "\x{0574}\x{056B}",
-'FB16' => "\x{057E}\x{0576}",
-'FB17' => "\x{0574}\x{056D}",
+"\xC3\x9F" => "\x{0073}\x{0073}",
+"\xC4\xB0" => "\x{0069}\x{0307}",
+"\xC5\x89" => "\x{02BC}\x{006E}",
+"\xC7\xB0" => "\x{006A}\x{030C}",
+"\xCE\x90" => "\x{03B9}\x{0308}\x{0301}",
+"\xCE\xB0" => "\x{03C5}\x{0308}\x{0301}",
+"\xD6\x87" => "\x{0565}\x{0582}",
+"\xE1\xBA\x96" => "\x{0068}\x{0331}",
+"\xE1\xBA\x97" => "\x{0074}\x{0308}",
+"\xE1\xBA\x98" => "\x{0077}\x{030A}",
+"\xE1\xBA\x99" => "\x{0079}\x{030A}",
+"\xE1\xBA\x9A" => "\x{0061}\x{02BE}",
+"\xE1\xBD\x90" => "\x{03C5}\x{0313}",
+"\xE1\xBD\x92" => "\x{03C5}\x{0313}\x{0300}",
+"\xE1\xBD\x94" => "\x{03C5}\x{0313}\x{0301}",
+"\xE1\xBD\x96" => "\x{03C5}\x{0313}\x{0342}",
+"\xE1\xBE\x80" => "\x{1F00}\x{03B9}",
+"\xE1\xBE\x81" => "\x{1F01}\x{03B9}",
+"\xE1\xBE\x82" => "\x{1F02}\x{03B9}",
+"\xE1\xBE\x83" => "\x{1F03}\x{03B9}",
+"\xE1\xBE\x84" => "\x{1F04}\x{03B9}",
+"\xE1\xBE\x85" => "\x{1F05}\x{03B9}",
+"\xE1\xBE\x86" => "\x{1F06}\x{03B9}",
+"\xE1\xBE\x87" => "\x{1F07}\x{03B9}",
+"\xE1\xBE\x88" => "\x{1F00}\x{03B9}",
+"\xE1\xBE\x89" => "\x{1F01}\x{03B9}",
+"\xE1\xBE\x8A" => "\x{1F02}\x{03B9}",
+"\xE1\xBE\x8B" => "\x{1F03}\x{03B9}",
+"\xE1\xBE\x8C" => "\x{1F04}\x{03B9}",
+"\xE1\xBE\x8D" => "\x{1F05}\x{03B9}",
+"\xE1\xBE\x8E" => "\x{1F06}\x{03B9}",
+"\xE1\xBE\x8F" => "\x{1F07}\x{03B9}",
+"\xE1\xBE\x90" => "\x{1F20}\x{03B9}",
+"\xE1\xBE\x91" => "\x{1F21}\x{03B9}",
+"\xE1\xBE\x92" => "\x{1F22}\x{03B9}",
+"\xE1\xBE\x93" => "\x{1F23}\x{03B9}",
+"\xE1\xBE\x94" => "\x{1F24}\x{03B9}",
+"\xE1\xBE\x95" => "\x{1F25}\x{03B9}",
+"\xE1\xBE\x96" => "\x{1F26}\x{03B9}",
+"\xE1\xBE\x97" => "\x{1F27}\x{03B9}",
+"\xE1\xBE\x98" => "\x{1F20}\x{03B9}",
+"\xE1\xBE\x99" => "\x{1F21}\x{03B9}",
+"\xE1\xBE\x9A" => "\x{1F22}\x{03B9}",
+"\xE1\xBE\x9B" => "\x{1F23}\x{03B9}",
+"\xE1\xBE\x9C" => "\x{1F24}\x{03B9}",
+"\xE1\xBE\x9D" => "\x{1F25}\x{03B9}",
+"\xE1\xBE\x9E" => "\x{1F26}\x{03B9}",
+"\xE1\xBE\x9F" => "\x{1F27}\x{03B9}",
+"\xE1\xBE\xA0" => "\x{1F60}\x{03B9}",
+"\xE1\xBE\xA1" => "\x{1F61}\x{03B9}",
+"\xE1\xBE\xA2" => "\x{1F62}\x{03B9}",
+"\xE1\xBE\xA3" => "\x{1F63}\x{03B9}",
+"\xE1\xBE\xA4" => "\x{1F64}\x{03B9}",
+"\xE1\xBE\xA5" => "\x{1F65}\x{03B9}",
+"\xE1\xBE\xA6" => "\x{1F66}\x{03B9}",
+"\xE1\xBE\xA7" => "\x{1F67}\x{03B9}",
+"\xE1\xBE\xA8" => "\x{1F60}\x{03B9}",
+"\xE1\xBE\xA9" => "\x{1F61}\x{03B9}",
+"\xE1\xBE\xAA" => "\x{1F62}\x{03B9}",
+"\xE1\xBE\xAB" => "\x{1F63}\x{03B9}",
+"\xE1\xBE\xAC" => "\x{1F64}\x{03B9}",
+"\xE1\xBE\xAD" => "\x{1F65}\x{03B9}",
+"\xE1\xBE\xAE" => "\x{1F66}\x{03B9}",
+"\xE1\xBE\xAF" => "\x{1F67}\x{03B9}",
+"\xE1\xBE\xB2" => "\x{1F70}\x{03B9}",
+"\xE1\xBE\xB3" => "\x{03B1}\x{03B9}",
+"\xE1\xBE\xB4" => "\x{03AC}\x{03B9}",
+"\xE1\xBE\xB6" => "\x{03B1}\x{0342}",
+"\xE1\xBE\xB7" => "\x{03B1}\x{0342}\x{03B9}",
+"\xE1\xBE\xBC" => "\x{03B1}\x{03B9}",
+"\xE1\xBF\x82" => "\x{1F74}\x{03B9}",
+"\xE1\xBF\x83" => "\x{03B7}\x{03B9}",
+"\xE1\xBF\x84" => "\x{03AE}\x{03B9}",
+"\xE1\xBF\x86" => "\x{03B7}\x{0342}",
+"\xE1\xBF\x87" => "\x{03B7}\x{0342}\x{03B9}",
+"\xE1\xBF\x8C" => "\x{03B7}\x{03B9}",
+"\xE1\xBF\x92" => "\x{03B9}\x{0308}\x{0300}",
+"\xE1\xBF\x93" => "\x{03B9}\x{0308}\x{0301}",
+"\xE1\xBF\x96" => "\x{03B9}\x{0342}",
+"\xE1\xBF\x97" => "\x{03B9}\x{0308}\x{0342}",
+"\xE1\xBF\xA2" => "\x{03C5}\x{0308}\x{0300}",
+"\xE1\xBF\xA3" => "\x{03C5}\x{0308}\x{0301}",
+"\xE1\xBF\xA4" => "\x{03C1}\x{0313}",
+"\xE1\xBF\xA6" => "\x{03C5}\x{0342}",
+"\xE1\xBF\xA7" => "\x{03C5}\x{0308}\x{0342}",
+"\xE1\xBF\xB2" => "\x{1F7C}\x{03B9}",
+"\xE1\xBF\xB3" => "\x{03C9}\x{03B9}",
+"\xE1\xBF\xB4" => "\x{03CE}\x{03B9}",
+"\xE1\xBF\xB6" => "\x{03C9}\x{0342}",
+"\xE1\xBF\xB7" => "\x{03C9}\x{0342}\x{03B9}",
+"\xE1\xBF\xBC" => "\x{03C9}\x{03B9}",
+"\xEF\xAC\x80" => "\x{0066}\x{0066}",
+"\xEF\xAC\x81" => "\x{0066}\x{0069}",
+"\xEF\xAC\x82" => "\x{0066}\x{006C}",
+"\xEF\xAC\x83" => "\x{0066}\x{0066}\x{0069}",
+"\xEF\xAC\x84" => "\x{0066}\x{0066}\x{006C}",
+"\xEF\xAC\x85" => "\x{0073}\x{0074}",
+"\xEF\xAC\x86" => "\x{0073}\x{0074}",
+"\xEF\xAC\x93" => "\x{0574}\x{0576}",
+"\xEF\xAC\x94" => "\x{0574}\x{0565}",
+"\xEF\xAC\x95" => "\x{0574}\x{056B}",
+"\xEF\xAC\x96" => "\x{057E}\x{0576}",
+"\xEF\xAC\x97" => "\x{0574}\x{056D}",
);
return <<'END';
diff --git a/lib/unicore/To/Lower.pl b/lib/unicore/To/Lower.pl
index 8bf064ad80..7e5adfeede 100644
--- a/lib/unicore/To/Lower.pl
+++ b/lib/unicore/To/Lower.pl
@@ -3,36 +3,37 @@
# Any changes made here will be lost!
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack)
%utf8::ToSpecLower =
(
-'0130' => "\x{0069}\x{0307}",
-'1F88' => "\x{1F80}",
-'1F89' => "\x{1F81}",
-'1F8A' => "\x{1F82}",
-'1F8B' => "\x{1F83}",
-'1F8C' => "\x{1F84}",
-'1F8D' => "\x{1F85}",
-'1F8E' => "\x{1F86}",
-'1F8F' => "\x{1F87}",
-'1F98' => "\x{1F90}",
-'1F99' => "\x{1F91}",
-'1F9A' => "\x{1F92}",
-'1F9B' => "\x{1F93}",
-'1F9C' => "\x{1F94}",
-'1F9D' => "\x{1F95}",
-'1F9E' => "\x{1F96}",
-'1F9F' => "\x{1F97}",
-'1FA8' => "\x{1FA0}",
-'1FA9' => "\x{1FA1}",
-'1FAA' => "\x{1FA2}",
-'1FAB' => "\x{1FA3}",
-'1FAC' => "\x{1FA4}",
-'1FAD' => "\x{1FA5}",
-'1FAE' => "\x{1FA6}",
-'1FAF' => "\x{1FA7}",
-'1FBC' => "\x{1FB3}",
-'1FCC' => "\x{1FC3}",
-'1FFC' => "\x{1FF3}",
+"\xC4\xB0" => "\x{0069}\x{0307}",
+"\xE1\xBE\x88" => "\x{1F80}",
+"\xE1\xBE\x89" => "\x{1F81}",
+"\xE1\xBE\x8A" => "\x{1F82}",
+"\xE1\xBE\x8B" => "\x{1F83}",
+"\xE1\xBE\x8C" => "\x{1F84}",
+"\xE1\xBE\x8D" => "\x{1F85}",
+"\xE1\xBE\x8E" => "\x{1F86}",
+"\xE1\xBE\x8F" => "\x{1F87}",
+"\xE1\xBE\x98" => "\x{1F90}",
+"\xE1\xBE\x99" => "\x{1F91}",
+"\xE1\xBE\x9A" => "\x{1F92}",
+"\xE1\xBE\x9B" => "\x{1F93}",
+"\xE1\xBE\x9C" => "\x{1F94}",
+"\xE1\xBE\x9D" => "\x{1F95}",
+"\xE1\xBE\x9E" => "\x{1F96}",
+"\xE1\xBE\x9F" => "\x{1F97}",
+"\xE1\xBE\xA8" => "\x{1FA0}",
+"\xE1\xBE\xA9" => "\x{1FA1}",
+"\xE1\xBE\xAA" => "\x{1FA2}",
+"\xE1\xBE\xAB" => "\x{1FA3}",
+"\xE1\xBE\xAC" => "\x{1FA4}",
+"\xE1\xBE\xAD" => "\x{1FA5}",
+"\xE1\xBE\xAE" => "\x{1FA6}",
+"\xE1\xBE\xAF" => "\x{1FA7}",
+"\xE1\xBE\xBC" => "\x{1FB3}",
+"\xE1\xBF\x8C" => "\x{1FC3}",
+"\xE1\xBF\xBC" => "\x{1FF3}",
);
return <<'END';
diff --git a/lib/unicore/To/Title.pl b/lib/unicore/To/Title.pl
index cfeccabcdc..2223f7bafc 100644
--- a/lib/unicore/To/Title.pl
+++ b/lib/unicore/To/Title.pl
@@ -3,83 +3,84 @@
# Any changes made here will be lost!
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack)
%utf8::ToSpecTitle =
(
-'00DF' => "\x{0053}\x{0073}",
-'0149' => "\x{02BC}\x{004E}",
-'01F0' => "\x{004A}\x{030C}",
-'0390' => "\x{0399}\x{0308}\x{0301}",
-'03B0' => "\x{03A5}\x{0308}\x{0301}",
-'0587' => "\x{0535}\x{0582}",
-'1E96' => "\x{0048}\x{0331}",
-'1E97' => "\x{0054}\x{0308}",
-'1E98' => "\x{0057}\x{030A}",
-'1E99' => "\x{0059}\x{030A}",
-'1E9A' => "\x{0041}\x{02BE}",
-'1F50' => "\x{03A5}\x{0313}",
-'1F52' => "\x{03A5}\x{0313}\x{0300}",
-'1F54' => "\x{03A5}\x{0313}\x{0301}",
-'1F56' => "\x{03A5}\x{0313}\x{0342}",
-'1F80' => "\x{1F88}",
-'1F81' => "\x{1F89}",
-'1F82' => "\x{1F8A}",
-'1F83' => "\x{1F8B}",
-'1F84' => "\x{1F8C}",
-'1F85' => "\x{1F8D}",
-'1F86' => "\x{1F8E}",
-'1F87' => "\x{1F8F}",
-'1F90' => "\x{1F98}",
-'1F91' => "\x{1F99}",
-'1F92' => "\x{1F9A}",
-'1F93' => "\x{1F9B}",
-'1F94' => "\x{1F9C}",
-'1F95' => "\x{1F9D}",
-'1F96' => "\x{1F9E}",
-'1F97' => "\x{1F9F}",
-'1FA0' => "\x{1FA8}",
-'1FA1' => "\x{1FA9}",
-'1FA2' => "\x{1FAA}",
-'1FA3' => "\x{1FAB}",
-'1FA4' => "\x{1FAC}",
-'1FA5' => "\x{1FAD}",
-'1FA6' => "\x{1FAE}",
-'1FA7' => "\x{1FAF}",
-'1FB2' => "\x{1FBA}\x{0345}",
-'1FB3' => "\x{1FBC}",
-'1FB4' => "\x{0386}\x{0345}",
-'1FB6' => "\x{0391}\x{0342}",
-'1FB7' => "\x{0391}\x{0342}\x{0345}",
-'1FC2' => "\x{1FCA}\x{0345}",
-'1FC3' => "\x{1FCC}",
-'1FC4' => "\x{0389}\x{0345}",
-'1FC6' => "\x{0397}\x{0342}",
-'1FC7' => "\x{0397}\x{0342}\x{0345}",
-'1FD2' => "\x{0399}\x{0308}\x{0300}",
-'1FD3' => "\x{0399}\x{0308}\x{0301}",
-'1FD6' => "\x{0399}\x{0342}",
-'1FD7' => "\x{0399}\x{0308}\x{0342}",
-'1FE2' => "\x{03A5}\x{0308}\x{0300}",
-'1FE3' => "\x{03A5}\x{0308}\x{0301}",
-'1FE4' => "\x{03A1}\x{0313}",
-'1FE6' => "\x{03A5}\x{0342}",
-'1FE7' => "\x{03A5}\x{0308}\x{0342}",
-'1FF2' => "\x{1FFA}\x{0345}",
-'1FF3' => "\x{1FFC}",
-'1FF4' => "\x{038F}\x{0345}",
-'1FF6' => "\x{03A9}\x{0342}",
-'1FF7' => "\x{03A9}\x{0342}\x{0345}",
-'FB00' => "\x{0046}\x{0066}",
-'FB01' => "\x{0046}\x{0069}",
-'FB02' => "\x{0046}\x{006C}",
-'FB03' => "\x{0046}\x{0066}\x{0069}",
-'FB04' => "\x{0046}\x{0066}\x{006C}",
-'FB05' => "\x{0053}\x{0074}",
-'FB06' => "\x{0053}\x{0074}",
-'FB13' => "\x{0544}\x{0576}",
-'FB14' => "\x{0544}\x{0565}",
-'FB15' => "\x{0544}\x{056B}",
-'FB16' => "\x{054E}\x{0576}",
-'FB17' => "\x{0544}\x{056D}",
+"\xC3\x9F" => "\x{0053}\x{0073}",
+"\xC5\x89" => "\x{02BC}\x{004E}",
+"\xC7\xB0" => "\x{004A}\x{030C}",
+"\xCE\x90" => "\x{0399}\x{0308}\x{0301}",
+"\xCE\xB0" => "\x{03A5}\x{0308}\x{0301}",
+"\xD6\x87" => "\x{0535}\x{0582}",
+"\xE1\xBA\x96" => "\x{0048}\x{0331}",
+"\xE1\xBA\x97" => "\x{0054}\x{0308}",
+"\xE1\xBA\x98" => "\x{0057}\x{030A}",
+"\xE1\xBA\x99" => "\x{0059}\x{030A}",
+"\xE1\xBA\x9A" => "\x{0041}\x{02BE}",
+"\xE1\xBD\x90" => "\x{03A5}\x{0313}",
+"\xE1\xBD\x92" => "\x{03A5}\x{0313}\x{0300}",
+"\xE1\xBD\x94" => "\x{03A5}\x{0313}\x{0301}",
+"\xE1\xBD\x96" => "\x{03A5}\x{0313}\x{0342}",
+"\xE1\xBE\x80" => "\x{1F88}",
+"\xE1\xBE\x81" => "\x{1F89}",
+"\xE1\xBE\x82" => "\x{1F8A}",
+"\xE1\xBE\x83" => "\x{1F8B}",
+"\xE1\xBE\x84" => "\x{1F8C}",
+"\xE1\xBE\x85" => "\x{1F8D}",
+"\xE1\xBE\x86" => "\x{1F8E}",
+"\xE1\xBE\x87" => "\x{1F8F}",
+"\xE1\xBE\x90" => "\x{1F98}",
+"\xE1\xBE\x91" => "\x{1F99}",
+"\xE1\xBE\x92" => "\x{1F9A}",
+"\xE1\xBE\x93" => "\x{1F9B}",
+"\xE1\xBE\x94" => "\x{1F9C}",
+"\xE1\xBE\x95" => "\x{1F9D}",
+"\xE1\xBE\x96" => "\x{1F9E}",
+"\xE1\xBE\x97" => "\x{1F9F}",
+"\xE1\xBE\xA0" => "\x{1FA8}",
+"\xE1\xBE\xA1" => "\x{1FA9}",
+"\xE1\xBE\xA2" => "\x{1FAA}",
+"\xE1\xBE\xA3" => "\x{1FAB}",
+"\xE1\xBE\xA4" => "\x{1FAC}",
+"\xE1\xBE\xA5" => "\x{1FAD}",
+"\xE1\xBE\xA6" => "\x{1FAE}",
+"\xE1\xBE\xA7" => "\x{1FAF}",
+"\xE1\xBE\xB2" => "\x{1FBA}\x{0345}",
+"\xE1\xBE\xB3" => "\x{1FBC}",
+"\xE1\xBE\xB4" => "\x{0386}\x{0345}",
+"\xE1\xBE\xB6" => "\x{0391}\x{0342}",
+"\xE1\xBE\xB7" => "\x{0391}\x{0342}\x{0345}",
+"\xE1\xBF\x82" => "\x{1FCA}\x{0345}",
+"\xE1\xBF\x83" => "\x{1FCC}",
+"\xE1\xBF\x84" => "\x{0389}\x{0345}",
+"\xE1\xBF\x86" => "\x{0397}\x{0342}",
+"\xE1\xBF\x87" => "\x{0397}\x{0342}\x{0345}",
+"\xE1\xBF\x92" => "\x{0399}\x{0308}\x{0300}",
+"\xE1\xBF\x93" => "\x{0399}\x{0308}\x{0301}",
+"\xE1\xBF\x96" => "\x{0399}\x{0342}",
+"\xE1\xBF\x97" => "\x{0399}\x{0308}\x{0342}",
+"\xE1\xBF\xA2" => "\x{03A5}\x{0308}\x{0300}",
+"\xE1\xBF\xA3" => "\x{03A5}\x{0308}\x{0301}",
+"\xE1\xBF\xA4" => "\x{03A1}\x{0313}",
+"\xE1\xBF\xA6" => "\x{03A5}\x{0342}",
+"\xE1\xBF\xA7" => "\x{03A5}\x{0308}\x{0342}",
+"\xE1\xBF\xB2" => "\x{1FFA}\x{0345}",
+"\xE1\xBF\xB3" => "\x{1FFC}",
+"\xE1\xBF\xB4" => "\x{038F}\x{0345}",
+"\xE1\xBF\xB6" => "\x{03A9}\x{0342}",
+"\xE1\xBF\xB7" => "\x{03A9}\x{0342}\x{0345}",
+"\xEF\xAC\x80" => "\x{0046}\x{0066}",
+"\xEF\xAC\x81" => "\x{0046}\x{0069}",
+"\xEF\xAC\x82" => "\x{0046}\x{006C}",
+"\xEF\xAC\x83" => "\x{0046}\x{0066}\x{0069}",
+"\xEF\xAC\x84" => "\x{0046}\x{0066}\x{006C}",
+"\xEF\xAC\x85" => "\x{0053}\x{0074}",
+"\xEF\xAC\x86" => "\x{0053}\x{0074}",
+"\xEF\xAC\x93" => "\x{0544}\x{0576}",
+"\xEF\xAC\x94" => "\x{0544}\x{0565}",
+"\xEF\xAC\x95" => "\x{0544}\x{056B}",
+"\xEF\xAC\x96" => "\x{054E}\x{0576}",
+"\xEF\xAC\x97" => "\x{0544}\x{056D}",
);
return <<'END';
diff --git a/lib/unicore/To/Upper.pl b/lib/unicore/To/Upper.pl
index 1a42789016..6c8fd30577 100644
--- a/lib/unicore/To/Upper.pl
+++ b/lib/unicore/To/Upper.pl
@@ -3,110 +3,111 @@
# Any changes made here will be lost!
+# The key UTF-8 _bytes_, the value UTF-8 (speed hack)
%utf8::ToSpecUpper =
(
-'00DF' => "\x{0053}\x{0053}",
-'0149' => "\x{02BC}\x{004E}",
-'01F0' => "\x{004A}\x{030C}",
-'0390' => "\x{0399}\x{0308}\x{0301}",
-'03B0' => "\x{03A5}\x{0308}\x{0301}",
-'0587' => "\x{0535}\x{0552}",
-'1E96' => "\x{0048}\x{0331}",
-'1E97' => "\x{0054}\x{0308}",
-'1E98' => "\x{0057}\x{030A}",
-'1E99' => "\x{0059}\x{030A}",
-'1E9A' => "\x{0041}\x{02BE}",
-'1F50' => "\x{03A5}\x{0313}",
-'1F52' => "\x{03A5}\x{0313}\x{0300}",
-'1F54' => "\x{03A5}\x{0313}\x{0301}",
-'1F56' => "\x{03A5}\x{0313}\x{0342}",
-'1F80' => "\x{1F08}\x{0399}",
-'1F81' => "\x{1F09}\x{0399}",
-'1F82' => "\x{1F0A}\x{0399}",
-'1F83' => "\x{1F0B}\x{0399}",
-'1F84' => "\x{1F0C}\x{0399}",
-'1F85' => "\x{1F0D}\x{0399}",
-'1F86' => "\x{1F0E}\x{0399}",
-'1F87' => "\x{1F0F}\x{0399}",
-'1F88' => "\x{1F08}\x{0399}",
-'1F89' => "\x{1F09}\x{0399}",
-'1F8A' => "\x{1F0A}\x{0399}",
-'1F8B' => "\x{1F0B}\x{0399}",
-'1F8C' => "\x{1F0C}\x{0399}",
-'1F8D' => "\x{1F0D}\x{0399}",
-'1F8E' => "\x{1F0E}\x{0399}",
-'1F8F' => "\x{1F0F}\x{0399}",
-'1F90' => "\x{1F28}\x{0399}",
-'1F91' => "\x{1F29}\x{0399}",
-'1F92' => "\x{1F2A}\x{0399}",
-'1F93' => "\x{1F2B}\x{0399}",
-'1F94' => "\x{1F2C}\x{0399}",
-'1F95' => "\x{1F2D}\x{0399}",
-'1F96' => "\x{1F2E}\x{0399}",
-'1F97' => "\x{1F2F}\x{0399}",
-'1F98' => "\x{1F28}\x{0399}",
-'1F99' => "\x{1F29}\x{0399}",
-'1F9A' => "\x{1F2A}\x{0399}",
-'1F9B' => "\x{1F2B}\x{0399}",
-'1F9C' => "\x{1F2C}\x{0399}",
-'1F9D' => "\x{1F2D}\x{0399}",
-'1F9E' => "\x{1F2E}\x{0399}",
-'1F9F' => "\x{1F2F}\x{0399}",
-'1FA0' => "\x{1F68}\x{0399}",
-'1FA1' => "\x{1F69}\x{0399}",
-'1FA2' => "\x{1F6A}\x{0399}",
-'1FA3' => "\x{1F6B}\x{0399}",
-'1FA4' => "\x{1F6C}\x{0399}",
-'1FA5' => "\x{1F6D}\x{0399}",
-'1FA6' => "\x{1F6E}\x{0399}",
-'1FA7' => "\x{1F6F}\x{0399}",
-'1FA8' => "\x{1F68}\x{0399}",
-'1FA9' => "\x{1F69}\x{0399}",
-'1FAA' => "\x{1F6A}\x{0399}",
-'1FAB' => "\x{1F6B}\x{0399}",
-'1FAC' => "\x{1F6C}\x{0399}",
-'1FAD' => "\x{1F6D}\x{0399}",
-'1FAE' => "\x{1F6E}\x{0399}",
-'1FAF' => "\x{1F6F}\x{0399}",
-'1FB2' => "\x{1FBA}\x{0399}",
-'1FB3' => "\x{0391}\x{0399}",
-'1FB4' => "\x{0386}\x{0399}",
-'1FB6' => "\x{0391}\x{0342}",
-'1FB7' => "\x{0391}\x{0342}\x{0399}",
-'1FBC' => "\x{0391}\x{0399}",
-'1FC2' => "\x{1FCA}\x{0399}",
-'1FC3' => "\x{0397}\x{0399}",
-'1FC4' => "\x{0389}\x{0399}",
-'1FC6' => "\x{0397}\x{0342}",
-'1FC7' => "\x{0397}\x{0342}\x{0399}",
-'1FCC' => "\x{0397}\x{0399}",
-'1FD2' => "\x{0399}\x{0308}\x{0300}",
-'1FD3' => "\x{0399}\x{0308}\x{0301}",
-'1FD6' => "\x{0399}\x{0342}",
-'1FD7' => "\x{0399}\x{0308}\x{0342}",
-'1FE2' => "\x{03A5}\x{0308}\x{0300}",
-'1FE3' => "\x{03A5}\x{0308}\x{0301}",
-'1FE4' => "\x{03A1}\x{0313}",
-'1FE6' => "\x{03A5}\x{0342}",
-'1FE7' => "\x{03A5}\x{0308}\x{0342}",
-'1FF2' => "\x{1FFA}\x{0399}",
-'1FF3' => "\x{03A9}\x{0399}",
-'1FF4' => "\x{038F}\x{0399}",
-'1FF6' => "\x{03A9}\x{0342}",
-'1FF7' => "\x{03A9}\x{0342}\x{0399}",
-'1FFC' => "\x{03A9}\x{0399}",
-'FB00' => "\x{0046}\x{0046}",
-'FB01' => "\x{0046}\x{0049}",
-'FB02' => "\x{0046}\x{004C}",
-'FB03' => "\x{0046}\x{0046}\x{0049}",
-'FB04' => "\x{0046}\x{0046}\x{004C}",
-'FB05' => "\x{0053}\x{0054}",
-'FB06' => "\x{0053}\x{0054}",
-'FB13' => "\x{0544}\x{0546}",
-'FB14' => "\x{0544}\x{0535}",
-'FB15' => "\x{0544}\x{053B}",
-'FB16' => "\x{054E}\x{0546}",
-'FB17' => "\x{0544}\x{053D}",
+"\xC3\x9F" => "\x{0053}\x{0053}",
+"\xC5\x89" => "\x{02BC}\x{004E}",
+"\xC7\xB0" => "\x{004A}\x{030C}",
+"\xCE\x90" => "\x{0399}\x{0308}\x{0301}",
+"\xCE\xB0" => "\x{03A5}\x{0308}\x{0301}",
+"\xD6\x87" => "\x{0535}\x{0552}",
+"\xE1\xBA\x96" => "\x{0048}\x{0331}",
+"\xE1\xBA\x97" => "\x{0054}\x{0308}",
+"\xE1\xBA\x98" => "\x{0057}\x{030A}",
+"\xE1\xBA\x99" => "\x{0059}\x{030A}",
+"\xE1\xBA\x9A" => "\x{0041}\x{02BE}",
+"\xE1\xBD\x90" => "\x{03A5}\x{0313}",
+"\xE1\xBD\x92" => "\x{03A5}\x{0313}\x{0300}",
+"\xE1\xBD\x94" => "\x{03A5}\x{0313}\x{0301}",
+"\xE1\xBD\x96" => "\x{03A5}\x{0313}\x{0342}",
+"\xE1\xBE\x80" => "\x{1F08}\x{0399}",
+"\xE1\xBE\x81" => "\x{1F09}\x{0399}",
+"\xE1\xBE\x82" => "\x{1F0A}\x{0399}",
+"\xE1\xBE\x83" => "\x{1F0B}\x{0399}",
+"\xE1\xBE\x84" => "\x{1F0C}\x{0399}",
+"\xE1\xBE\x85" => "\x{1F0D}\x{0399}",
+"\xE1\xBE\x86" => "\x{1F0E}\x{0399}",
+"\xE1\xBE\x87" => "\x{1F0F}\x{0399}",
+"\xE1\xBE\x88" => "\x{1F08}\x{0399}",
+"\xE1\xBE\x89" => "\x{1F09}\x{0399}",
+"\xE1\xBE\x8A" => "\x{1F0A}\x{0399}",
+"\xE1\xBE\x8B" => "\x{1F0B}\x{0399}",
+"\xE1\xBE\x8C" => "\x{1F0C}\x{0399}",
+"\xE1\xBE\x8D" => "\x{1F0D}\x{0399}",
+"\xE1\xBE\x8E" => "\x{1F0E}\x{0399}",
+"\xE1\xBE\x8F" => "\x{1F0F}\x{0399}",
+"\xE1\xBE\x90" => "\x{1F28}\x{0399}",
+"\xE1\xBE\x91" => "\x{1F29}\x{0399}",
+"\xE1\xBE\x92" => "\x{1F2A}\x{0399}",
+"\xE1\xBE\x93" => "\x{1F2B}\x{0399}",
+"\xE1\xBE\x94" => "\x{1F2C}\x{0399}",
+"\xE1\xBE\x95" => "\x{1F2D}\x{0399}",
+"\xE1\xBE\x96" => "\x{1F2E}\x{0399}",
+"\xE1\xBE\x97" => "\x{1F2F}\x{0399}",
+"\xE1\xBE\x98" => "\x{1F28}\x{0399}",
+"\xE1\xBE\x99" => "\x{1F29}\x{0399}",
+"\xE1\xBE\x9A" => "\x{1F2A}\x{0399}",
+"\xE1\xBE\x9B" => "\x{1F2B}\x{0399}",
+"\xE1\xBE\x9C" => "\x{1F2C}\x{0399}",
+"\xE1\xBE\x9D" => "\x{1F2D}\x{0399}",
+"\xE1\xBE\x9E" => "\x{1F2E}\x{0399}",
+"\xE1\xBE\x9F" => "\x{1F2F}\x{0399}",
+"\xE1\xBE\xA0" => "\x{1F68}\x{0399}",
+"\xE1\xBE\xA1" => "\x{1F69}\x{0399}",
+"\xE1\xBE\xA2" => "\x{1F6A}\x{0399}",
+"\xE1\xBE\xA3" => "\x{1F6B}\x{0399}",
+"\xE1\xBE\xA4" => "\x{1F6C}\x{0399}",
+"\xE1\xBE\xA5" => "\x{1F6D}\x{0399}",
+"\xE1\xBE\xA6" => "\x{1F6E}\x{0399}",
+"\xE1\xBE\xA7" => "\x{1F6F}\x{0399}",
+"\xE1\xBE\xA8" => "\x{1F68}\x{0399}",
+"\xE1\xBE\xA9" => "\x{1F69}\x{0399}",
+"\xE1\xBE\xAA" => "\x{1F6A}\x{0399}",
+"\xE1\xBE\xAB" => "\x{1F6B}\x{0399}",
+"\xE1\xBE\xAC" => "\x{1F6C}\x{0399}",
+"\xE1\xBE\xAD" => "\x{1F6D}\x{0399}",
+"\xE1\xBE\xAE" => "\x{1F6E}\x{0399}",
+"\xE1\xBE\xAF" => "\x{1F6F}\x{0399}",
+"\xE1\xBE\xB2" => "\x{1FBA}\x{0399}",
+"\xE1\xBE\xB3" => "\x{0391}\x{0399}",
+"\xE1\xBE\xB4" => "\x{0386}\x{0399}",
+"\xE1\xBE\xB6" => "\x{0391}\x{0342}",
+"\xE1\xBE\xB7" => "\x{0391}\x{0342}\x{0399}",
+"\xE1\xBE\xBC" => "\x{0391}\x{0399}",
+"\xE1\xBF\x82" => "\x{1FCA}\x{0399}",
+"\xE1\xBF\x83" => "\x{0397}\x{0399}",
+"\xE1\xBF\x84" => "\x{0389}\x{0399}",
+"\xE1\xBF\x86" => "\x{0397}\x{0342}",
+"\xE1\xBF\x87" => "\x{0397}\x{0342}\x{0399}",
+"\xE1\xBF\x8C" => "\x{0397}\x{0399}",
+"\xE1\xBF\x92" => "\x{0399}\x{0308}\x{0300}",
+"\xE1\xBF\x93" => "\x{0399}\x{0308}\x{0301}",
+"\xE1\xBF\x96" => "\x{0399}\x{0342}",
+"\xE1\xBF\x97" => "\x{0399}\x{0308}\x{0342}",
+"\xE1\xBF\xA2" => "\x{03A5}\x{0308}\x{0300}",
+"\xE1\xBF\xA3" => "\x{03A5}\x{0308}\x{0301}",
+"\xE1\xBF\xA4" => "\x{03A1}\x{0313}",
+"\xE1\xBF\xA6" => "\x{03A5}\x{0342}",
+"\xE1\xBF\xA7" => "\x{03A5}\x{0308}\x{0342}",
+"\xE1\xBF\xB2" => "\x{1FFA}\x{0399}",
+"\xE1\xBF\xB3" => "\x{03A9}\x{0399}",
+"\xE1\xBF\xB4" => "\x{038F}\x{0399}",
+"\xE1\xBF\xB6" => "\x{03A9}\x{0342}",
+"\xE1\xBF\xB7" => "\x{03A9}\x{0342}\x{0399}",
+"\xE1\xBF\xBC" => "\x{03A9}\x{0399}",
+"\xEF\xAC\x80" => "\x{0046}\x{0046}",
+"\xEF\xAC\x81" => "\x{0046}\x{0049}",
+"\xEF\xAC\x82" => "\x{0046}\x{004C}",
+"\xEF\xAC\x83" => "\x{0046}\x{0046}\x{0049}",
+"\xEF\xAC\x84" => "\x{0046}\x{0046}\x{004C}",
+"\xEF\xAC\x85" => "\x{0053}\x{0054}",
+"\xEF\xAC\x86" => "\x{0053}\x{0054}",
+"\xEF\xAC\x93" => "\x{0544}\x{0546}",
+"\xEF\xAC\x94" => "\x{0544}\x{0535}",
+"\xEF\xAC\x95" => "\x{0544}\x{053B}",
+"\xEF\xAC\x96" => "\x{054E}\x{0546}",
+"\xEF\xAC\x97" => "\x{0544}\x{053D}",
);
return <<'END';
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 5fdac52dc6..09b8175cd7 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -1658,16 +1658,18 @@ sub SpecialCasing_txt()
{
my $NormalCase = do "To/$case.pl" || die "$0: $@\n";
- my @OUT = (
- $HEADER, "\n",
- "%utf8::ToSpec$case =\n(\n",
- );
+ my @OUT =
+ (
+ $HEADER, "\n",
+ "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+ "%utf8::ToSpec$case =\n(\n",
+ );
for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
my ($ix, $code, $to) = @$prop;
my $tostr =
join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
- push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix;
+ push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix)));
# Remove any single-character mappings for
# the same character since we are going for
# the special casing rules.
@@ -1719,14 +1721,16 @@ sub CaseFolding_txt()
#
my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
- my @OUT = (
- $HEADER, "\n",
- "%utf8::ToSpecFold =\n(\n",
- );
+ my @OUT =
+ (
+ $HEADER, "\n",
+ "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
+ "%utf8::ToSpecFold =\n(\n",
+ );
for my $code (sort { $a <=> $b } keys %Fold) {
my $foldstr =
join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
- push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code;
+ push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code)));
}
push @OUT, (
");\n\n",
diff --git a/lib/utf8.pm b/lib/utf8.pm
index f5eebe7fba..ea99dd9f86 100644
--- a/lib/utf8.pm
+++ b/lib/utf8.pm
@@ -31,9 +31,11 @@ utf8 - Perl pragma to enable/disable UTF-8 (or UTF-EBCDIC) in source code
use utf8;
no utf8;
+ # Convert a Perl scalar to/from UTF-8.
$num_octets = utf8::upgrade($string);
$success = utf8::downgrade($string[, FAIL_OK]);
+ # Change the native bytes of a Perl scalar to/from UTF-8 bytes.
utf8::encode($string);
utf8::decode($string);
@@ -133,18 +135,23 @@ pragma.
=item * utf8::encode($string)
-Converts (in-place) I<$string> from logical characters to octet
-sequence representing it in Perl's I<UTF-X> encoding. Returns
-nothing. Same as Encode::encode_utf8(). Note that this should not be
-used to convert a legacy byte encoding to Unicode: use Encode for
-that.
+Converts in-place the octets of the I<$string> to the octet sequence
+in Perl's I<UTF-X> encoding. Returns nothing. B<Note that this does
+not change the "type" of I<$string> to UTF-8>, and that this handles
+only ISO 8859-1 (or EBCDIC) as the source character set. Therefore
+this should not be used to convert a legacy 8-bit encoding to Unicode:
+use Encode::decode() for that. In the very limited case of wanting to
+handle just ISO 8859-1 (or EBCDIC), you could use utf8::upgrade().
=item * utf8::decode($string)
Attempts to convert I<$string> in-place from Perl's I<UTF-X> encoding
-into logical characters. Returns nothing. Same as Encode::decode_utf8().
-Note that this should not be used to convert Unicode back to a legacy
-byte encoding: use Encode for that.
+into octets. Returns nothing. B<Note that this does not change the
+"type" of I<$string> from UTF-8>, and that this handles only ISO 8859-1
+(or EBCDIC) as the destination character set. Therefore this should
+not be used to convert Unicode back to a legacy 8-bit encoding:
+use Encode::encode() for that. In the very limited case of wanting
+to handle just ISO 8859-1 (or EBCDIC), you could use utf8::downgrade().
=item * $flag = utf8::is_utf8(STRING)
diff --git a/lib/utf8.t b/lib/utf8.t
index 33cd5966af..90035e56b3 100644
--- a/lib/utf8.t
+++ b/lib/utf8.t
@@ -37,7 +37,7 @@ no utf8; # Ironic, no?
#
#
-plan tests => 143;
+plan tests => 144;
{
# bug id 20001009.001
@@ -409,3 +409,9 @@ SKIP: {
ok( utf8::is_utf8($b), " utf8::is_utf8 beyond"); # $b stays in UTF-8.
ok( utf8::is_utf8($c), " utf8::is_utf8 unicode");
}
+
+{
+ eval {utf8::encode("£")};
+ like($@, qr/^Modification of a read-only value attempted/,
+ "utf8::encode should refuse to touch read-only values");
+}
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
index 71d0e57cca..803df80759 100644
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -265,7 +265,7 @@ C<substr()> will work on the Unicode characters; regular expressions
will work on the Unicode characters (see L<perlunicode> and L<perlretut>).
Note that Perl considers combining character sequences to be
-characters, so for example
+separate characters, so for example
use charnames ':full';
print length("\N{LATIN CAPITAL LETTER A}\N{COMBINING ACUTE ACCENT}"), "\n";
@@ -299,8 +299,8 @@ If that variable isn't set, the encoding pragma will fail.
The C<Encode> module knows about many encodings and has interfaces
for doing conversions between those encodings:
- use Encode 'from_to';
- from_to($data, "iso-8859-3", "utf-8"); # from legacy to utf-8
+ use Encode 'decode';
+ $data = decode("iso-8859-3", $data); # convert from legacy to utf-8
=head2 Unicode I/O
diff --git a/sv.c b/sv.c
index 36a2339f1b..ab20b3984c 100644
--- a/sv.c
+++ b/sv.c
@@ -3454,6 +3454,10 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags)
sv_force_normal(sv);
}
+ if (SvREADONLY(sv)) {
+ Perl_croak(aTHX_ PL_no_modify);
+ }
+
if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING))
sv_recode_to_utf8(sv, PL_encoding);
else { /* Assume Latin-1/EBCDIC */
diff --git a/t/op/utfhash.t b/t/op/utfhash.t
index 9e0196b6b8..33909c0cbc 100644
--- a/t/op/utfhash.t
+++ b/t/op/utfhash.t
@@ -32,8 +32,9 @@ is($hashu{"\xff"},0xFF);
is($hashu{"\x7f"},0x7F);
# Now try same thing with variables forced into various forms.
-foreach my $a ("\x7f","\xff")
+foreach ("\x7f","\xff")
{
+ my $a = $_; # Force a copy
utf8::upgrade($a);
is($hash8{$a},ord($a));
is($hashu{$a},ord($a));
@@ -56,8 +57,9 @@ $hash8{chr(0x1ff)} = 0x1ff;
# Check we have not got an spurious extra keys
is(join('',sort { ord $a <=> ord $b } keys %hash8),"\x7f\xff\x{1ff}");
-foreach my $a ("\x7f","\xff","\x{1ff}")
+foreach ("\x7f","\xff","\x{1ff}")
{
+ my $a = $_;
utf8::upgrade($a);
is($hash8{$a},ord($a));
my $b = $a.chr(100);
@@ -69,8 +71,9 @@ foreach my $a ("\x7f","\xff","\x{1ff}")
is(delete $hashu{chr(0x1ff)},0x1ff);
is(join('',sort keys %hashu),"\x7f\xff");
-foreach my $a ("\x7f","\xff")
+foreach ("\x7f","\xff")
{
+ my $a = $_;
utf8::upgrade($a);
is($hashu{$a},ord($a));
utf8::downgrade($a);
diff --git a/t/uni/case.pl b/t/uni/case.pl
index b6df5a8089..0402be402f 100644
--- a/t/uni/case.pl
+++ b/t/uni/case.pl
@@ -20,15 +20,15 @@ sub casetest {
my %seen;
for my $i (sort keys %simple) {
- $seen{hex $i}++;
+ $seen{$i}++;
}
print "# ", scalar keys %simple, " simple mappings\n";
my $both;
for my $i (sort keys %$spec) {
- if (++$seen{hex $i} == 2) {
- warn "$base: $i seen twice\n";
+ if (++$seen{$i} == 2) {
+ warn sprintf "$base: $i seen twice\n";
$both++;
}
}
@@ -52,7 +52,7 @@ sub casetest {
my $test = 1;
- for my $i (sort { hex $a <=> hex $b } keys %simple) {
+ for my $i (sort keys %simple) {
my $w = $simple{$i};
my $c = pack "U0U", hex $i;
my $d = $func->($c);
@@ -62,9 +62,11 @@ sub casetest {
$test++;
}
- for my $i (sort { hex $a <=> hex $b } keys %$spec) {
+ for my $i (sort keys %$spec) {
my $w = unidump($spec->{$i});
- my $c = pack "U0U", hex $i;
+ my $u = unpack "U0U", $i;
+ my $h = sprintf "%04X", $u;
+ my $c = chr($u); $c .= chr(0x100); chop $c;
my $d = $func->($c);
my $e = unidump($d);
if (ord "A" == 193) { # EBCDIC
@@ -116,7 +118,7 @@ sub casetest {
# just undo our remapping.
}
print $w eq $e ?
- "ok $test # $i -> $w\n" : "not ok $test # $i -> $e ($w)\n";
+ "ok $test # $i -> $w\n" : "not ok $test # $h -> $e ($w)\n";
$test++;
}
diff --git a/utf8.c b/utf8.c
index 6a665eb110..6b2527cb03 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1401,21 +1401,19 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma
if (!*swashp) /* load on-demand */
*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
- if (special) {
+ /* The 0xDF is the only special casing Unicode code point below 0x100. */
+ if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
/* It might be "special" (sometimes, but not always,
* a multicharacter mapping) */
HV *hv;
- SV *keysv;
- HE *he;
- SV *val;
-
- if ((hv = get_hv(special, FALSE)) &&
- (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
- (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
- (val = HeVAL(he))) {
- char *s;
+ SV **svp;
+
+ if ((hv = get_hv(special, FALSE)) &&
+ (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
+ (*svp)) {
+ char *s;
- s = SvPV(val, len);
+ s = SvPV(*svp, len);
if (len == 1)
len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
else {
@@ -1426,7 +1424,7 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma
U8 *t = (U8*)s, *tend = t + len, *d;
d = tmpbuf;
- if (SvUTF8(val)) {
+ if (SvUTF8(*svp)) {
STRLEN tlen = 0;
while (t < tend) {