summaryrefslogtreecommitdiff
path: root/ext/Unicode
diff options
context:
space:
mode:
authorSADAHIRO Tomoyuki <BQW10602@nifty.com>2002-01-10 10:08:56 +0900
committerJarkko Hietaniemi <jhi@iki.fi>2002-01-09 19:46:27 +0000
commitd85850a777b87feb3a01d2328aa281372642877e (patch)
treee75119d5803e7b5e3272492b3fe832c24bc37da4 /ext/Unicode
parent56ece252362762758f1efe6f62541c52eb1e969b (diff)
downloadperl-d85850a777b87feb3a01d2328aa281372642877e.tar.gz
[Patch @14129] fixes Unicode::Normalize
Message-Id: <20020110010110.690B.BQW10602@nifty.com> p4raw-id: //depot/perl@14156
Diffstat (limited to 'ext/Unicode')
-rw-r--r--ext/Unicode/Normalize/Makefile.PL2
-rw-r--r--ext/Unicode/Normalize/Normalize.pm28
-rw-r--r--ext/Unicode/Normalize/Normalize.pod89
-rw-r--r--ext/Unicode/Normalize/Normalize.xs24
-rw-r--r--ext/Unicode/Normalize/mkheader32
5 files changed, 33 insertions, 142 deletions
diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL
index 88ab9b7b63..2b834d7dae 100644
--- a/ext/Unicode/Normalize/Makefile.PL
+++ b/ext/Unicode/Normalize/Makefile.PL
@@ -9,7 +9,7 @@ WriteMakefile(
'NAME' => 'Unicode::Normalize',
'VERSION_FROM' => 'Normalize.pm', # finds $VERSION
($] >= 5.005 ? ## Add these new keywords supported since 5.005
- (ABSTRACT_FROM => 'Normalize.pod', # retrieve abstract from module
+ (ABSTRACT_FROM => 'Normalize.pm', # retrieve abstract from module
AUTHOR => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>') : ()),
clean => {FILES=> 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h'},
);
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm
index 40d326ff04..819fbc4bf9 100644
--- a/ext/Unicode/Normalize/Normalize.pm
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -5,7 +5,7 @@ use strict;
use warnings;
use Carp;
-our $VERSION = '0.12';
+our $VERSION = '0.13';
our $PACKAGE = __PACKAGE__;
require Exporter;
@@ -22,24 +22,24 @@ our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
bootstrap Unicode::Normalize $VERSION;
-use constant CANON => 0;
use constant COMPAT => 1;
-sub NFD ($) { reorder(decompose($_[0], CANON )) }
+sub NFD ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
-sub NFC ($) { compose(reorder(decompose($_[0], CANON ))) }
+sub NFC ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
sub normalize($$)
{
- my $form = shift;
- $form =~ s/^NF//;
- $form eq 'D' ? NFD ($_[0]) :
- $form eq 'C' ? NFC ($_[0]) :
- $form eq 'KD' ? NFKD($_[0]) :
- $form eq 'KC' ? NFKC($_[0]) :
- croak $PACKAGE."::normalize: invalid form name: $form";
+ my $form = shift;
+ $form =~ s/^NF//;
+ return
+ $form eq 'D' ? NFD ($_[0]) :
+ $form eq 'C' ? NFC ($_[0]) :
+ $form eq 'KD' ? NFKD($_[0]) :
+ $form eq 'KC' ? NFKC($_[0]) :
+ croak $PACKAGE."::normalize: invalid form name: $form";
}
1;
@@ -69,7 +69,7 @@ Unicode::Normalize - normalized forms of Unicode text
=head1 DESCRIPTION
-=head2 Normalization
+=head2 Normalization Forms
=over 4
@@ -107,7 +107,7 @@ As C<$form_name>, one of the following names must be given.
These functions are interface of character data used internally.
If you want only to get unicode normalization forms,
-you need not to call them by yourself.
+you doesn't need call them by yourself.
=over 4
@@ -123,7 +123,7 @@ If it is not decomposable, returns undef.
=item C<$uv_composite = getComposite($uv_here, $uv_next)>
-If the couple of two characters here and next (as codepoints) is composable
+If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Exclusions),
returns the codepoint of the composite.
diff --git a/ext/Unicode/Normalize/Normalize.pod b/ext/Unicode/Normalize/Normalize.pod
deleted file mode 100644
index 4ac8966a83..0000000000
--- a/ext/Unicode/Normalize/Normalize.pod
+++ /dev/null
@@ -1,89 +0,0 @@
-
-=head1 NAME
-
-Unicode::Normalize - normalized forms of Unicode text
-
-=head1 SYNOPSIS
-
- use Unicode::Normalize;
-
- $string_NFD = NFD($raw_string); # Normalization Form D
- $string_NFC = NFC($raw_string); # Normalization Form C
- $string_NFKD = NFKD($raw_string); # Normalization Form KD
- $string_NFKC = NFKC($raw_string); # Normalization Form KC
-
- or
-
- use Unicode::Normalize 'normalize';
-
- $string_NFD = normalize('D', $raw_string); # Normalization Form D
- $string_NFC = normalize('C', $raw_string); # Normalization Form C
- $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
- $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
-
-=head1 DESCRIPTION
-
-=over 4
-
-=item C<$string_NFD = NFD($raw_string)>
-
-returns the Normalization Form D (formed by canonical decomposition).
-
-
-=item C<$string_NFC = NFC($raw_string)>
-
-returns the Normalization Form C (formed by canonical decomposition
-followed by canonical composition).
-
-=item C<$string_NFKD = NFKD($raw_string)>
-
-returns the Normalization Form KD (formed by compatibility decomposition).
-
-=item C<$string_NFKC = NFKC($raw_string)>
-
-returns the Normalization Form KC (formed by compatibility decomposition
-followed by B<canonical> composition).
-
-=item C<$normalized_string = normalize($form_name, $raw_string)>
-
-As C<$form_name>, one of the following names must be given.
-
- 'C' or 'NFC' for Normalization Form C
- 'D' or 'NFD' for Normalization Form D
- 'KC' or 'NFKC' for Normalization Form KC
- 'KD' or 'NFKD' for Normalization Form KD
-
-=back
-
-=head2 EXPORT
-
-C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
-
-C<normalize>: on request.
-
-=head1 AUTHOR
-
-SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
-
- http://homepage1.nifty.com/nomenclator/perl/
-
- Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
-
- This program is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
-
-=head1 SEE ALSO
-
-=over 4
-
-=item L<Lingua::KO::Hangul::Util>
-
-utility functions for Hangul Syllables
-
-=item http://www.unicode.org/unicode/reports/tr15/
-
-Unicode Normalization Forms - UAX #15
-
-=back
-
-=cut
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs
index 7adad7efd9..0b0809e50d 100644
--- a/ext/Unicode/Normalize/Normalize.xs
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -146,10 +146,10 @@ void sv_cat_decompHangul (SV* sv, UV uv)
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
SV*
-decompose(arg, compat)
+decompose(arg, compat = &PL_sv_no)
SV * arg
SV * compat
- PROTOTYPE: $
+ PROTOTYPE: $;$
PREINIT:
UV uv;
SV *src, *dst;
@@ -275,7 +275,7 @@ compose(arg)
s = (U8*)SvPV(src, srclen);
e = s + srclen;
- dstlen = srclen + 1; /* equal or shorter, XXX */
+ dstlen = srclen + 1;
dst = newSV(dstlen);
(void)SvPOK_only(dst);
SvUTF8_on(dst);
@@ -317,6 +317,16 @@ compose(arg)
/* S + C + S => S-S + C would be also blocked. */
if( uvComp && ! isExclusion(uvComp) && preCC <= curCC)
{
+ STRLEN leftcur, rightcur, dstcur;
+ leftcur = UNISKIP(uvComp);
+ rightcur = UNISKIP(uvS) + UNISKIP(uv);
+
+ if (leftcur > rightcur) {
+ dstcur = d - (U8*)SvPVX(dst);
+ dstlen += leftcur - rightcur;
+ d = (U8*)SvGROW(dst,dstlen) + dstcur;
+ }
+
/* preCC not changed to curCC */
uvS = uvComp;
} else if (! curCC && p < e) { /* blocked */
@@ -328,15 +338,15 @@ compose(arg)
}
}
d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
- if((tmplen = t - tmp_start)) { /* uncomposed combining char */
+ tmplen = t - tmp_start;
+ if (tmplen) { /* uncomposed combining char */
t = (U8*)SvPVX(tmp);
while(tmplen--) *d++ = *t++;
}
uvS = uv;
} /* for */
- e = d; /* end of dst */
- d = (U8*)SvPVX(dst);
- SvCUR_set(dst, e - d);
+
+ SvCUR_set(dst, d - (U8*)SvPVX(dst));
RETVAL = dst;
OUTPUT:
RETVAL
diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader
index 5793e4a757..aa6a153bf1 100644
--- a/ext/Unicode/Normalize/mkheader
+++ b/ext/Unicode/Normalize/mkheader
@@ -111,7 +111,7 @@ sub _getHexArray {
sub _U_stringify {
sprintf '"%s"', join '',
- map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_;
+ map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
}
foreach my $hash (\%Canon, \%Compat) {
@@ -120,48 +120,18 @@ foreach my $hash (\%Canon, \%Compat) {
}
}
-sub utf8len {
- my $uv = shift;
- return $uv < 0x80 ? 1 :
- $uv < 0x800 ? 2 :
- $uv < 0x10000 ? 3 :
- $uv < 0x110000 ? 4 :
- croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
-}
-
-sub utfebcdiclen {
- my $uv = shift;
- return $uv < 0xA0 ? 1 :
- $uv < 0x400 ? 2 :
- $uv < 0x4000 ? 3 :
- $uv < 0x40000 ? 4 :
- $uv < 0x110000 ? 5 :
- croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
-}
-
my $prefix = "UNF_";
my $structname = "${prefix}complist";
our (%Comp1st, %CompList);
-my $errExpand = "$PACKAGE: A composable pair in %s "
- . "is longer than the composite in bytes!\n"
- . "%d + %d => %d\nQuit. Please inform the author...";
-
foreach(sort keys %Compos) {
my @a = unpack('U*', $_);
my $val = $Compos{$_};
my $name = sprintf "${structname}_%06x", $a[0];
$Comp1st{ $a[0] } = $name;
$CompList{ $name }{ $a[1] } = $val;
-
- if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) {
- croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val);
- }
- if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) {
- croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val);
- }
}
my $compinit =