[Patch @14129] fixes Unicode::Normalize

Message-Id: <20020110010110.690B.BQW10602@nifty.com> p4raw-id: //depot/perl@14156
author: SADAHIRO Tomoyuki <BQW10602@nifty.com> 2002-01-10 10:08:56 +0900
committer: Jarkko Hietaniemi <jhi@iki.fi> 2002-01-09 19:46:27 +0000
commit: d85850a777b87feb3a01d2328aa281372642877e (patch)
tree: e75119d5803e7b5e3272492b3fe832c24bc37da4 /ext/Unicode
parent: 56ece252362762758f1efe6f62541c52eb1e969b (diff)
download: perl-d85850a777b87feb3a01d2328aa281372642877e.tar.gz
5 files changed, 33 insertions, 142 deletions
diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL
index 88ab9b7b63..2b834d7dae 100644
--- a/ext/Unicode/Normalize/Makefile.PL
+++ b/ext/Unicode/Normalize/Makefile.PL
@@ -9,7 +9,7 @@ WriteMakefile(
     'NAME'		=> 'Unicode::Normalize',
     'VERSION_FROM'	=> 'Normalize.pm', # finds $VERSION
     ($] >= 5.005 ?    ## Add these new keywords supported since 5.005
-      (ABSTRACT_FROM => 'Normalize.pod', # retrieve abstract from module
+      (ABSTRACT_FROM => 'Normalize.pm', # retrieve abstract from module
        AUTHOR     => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>') : ()),
     clean => {FILES=> 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h'},
 );
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm
index 40d326ff04..819fbc4bf9 100644
--- a/ext/Unicode/Normalize/Normalize.pm
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -5,7 +5,7 @@ use strict;
 use warnings;
 use Carp;
 
-our $VERSION = '0.12';
+our $VERSION = '0.13';
 our $PACKAGE = __PACKAGE__;
 
 require Exporter;
@@ -22,24 +22,24 @@ our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
 
 bootstrap Unicode::Normalize $VERSION;
 
-use constant CANON  => 0;
 use constant COMPAT => 1;
 
-sub NFD  ($) { reorder(decompose($_[0], CANON )) }
+sub NFD  ($) { reorder(decompose($_[0])) }
 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
 
-sub NFC  ($) { compose(reorder(decompose($_[0], CANON ))) }
+sub NFC  ($) { compose(reorder(decompose($_[0]))) }
 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
 
 sub normalize($$)
 {
-  my $form = shift;
-  $form =~ s/^NF//;
-  $form eq 'D'  ? NFD ($_[0]) :
-  $form eq 'C'  ? NFC ($_[0]) :
-  $form eq 'KD' ? NFKD($_[0]) :
-  $form eq 'KC' ? NFKC($_[0]) :
-    croak $PACKAGE."::normalize: invalid form name: $form";
+    my $form = shift;
+    $form =~ s/^NF//;
+    return
+	$form eq 'D'  ? NFD ($_[0]) :
+	$form eq 'C'  ? NFC ($_[0]) :
+	$form eq 'KD' ? NFKD($_[0]) :
+	$form eq 'KC' ? NFKC($_[0]) :
+      croak $PACKAGE."::normalize: invalid form name: $form";
 }
 
 1;
@@ -69,7 +69,7 @@ Unicode::Normalize - normalized forms of Unicode text
 
 =head1 DESCRIPTION
 
-=head2 Normalization
+=head2 Normalization Forms
 
 =over 4
 
@@ -107,7 +107,7 @@ As C<$form_name>, one of the following names must be given.
 
 These functions are interface of character data used internally.
 If you want only to get unicode normalization forms, 
-you need not to call them by yourself.
+you doesn't need call them by yourself.
 
 =over 4
 
@@ -123,7 +123,7 @@ If it is not decomposable, returns undef.
 
 =item C<$uv_composite = getComposite($uv_here, $uv_next)>
 
-If the couple of two characters here and next (as codepoints) is composable
+If two characters here and next (as codepoints) are composable
 (including Hangul Jamo/Syllables and Exclusions),
 returns the codepoint of the composite.
 
diff --git a/ext/Unicode/Normalize/Normalize.pod b/ext/Unicode/Normalize/Normalize.pod
deleted file mode 100644
index 4ac8966a83..0000000000
--- a/ext/Unicode/Normalize/Normalize.pod
+++ /dev/null
@@ -1,89 +0,0 @@
-
-=head1 NAME
-
-Unicode::Normalize - normalized forms of Unicode text
-
-=head1 SYNOPSIS
-
-  use Unicode::Normalize;
-
-  $string_NFD  = NFD($raw_string);  # Normalization Form D
-  $string_NFC  = NFC($raw_string);  # Normalization Form C
-  $string_NFKD = NFKD($raw_string); # Normalization Form KD
-  $string_NFKC = NFKC($raw_string); # Normalization Form KC
-
-   or
-
-  use Unicode::Normalize 'normalize';
-
-  $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
-  $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
-  $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
-  $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC
-
-=head1 DESCRIPTION
-
-=over 4
-
-=item C<$string_NFD = NFD($raw_string)>
-
-returns the Normalization Form D (formed by canonical decomposition).
-
-
-=item C<$string_NFC = NFC($raw_string)>
-
-returns the Normalization Form C (formed by canonical decomposition
-followed by canonical composition).
-
-=item C<$string_NFKD = NFKD($raw_string)>
-
-returns the Normalization Form KD (formed by compatibility decomposition).
-
-=item C<$string_NFKC = NFKC($raw_string)>
-
-returns the Normalization Form KC (formed by compatibility decomposition
-followed by B<canonical> composition).
-
-=item C<$normalized_string = normalize($form_name, $raw_string)>
-
-As C<$form_name>, one of the following names must be given.
-
-  'C'  or 'NFC'  for Normalization Form C
-  'D'  or 'NFD'  for Normalization Form D
-  'KC' or 'NFKC' for Normalization Form KC
-  'KD' or 'NFKD' for Normalization Form KD
-
-=back
-
-=head2 EXPORT
-
-C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
-
-C<normalize>: on request.
-
-=head1 AUTHOR
-
-SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
-
-  http://homepage1.nifty.com/nomenclator/perl/
-
-  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
-
-  This program is free software; you can redistribute it and/or 
-  modify it under the same terms as Perl itself.
-
-=head1 SEE ALSO
-
-=over 4
-
-=item L<Lingua::KO::Hangul::Util>
-
-utility functions for Hangul Syllables
-
-=item http://www.unicode.org/unicode/reports/tr15/
-
-Unicode Normalization Forms - UAX #15
-
-=back
-
-=cut
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs
index 7adad7efd9..0b0809e50d 100644
--- a/ext/Unicode/Normalize/Normalize.xs
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -146,10 +146,10 @@ void sv_cat_decompHangul (SV* sv, UV uv)
 MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
 
 SV*
-decompose(arg, compat)
+decompose(arg, compat = &PL_sv_no)
     SV * arg
     SV * compat
-  PROTOTYPE: $
+  PROTOTYPE: $;$
   PREINIT:
     UV uv;
     SV *src, *dst;
@@ -275,7 +275,7 @@ compose(arg)
 
     s = (U8*)SvPV(src, srclen);
     e = s + srclen;
-    dstlen = srclen + 1; /* equal or shorter, XXX */
+    dstlen = srclen + 1;
     dst = newSV(dstlen);
     (void)SvPOK_only(dst);
     SvUTF8_on(dst);
@@ -317,6 +317,16 @@ compose(arg)
 	/* S + C + S => S-S + C would be also blocked. */
 		if( uvComp && ! isExclusion(uvComp) && preCC <= curCC)
 		{
+		    STRLEN leftcur, rightcur, dstcur;
+		    leftcur  = UNISKIP(uvComp);
+		    rightcur = UNISKIP(uvS) + UNISKIP(uv);
+
+		    if (leftcur > rightcur) {
+			dstcur = d - (U8*)SvPVX(dst);
+			dstlen += leftcur - rightcur;
+			d = (U8*)SvGROW(dst,dstlen) + dstcur;
+		    }
+
 		    /* preCC not changed to curCC */
 		    uvS = uvComp;
 	        } else if (! curCC && p < e) { /* blocked */
@@ -328,15 +338,15 @@ compose(arg)
 	    }
 	}
 	d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
-	if((tmplen = t - tmp_start)) { /* uncomposed combining char */
+	tmplen = t - tmp_start;
+	if (tmplen) { /* uncomposed combining char */
 	    t = (U8*)SvPVX(tmp);
 	    while(tmplen--) *d++ = *t++;
 	}
 	uvS = uv;
     } /* for */
-    e = d; /* end of dst */
-    d = (U8*)SvPVX(dst);
-    SvCUR_set(dst, e - d);
+
+    SvCUR_set(dst, d - (U8*)SvPVX(dst));
     RETVAL = dst;
   OUTPUT:
     RETVAL
diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader
index 5793e4a757..aa6a153bf1 100644
--- a/ext/Unicode/Normalize/mkheader
+++ b/ext/Unicode/Normalize/mkheader
@@ -111,7 +111,7 @@ sub _getHexArray {
 
 sub _U_stringify {
   sprintf '"%s"', join '',
-    map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_;
+    map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
 }
 
 foreach my $hash (\%Canon, \%Compat) {
@@ -120,48 +120,18 @@ foreach my $hash (\%Canon, \%Compat) {
   }
 }
 
-sub utf8len {
-  my $uv = shift;
-  return $uv < 0x80 ? 1 :
-        $uv < 0x800 ? 2 :
-      $uv < 0x10000 ? 3 :
-     $uv < 0x110000 ? 4 :
-  croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
-}
-
-sub utfebcdiclen {
-  my $uv = shift;
-  return $uv < 0xA0 ? 1 :
-        $uv < 0x400 ? 2 :
-       $uv < 0x4000 ? 3 :
-      $uv < 0x40000 ? 4 :
-     $uv < 0x110000 ? 5 :
-  croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
-}
-
 my $prefix = "UNF_";
 
 my $structname = "${prefix}complist";
 
 our (%Comp1st, %CompList);
 
-my $errExpand = "$PACKAGE: A composable pair in %s "
-	. "is longer than the composite in bytes!\n"
-	. "%d + %d => %d\nQuit. Please inform the author...";
-
 foreach(sort keys %Compos) {
   my @a = unpack('U*', $_);
   my $val = $Compos{$_};
   my $name = sprintf "${structname}_%06x", $a[0];
   $Comp1st{ $a[0] } = $name;
   $CompList{ $name }{ $a[1] } = $val;
-
-  if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) {
-    croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val);
-  }
-  if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) {
-    croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val);
-  }
 }
 
 my $compinit =
author	SADAHIRO Tomoyuki <BQW10602@nifty.com>	2002-01-10 10:08:56 +0900
committer	Jarkko Hietaniemi <jhi@iki.fi>	2002-01-09 19:46:27 +0000
commit	d85850a777b87feb3a01d2328aa281372642877e (patch)
tree	e75119d5803e7b5e3272492b3fe832c24bc37da4 /ext/Unicode
parent	56ece252362762758f1efe6f62541c52eb1e969b (diff)
download	perl-d85850a777b87feb3a01d2328aa281372642877e.tar.gz