Update to Unicode::Collate 0.08.

p4raw-id: //depot/perl@11819
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-09-01 23:40:36 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-09-01 23:40:36 +0000
commit: d16e9e3d91e0575ab967c4d13e69d9d9569220a3 (patch)
tree: 93e8ef0e1d253acbbfbcd20bd425f563bfff5dfa /lib/Unicode
parent: 75685a94f35c086cc598b03baf224ef3dc31936b (diff)
download: perl-d16e9e3d91e0575ab967c4d13e69d9d9569220a3.tar.gz
4 files changed, 192 insertions, 37 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 91a957455e..113613e18f 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -7,7 +7,7 @@ use Carp;
 use Lingua::KO::Hangul::Util;
 require Exporter;
 
-our $VERSION = '0.07';
+our $VERSION = '0.08';
 our $PACKAGE = __PACKAGE__;
 
 our @ISA = qw(Exporter);
@@ -39,7 +39,7 @@ sub new
      ! defined $self->{alternate} ? '' : $self->{alternate};
 
   # collation level
-  $self->{level} ||= $self->{alternate} =~ /shift/ ? 4 : 3;
+  $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
 
   # normalization form
   $self->{normalization} = 'D' if ! exists $self->{normalization};
@@ -126,14 +126,13 @@ sub parseEntry
      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
   )
   {
-     $self->{ignored}{$ele} = 1;
-     $self->{entries}{$ele} = 1; # true
+     $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
   }
   else
   {
     foreach my $arr ($k =~ /\[(\S+)\]/g) {
       my $var = $arr =~ /\*/;
-      push @key, $self->getCE( $var, _getHexArray($arr) );
+      push @key, $self->altCE( $var, _getHexArray($arr) );
     }
     $self->{entries}{$ele} = \@key;
   }
@@ -142,17 +141,18 @@ sub parseEntry
 
 
 ##
-## list to collation element
+## arrayref CE = altCE(bool variable?, list[num] weights)
 ##
-sub getCE
+sub altCE
 {
   my $self = shift;
   my $var  = shift;
   my @c    = @_;
 
   $self->{alternate} eq 'blanked' ?
-     $var ? [0,0,0] : [ @c[0..2] ] :
-  $self->{alternate} eq 'non-ignorable' ? [ @c[0..2] ] :
+    $var ? [0,0,0] : [ @c[0..2] ] :
+  $self->{alternate} eq 'non-ignorable' ?
+    [ @c[0..2] ] :
   $self->{alternate} eq 'shifted' ?
     $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
   $self->{alternate} eq 'shift-trimmed' ?
@@ -161,7 +161,7 @@ sub getCE
 }
 
 ##
-## to debug
+## string hex_sortkey = splitCE(string arg)
 ##
 sub viewSortKey
 {
@@ -172,20 +172,17 @@ sub viewSortKey
   "[$view]";
 }
 
+
 ##
-## sort key
+## list[strings] elements = splitCE(string arg)
 ##
-sub getSortKey
+sub splitCE
 {
   my $self = shift;
   my $code = $self->{preprocess};
   my $norm = $self->{normalize};
   my $ent  = $self->{entries};
-  my $ign  = $self->{ignored};
   my $max  = $self->{maxlength};
-  my $lev  = $self->{level};
-  my $cjk  = $self->{overrideCJK};
-  my $hang = $self->{overrideHangul};
   my $rear = $self->{rearrangeHash};
 
   my $str = ref $code ? &$code(shift) : shift;
@@ -235,20 +232,111 @@ sub getSortKey
         last;
       }
     }
+    push @buf, $ch;
+  }
+  wantarray ? @buf : \@buf;
+}
 
-    next if !defined $ch || $ign->{$ch};   # ignored
-
-    push @buf,
-      $ent->{$ch}
-        ? @{ $ent->{$ch} }
-        : _isHangul($u)
-          ? $hang
-            ? &$hang($u)
-            : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
-          : _isCJK($u)
-            ? $cjk ? &$cjk($u) : map($self->getCE(0,@$_), _CJK($u))
-            : map($self->getCE(0,@$_), _derivCE($u));
+
+##
+## list[arrayrefs] weight = getWt(string element)
+##
+sub getWt
+{
+  my $self = shift;
+  my $ch   = shift;
+  my $ent  = $self->{entries};
+  my $ign  = $self->{ignored};
+  my $cjk  = $self->{overrideCJK};
+  my $hang = $self->{overrideHangul};
+  return if !defined $ch || $ign->{$ch};   # ignored
+  return @{ $ent->{$ch} } if $ent->{$ch};
+  my $u = unpack('U', $ch);
+  return
+    _isHangul($u)
+      ? $hang
+        ? &$hang($u)
+        : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
+      : _isCJK($u)
+        ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
+        : map($self->altCE(0,@$_), _derivCE($u));
+}
+
+##
+## int = index(string, substring)
+##
+sub index
+{
+  my $self = shift;
+  my $lev  = $self->{level};
+  my $str  = $self->splitCE(shift);
+  my $sub  = $self->splitCE(shift);
+
+  return wantarray ? (0,0) : 0 if ! @$sub;
+  return wantarray ?  ()  : -1 if ! @$str;
+
+  my @subWt = grep _ignorableAtLevel($_,$lev),
+              map $self->getWt($_), @$sub;
+
+  my(@strWt,@strPt);
+  my $count = 0;
+  for my $e (@$str){
+    my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
+    push @strWt, @tmp;
+    push @strPt, ($count) x @tmp; 
+    $count += length $e;
+    while(@strWt >= @subWt){
+      if(_eqArray(\@strWt, \@subWt, $lev)){
+        my $pos = $strPt[0];
+        return wantarray ? ($pos, $count-$pos) : $pos;
+      }
+      shift @strWt;
+      shift @strPt;
+    }
+  }
+  return wantarray ? () : -1;
+}
+
+##
+## bool _eqArray(arrayref, arrayref, level)
+##
+sub _eqArray($$$)
+{
+  my $a   = shift; # length $a >= length $b;
+  my $b   = shift;
+  my $lev = shift;
+  for my $v (0..$lev-1){
+    for my $c (0..@$b-1){
+      return if $a->[$c][$v] != $b->[$c][$v];
+    }
   }
+  return 1;
+}
+
+
+##
+## bool _ignorableAtLevel(CE, level)
+##
+sub _ignorableAtLevel($$)
+{
+  my $ce = shift;
+  return if ! defined $ce;
+  my $lv = shift;
+  ! grep { ! $ce->[$_] } 0..$lv-1;
+}
+
+
+##
+## string sortkey = getSortKey(string arg)
+##
+sub getSortKey
+{
+  my $self = shift;
+  my $lev  = $self->{level};
+  my $rCE  = $self->splitCE(shift); # get an arrayref
+
+  # weight arrays
+  my @buf = grep defined(), map $self->getWt($_), @$rCE;
 
   # make sort key
   my @ret = ([],[],[],[]);
@@ -282,7 +370,7 @@ sub getSortKey
 
 
 ##
-## cmp
+## int compare = cmp(string a, string b)
 ##
 sub cmp
 {
@@ -293,7 +381,7 @@ sub cmp
 }
 
 ##
-## sort
+## list[strings] sorted = sort(list[strings] arg)
 ##
 sub sort
 {
@@ -305,7 +393,7 @@ sub sort
 }
 
 ##
-## Derived CE
+## list[arrayrefs] CE = _derivCE(int codepoint)
 ##
 sub _derivCE
 {
@@ -327,7 +415,7 @@ sub _getHexArray
 }
 
 ##
-##  CJK Unified Ideographs
+## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
 ##
 sub _isCJK
 {
@@ -338,7 +426,7 @@ sub _isCJK
 }
 
 ##
-##  CJK Unified Ideographs
+## list[arrayref] CE = _CJK(int codepoint_of_CJK)
 ##
 sub _CJK
 {
@@ -347,7 +435,7 @@ sub _CJK
 }
 
 ##
-## Hangul Syllables
+## bool is_a_Hangul_Syllable = _isHangul(int codepoint) 
 ##
 sub _isHangul
 {
@@ -379,6 +467,8 @@ Unicode::Collate - use UCA (Unicode Collation Algorithm)
 
 =head2 Constructor and Tailoring
 
+The C<new> method returns a collator object.
+
    $UCA = Unicode::Collate->new(
       alternate => $alternate,
       backwards => $levelNumber, # or \@levelNumbers
@@ -599,6 +689,44 @@ and get the result of the comparison of the strings using UCA.
 
    $UCA->cmp($a, $b)
 
+=item C<$position = $UCA-E<gt>index($string, $substring)>
+
+=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
+
+-- see 6.8 Searching, UTR #10.
+
+If C<$substring> matches a part of C<$string>, returns
+the position of the first occurrence of the matching part in scalar context;
+in list context, returns a two-element list of
+the position and the length of the matching part.
+
+B<Notice> that the length of the matching part may differ from
+the length of C<$substring>.
+
+B<Note> that the position and the length are counted on the string
+after the process of preprocess, normalization, and rearrangement.
+Therefore, in case the specified string is not binary equal to
+the preprocessed/normalized/rearranged string, the position and the length
+may differ form those on the specified string. But it is guaranteed 
+that, if matched, it returns a non-negative value as C<$position>.
+
+If C<$substring> does not match any part of C<$string>,
+returns C<-1> in scalar context and
+an empty list in list context.
+
+e.g. you say
+
+  my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
+  my $str = "Ich mu\x{00DF} studieren.";
+  my $sub = "m\x{00FC}ss";
+  my $match;
+  if(my @tmp = $UCA->index($str, $sub)){
+    $match = substr($str, $tmp[0], $tmp[1]);
+  }
+
+and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
+is primary equal to C<"m>E<252>C<ss">. 
+
 =back
 
 =head2 EXPORT
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes
index 57e56b24e8..9b4ed1b626 100644
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,5 +1,8 @@
 Revision history for Perl extension Unicode::Collate.
 
+0.08  Mon Aug 20 22:40:18 2001
+    - add the index method.
+
 0.07  Thu Aug 16 23:42:02 2001
     - rename the module name to Unicode::Collate.
 
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README
index bf0efff22f..0cf2bc3c55 100644
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.07
+Unicode/Collate version 0.08
 ===============================
 
 use UCA (Unicode Collation Algorithm)
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t
index be4cc4a3db..352c44fe6a 100644
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -4,7 +4,7 @@
 #########################
 
 use Test;
-BEGIN { plan tests => 20 };
+BEGIN { plan tests => 22 };
 use Unicode::Collate;
 ok(1); # If we made it this far, we're ok.
 
@@ -43,7 +43,7 @@ if(!$@){
   ok($NFD->cmp("A$acute", $A_acute), 0);
 }
 else{
-  ok(1,1);
+  ok(1);
 }
 
 my $tr = Unicode::Collate->new(
@@ -113,3 +113,27 @@ my $ign = Unicode::Collate->new(
 
 ok( $ign->cmp("element","lament"), 0);
 
+$UCA->{level} = 2;
+
+my $orig = "This is a Perl book.";
+my $str;
+my $sub = "PERL";
+my $rep = "camel";
+my $ret = "This is a camel book.";
+
+$str = $orig;
+if(my @tmp = $UCA->index($str, $sub)){
+  substr($str, $tmp[0], $tmp[1], $rep);
+}
+
+ok($str, $ret);
+
+$UCA->{level} = $old_level;
+
+$str = $orig;
+if(my @tmp = $UCA->index($str, $sub)){
+  substr($str, $tmp[0], $tmp[1], $rep);
+}
+
+ok($str, $orig);
+
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-09-01 23:40:36 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-09-01 23:40:36 +0000
commit	d16e9e3d91e0575ab967c4d13e69d9d9569220a3 (patch)
tree	93e8ef0e1d253acbbfbcd20bd425f563bfff5dfa /lib/Unicode
parent	75685a94f35c086cc598b03baf224ef3dc31936b (diff)
download	perl-d16e9e3d91e0575ab967c4d13e69d9d9569220a3.tar.gz