summaryrefslogtreecommitdiff
path: root/lib/Lingua/KO/Hangul/Util.pm
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-11-09 00:23:40 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-11-09 00:23:40 +0000
commitac5ea53171baa7dab1a92df1eacf8d2fe19cbdbb (patch)
tree5ce2221e6a7335594861f38233d4d665313a7c82 /lib/Lingua/KO/Hangul/Util.pm
parent61a515a61510e728f2014674d12cb94cb5a90834 (diff)
downloadperl-ac5ea53171baa7dab1a92df1eacf8d2fe19cbdbb.tar.gz
Upgrade to Unicode::Normalize 0.10, now in XS.
The CPAN distribution has both pm and XS implementations, and for performance reasons we choose the XS. Another reason to choose the XS is that it doesn't require Lingua::KO::Hangul::Util, which means that we can delete that-- which in turn means that Unicode::UCD cannot expect that: support it, but don't expect. Ditto Unicode::Collate. Note that Unicode::Normalize Makefile.PL and Normalize.xs have been modified from the CPAN 0.10 versions: the first one to be simpler (no pm) and clean up the generated unf*.h files, the second one to quench compiler grumblings. Must notify Sadahiro about these changes. p4raw-id: //depot/perl@12909
Diffstat (limited to 'lib/Lingua/KO/Hangul/Util.pm')
-rw-r--r--lib/Lingua/KO/Hangul/Util.pm278
1 files changed, 0 insertions, 278 deletions
diff --git a/lib/Lingua/KO/Hangul/Util.pm b/lib/Lingua/KO/Hangul/Util.pm
deleted file mode 100644
index 3848592903..0000000000
--- a/lib/Lingua/KO/Hangul/Util.pm
+++ /dev/null
@@ -1,278 +0,0 @@
-package Lingua::KO::Hangul::Util;
-
-use 5.006;
-use strict;
-use warnings;
-
-require Exporter;
-
-our @ISA = qw(Exporter);
-our %EXPORT_TAGS = ();
-our @EXPORT_OK = ();
-our @EXPORT = qw(
- decomposeHangul
- composeHangul
- getHangulName
- parseHangulName
-);
-our $VERSION = '0.02';
-
-our @JamoL = ( # Initial (HANGUL CHOSEONG)
- "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
- "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
- );
-
-our @JamoV = ( # Medial (HANGUL JUNGSEONG)
- "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
- "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
- "YU", "EU", "YI", "I",
- );
-
-our @JamoT = ( # Final (HANGUL JONGSEONG)
- "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
- "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
- "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
- );
-
-our $BlockName = "HANGUL SYLLABLE ";
-
-use constant SBase => 0xAC00;
-use constant LBase => 0x1100;
-use constant VBase => 0x1161;
-use constant TBase => 0x11A7;
-use constant LCount => 19; # scalar @JamoL
-use constant VCount => 21; # scalar @JamoV
-use constant TCount => 28; # scalar @JamoT
-use constant NCount => 588; # VCount * TCount
-use constant SCount => 11172; # LCount * NCount
-use constant SFinal => 0xD7A3; # SBase -1 + SCount
-
-our(%CodeL, %CodeV, %CodeT);
-@CodeL{@JamoL} = 0 .. LCount-1;
-@CodeV{@JamoV} = 0 .. VCount-1;
-@CodeT{@JamoT} = 0 .. TCount-1;
-
-sub getHangulName {
- my $code = shift;
- return undef unless SBase <= $code && $code <= SFinal;
- my $SIndex = $code - SBase;
- my $LIndex = int( $SIndex / NCount);
- my $VIndex = int(($SIndex % NCount) / TCount);
- my $TIndex = $SIndex % TCount;
- "$BlockName$JamoL[$LIndex]$JamoV[$VIndex]$JamoT[$TIndex]";
-}
-
-sub parseHangulName {
- my $arg = shift;
- return undef unless $arg =~ s/$BlockName//o;
- return undef unless $arg =~ /^([^AEIOUWY]*)([AEIOUWY]+)([^AEIOUWY]*)$/;
- return undef unless exists $CodeL{$1}
- && exists $CodeV{$2}
- && exists $CodeT{$3};
- SBase + $CodeL{$1} * NCount + $CodeV{$2} * TCount + $CodeT{$3};
-}
-
-sub decomposeHangul {
- my $code = shift;
- return unless SBase <= $code && $code <= SFinal;
- my $SIndex = $code - SBase;
- my $LIndex = int( $SIndex / NCount);
- my $VIndex = int(($SIndex % NCount) / TCount);
- my $TIndex = $SIndex % TCount;
- my @ret = (
- LBase + $LIndex,
- VBase + $VIndex,
- $TIndex ? (TBase + $TIndex) : (),
- );
- wantarray ? @ret : pack('U*', @ret);
-}
-
-#
-# To Do:
-# s/(\p{JamoL}\p{JamoV})/toHangLV($1)/ge;
-# s/(\p{HangLV}\p{JamoT})/toHangLVT($1)/ge;
-#
-sub composeHangul {
- my $str = shift;
- return $str unless length $str;
- my(@ret);
-
- foreach my $ch (unpack('U*', $str)) # Makes list! The string be short!
- {
- push(@ret, $ch) and next unless @ret;
-
- # 1. check to see if $ret[-1] is L and $ch is V.
- my $LIndex = $ret[-1] - LBase;
- if(0 <= $LIndex && $LIndex < LCount)
- {
- my $VIndex = $ch - VBase;
- if(0 <= $VIndex && $VIndex < VCount)
- {
- $ret[-1] = SBase + ($LIndex * VCount + $VIndex) * TCount;
- next; # discard $ch
- }
- }
-
- # 2. check to see if $ret[-1] is LV and $ch is T.
- my $SIndex = $ret[-1] - SBase;
- if(0 <= $SIndex && $SIndex < SCount && $SIndex % TCount == 0)
- {
- my $TIndex = $ch - TBase;
- if(0 <= $TIndex && $TIndex < TCount)
- {
- $ret[-1] += $TIndex;
- next; # discard $ch
- }
- }
-
- # 3. just append $ch
- push(@ret, $ch);
- }
- wantarray ? @ret : pack('U*', @ret);
-}
-
-1;
-__END__
-
-=head1 NAME
-
-Lingua::KO::Hangul::Util - utility functions for Hangul Syllables
-
-=head1 SYNOPSIS
-
- use Lingua::KO::Hangul::Util;
-
- decomposeHangul(0xAC00);
- # (0x1100,0x1161) or "\x{1100}\x{1161}"
-
- composeHangul("\x{1100}\x{1161}");
- # "\x{AC00}"
-
- getHangulName(0xAC00);
- # "HANGUL SYLLABLE GA"
-
- parseHangulName("HANGUL SYLLABLE GA");
- # 0xAC00
-
-=head1 DESCRIPTION
-
-A Hangul syllable consists of Hangul Jamo.
-
-Hangul Jamo are classified into three classes:
-
- CHOSEONG (the initial sound) as a leading consonant (L),
- JUNGSEONG (the medial sound) as a vowel (V),
- JONGSEONG (the final sound) as a trailing consonant (T).
-
-Any Hangul syllable is a composition of
-
- i) CHOSEONG + JUNGSEONG (L + V)
-
- or
-
- ii) CHOSEONG + JUNGSEONG + JONGSEONG (L + V + T).
-
-Names of Hangul Syllables have a format of C<"HANGUL SYLLABLE %s">.
-
-=head2 Composition and Decomposition
-
-=over 4
-
-=item C<$string_decomposed = decomposeHangul($codepoint)>
-
-=item C<@codepoints = decomposeHangul($codepoint)>
-
-Accepts unicode codepoint integer.
-
-If the specified codepoint is of a Hangul syllable,
-returns a list of codepoints (in a list context)
-or a UTF-8 string (in a scalar context)
-of its decomposition.
-
- decomposeHangul(0xAC00) # U+AC00 is HANGUL SYLLABLE GA.
- returns "\x{1100}\x{1161}" or (0x1100, 0x1161);
-
- decomposeHangul(0xAE00) # U+AE00 is HANGUL SYLLABLE GEUL.
- returns "\x{1100}\x{1173}\x{11AF}" or (0x1100, 0x1173, 0x11AF);
-
-Otherwise, returns false (empty string or empty list).
-
- decomposeHangul(0x0041) # outside Hangul Syllables
- returns empty string or empty list.
-
-=item C<$string_composed = composeHangul($src_string)>
-
-=item C<@codepoints_composed = composeHangul($src_string)>
-
-Any sequence of an initial Jamo C<L> and a medial Jamo C<V>
-is composed into a syllable C<LV>;
-then any sequence of a syllable C<LV> and a final Jamo C<T>
-is composed into a syllable C<LVT>.
-
-Any characters other than Hangul Jamo and Hangul Syllables
-are unaffected.
-
- composeHangul("Hangul \x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}.")
- returns "Hangul \x{AC00}\x{AE00}." or
- (0x48,0x61,0x6E,0x67,0x75,0x6C,0x20,0xAC00,0xAE00,0x2E);
-
-=back
-
-=head2 Hangul Syllable Name
-
-=over 4
-
-=item C<$name = getHangulName($codepoint)>
-
-If the specified codepoint is of a Hangul syllable,
-returns its name; otherwise returns undef.
-
- getHangulName(0xAC00) returns "HANGUL SYLLABLE GA";
- getHangulName(0x0041) returns undef.
-
-=item C<$codepoint = parseHangulName($name)>
-
-If the specified name is of a Hangul syllable,
-returns its codepoint; otherwise returns undef.
-
- parseHangulName("HANGUL SYLLABLE GEUL") returns 0xAE00;
-
- parseHangulName("LATIN SMALL LETTER A") returns undef;
-
- parseHangulName("HANGUL SYLLABLE PERL") returns undef;
- # Regrettably, HANGUL SYLLABLE PERL does not exist :-)
-
-=back
-
-=head2 EXPORT
-
-By default,
-
- decomposeHangul
- composeHangul
- getHangulName
- parseHangulName
-
-=head1 AUTHOR
-
-SADAHIRO Tomoyuki
-
- bqw10602@nifty.com
- http://homepage1.nifty.com/nomenclator/perl/
-
- Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
-
- This program is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
-
-=head1 SEE ALSO
-
-=over 4
-
-=item http://www.unicode.org/unicode/reports/tr15
-
-Annex 10: Hangul, in Unicode Normalization Forms (UAX #15).
-
-=back
-
-=cut