summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorNick Ing-Simmons <nik@tiuk.ti.com>2000-12-31 18:11:54 +0000
committerNick Ing-Simmons <nik@tiuk.ti.com>2000-12-31 18:11:54 +0000
commit9b37254de3a0e643859aebea34267588f789f15f (patch)
treea1ef764d2070f5965a05ec1aefbf3b5b2b255d06 /ext
parenteb73becae469f5299394486e1d7e01e96646e994 (diff)
downloadperl-9b37254de3a0e643859aebea34267588f789f15f.tar.gz
Start of support of ICU-style .ucm files:
- teach compile how to read a .ucm file - first guess at how to represent fallbacks in "tries". - use fallbacks if check == 0 - new return code to indicate we used one. p4raw-id: //depot/perlio@8285
Diffstat (limited to 'ext')
-rw-r--r--ext/Encode/Encode.pm9
-rw-r--r--ext/Encode/Encode.xs6
-rwxr-xr-xext/Encode/compile108
-rw-r--r--ext/Encode/encengine.c12
-rw-r--r--ext/Encode/encode.h9
5 files changed, 122 insertions, 22 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index db471cb059..dedb8e9a89 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -342,7 +342,7 @@ sub from_to
# The global hash is declared in XS code
$encoding{Unicode} = bless({},'Encode::Unicode');
-$encoding{iso10646-1} = bless({},'Encode::iso10646_1');
+$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
sub encodings
{
@@ -408,7 +408,8 @@ sub getEncoding
package Encode::Unicode;
-# Dummy package that provides the encode interface
+# Dummy package that provides the encode interface but leaves data
+# as UTF-8 encoded. It is here so that from_to()
sub name { 'Unicode' }
@@ -533,7 +534,9 @@ sub fromUnicode
return $str;
}
-package Encode::iso10646_1;#
+package Encode::iso10646_1;
+# Encoding is 16-bit network order Unicode
+# Used for X font encodings
sub name { 'iso10646-1' }
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index cca1ddcd7b..b61d89bd96 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -340,10 +340,14 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
STRLEN dlen = SvLEN(dst);
int code;
- while ((code = do_encode(dir,s,&slen,d,dlen,&dlen)))
+ while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
{
SvCUR_set(dst,dlen);
SvPOK_on(dst);
+
+ if (code == ENCODE_FALLBACK)
+ break;
+
switch(code)
{
case ENCODE_NOSPACE:
diff --git a/ext/Encode/compile b/ext/Encode/compile
index fbb08cd2e9..21478f84d9 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -4,7 +4,7 @@ use strict;
sub encode_U
{
- # UTF-8 encocde long hand - only covers part of perl's range
+ # UTF-8 encode long hand - only covers part of perl's range
my $uv = shift;
if ($uv < 0x80)
{
@@ -96,10 +96,17 @@ sub cmp_name
foreach my $enc (sort cmp_name @ARGV)
{
- my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/;
+ my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
if (open(E,$enc))
{
- compile_enc(\*E,lc($name),\*C);
+ if ($sfx eq 'enc')
+ {
+ compile_enc(\*E,lc($name),\*C);
+ }
+ else
+ {
+ compile_ucm(\*E,lc($name),\*C);
+ }
}
else
{
@@ -135,6 +142,86 @@ close(C);
close(D);
close(H);
+
+sub compile_ucm
+{
+ my ($fh,$name,$ch) = @_;
+ my $e2u = {};
+ my $u2e = {};
+ my $cs;
+ my %attr;
+ while (<$fh>)
+ {
+ s/#.*$//;
+ last if /^\s*CHARMAP\s*$/i;
+ if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i)
+ {
+ $attr{$1} = $2;
+ }
+ }
+ if (!defined($cs = $attr{'code_set_name'}))
+ {
+ warn "No <code_set_name> in $name\n";
+ }
+ else
+ {
+ $name = lc($cs);
+ }
+ my $erep;
+ my $urep;
+ if (exists $attr{'subchar'})
+ {
+ my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/;
+ $erep = join('',map(hex($_),@byte));
+ }
+ warn "Scanning $cs\n";
+ my $nfb = 0;
+ my $hfb = 0;
+ while (<$fh>)
+ {
+ s/#.*$//;
+ last if /^\s*END\s+CHARMAP\s*$/i;
+ next if /^\s*$/;
+ my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i;
+ my $fb = pop(@byte);
+ if (defined($u))
+ {
+ my $uch = encode_U(hex($u));
+ my $ech = join('',map(hex($_),@byte));
+ if (length($fb))
+ {
+ $fb = substr($fb,1);
+ $hfb++;
+ }
+ else
+ {
+ $nfb++;
+ $fb = '0';
+ }
+ # $fb is fallback flag
+ # 0 - round trip safe
+ # 1 - fallback for unicode -> enc
+ # 2 - skip sub-char mapping
+ # 3 - fallback enc -> unicode
+ enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
+ enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
+ }
+ else
+ {
+ warn $_;
+ }
+
+ }
+ if ($nfb && $hfb)
+ {
+ die "$nfb entries without fallback, $hfb entries with\n";
+ }
+ output($ch,$name.'_utf8',$e2u);
+ output($ch,'utf8_'.$name,$u2e);
+ $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
+ outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)];
+}
+
sub compile_enc
{
my ($fh,$name,$ch) = @_;
@@ -173,8 +260,8 @@ sub compile_enc
if ($val || (!$ch && !$page))
{
my $uch = encode_U($val);
- enter($e2u,$ech,$uch,$e2u);
- enter($u2e,$uch,$ech,$u2e);
+ enter($e2u,$ech,$uch,$e2u,0);
+ enter($u2e,$uch,$ech,$u2e,0);
}
else
{
@@ -193,18 +280,18 @@ sub compile_enc
sub enter
{
- my ($a,$s,$d,$t) = @_;
+ my ($a,$s,$d,$t,$fb) = @_;
$t = $a if @_ < 4;
my $b = substr($s,0,1);
my $e = $a->{$b};
unless ($e)
{ # 0 1 2 3 4 5
- $e = [$b,$b,'',{},length($s),0];
+ $e = [$b,$b,'',{},length($s),0,$fb];
$a->{$b} = $e;
}
if (length($s) > 1)
{
- enter($e->[3],substr($s,1),$d,$t);
+ enter($e->[3],substr($s,1),$d,$t,$fb);
}
else
{
@@ -260,7 +347,8 @@ sub process
ord($b) == ord($a->{$l}[1])+1 &&
$a->{$l}[3] == $a->{$b}[3] &&
$a->{$l}[4] == $a->{$b}[4] &&
- $a->{$l}[5] == $a->{$b}[5]
+ $a->{$l}[5] == $a->{$b}[5] &&
+ $a->{$l}[6] == $a->{$b}[6]
# && length($a->{$l}[2]) < 16
)
{
@@ -316,7 +404,7 @@ sub outtable
print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
foreach my $b (@{$a->{'Entries'}})
{
- my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
+ my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
my $sc = ord($s);
my $ec = ord($e);
print $fh "{";
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c
index f31725094d..4c68dd96f6 100644
--- a/ext/Encode/encengine.c
+++ b/ext/Encode/encengine.c
@@ -92,7 +92,7 @@ we add a flag to re-add the removed byte to the source we could handle
#include "encode.h"
int
-do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout)
+do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx)
{
const U8 *s = src;
const U8 *send = s+*slen;
@@ -106,9 +106,9 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR
U8 byte = *s;
while (byte > e->max)
e++;
- if (byte >= e->min && e->slen)
+ if (byte >= e->min && e->slen && (approx || !e->slen & 0x80))
{
- const U8 *cend = s + e->slen;
+ const U8 *cend = s + (e->slen & 0x7f);
if (cend <= send)
{
STRLEN n;
@@ -136,7 +136,11 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR
enc = e->next;
s++;
if (s == cend)
- last = s;
+ {
+ if (approx && (e->slen & 0x80))
+ code = ENCODE_FALLBACK;
+ last = s;
+ }
}
else
{
diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h
index 604b97f99d..853ad041b4 100644
--- a/ext/Encode/encode.h
+++ b/ext/Encode/encode.h
@@ -28,12 +28,13 @@ struct encode_s
#ifdef U8
extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
- U8 *dst, STRLEN dlen, STRLEN *dout);
+ U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
extern void Encode_DefineEncoding(encode_t *enc);
#endif
-#define ENCODE_NOSPACE 1
-#define ENCODE_PARTIAL 2
-#define ENCODE_NOREP 3
+#define ENCODE_NOSPACE 1
+#define ENCODE_PARTIAL 2
+#define ENCODE_NOREP 3
+#define ENCODE_FALLBACK 4
#endif