diff options
author | Nick Ing-Simmons <nik@tiuk.ti.com> | 2000-12-13 23:16:13 +0000 |
---|---|---|
committer | Nick Ing-Simmons <nik@tiuk.ti.com> | 2000-12-13 23:16:13 +0000 |
commit | 017e2addf6da99b3f648d9518de5a848be394ab8 (patch) | |
tree | a616df4fe9e14299b549f5c1ee0dd5d027258431 /ext/Encode/encengine.c | |
parent | 511c2ff04fc070a9b9389f53ec595d85ce870c80 (diff) | |
download | perl-017e2addf6da99b3f648d9518de5a848be394ab8.tar.gz |
Beginings of compiled encodings - checked in as a snapshot of thoughts
so far and so it does not get lost.
p4raw-id: //depot/perlio@8103
Diffstat (limited to 'ext/Encode/encengine.c')
-rw-r--r-- | ext/Encode/encengine.c | 134 |
1 files changed, 134 insertions, 0 deletions
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c new file mode 100644 index 0000000000..a73be737e2 --- /dev/null +++ b/ext/Encode/encengine.c @@ -0,0 +1,134 @@ +/* +Data structures for encoding transformations. + +Perl works internally in either a native 'byte' encoding or +in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" +representation. When we do we can use utf8_to_uv(). + +Most character encodings are either simple byte mappings or +variable length multi-byte encodings. UTF-8 can be viewed as a +rather extreme case of the latter. + +So to solve an important part of perl's encode needs we need to solve the +"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate +case. (Where one of multi-bytes will usually be UTF-8.) + +The other type of encoding is a shift encoding where a prefix sequence +determines what subsequent bytes mean. Such encodings have state. + +We also need to handle case where a character in one encoding has to be +represented as multiple characters in the other. e.g. letter+diacritic. + +The process can be considered as pseudo perl: + +my $dst = ''; +while (length($src)) + { + my $size = $count($src); + my $in_seq = substr($src,0,$size,''); + my $out_seq = $s2d_hash{$in_seq}; + if (defined $out_seq) + { + $dst .= $out_seq; + } + else + { + # an error condition + } + } +return $dst; + +That has the following components: + &src_count - a "rule" for how many bytes make up the next character in the + source. + %s2d_hash - a mapping from input sequences to output sequences + +The problem with that scheme is that it does not allow the output +character repertoire to affect the characters considered from the +input. + +So we use a "trie" representation which can also be considered +a state machine: + +my $dst = ''; +my $seq = \@s2d_seq; +my $next = \@s2d_next; +while (length($src)) + { + my $byte = $substr($src,0,1,''); + my $out_seq = $seq->[$byte]; + if (defined $out_seq) + { + $dst .= $out_seq; + } + else + { + # an error condition + } + ($next,$seq) = @$next->[$byte] if $next; + } +return $dst; + +There is now a pair of data structures to represent everything. +It is valid for output sequence at a particular point to +be defined but zero length, that just means "don't know yet". +For the single byte case there is no 'next' so new tables will be the same as +the original tables. For a multi-byte case a prefix byte will flip to the tables +for the next page (adding nothing to the output), then the tables for the page +will provide the actual output and set tables back to original base page. + +This scheme can also handle shift encodings. + +A slight enhancement to the scheme also allows for look-ahead - if +we add a flag to re-add the removed byte to the source we could handle + a" -> ä + ab -> a (and take b back please) + +*/ + +#include <EXTERN.h> +#include <perl.h> +#define U8 U8 +#include "encode.h" + +STRLEN +translate(encpage_t *enc, const U8 *src, STRLEN slen, U8 *dst, STRLEN dlen) +{ + const U8 *send = src+slen; + U8 *dend = dst+dlen; + U8 *dptr = dst; + while (src < send) + { + encpage_t *e = enc; + U8 byte = *src++; + while (byte > e->max) + e++; + if (byte >= e->min) + { + STRLEN n = e->dlen; + if (n) + { + const U8 *out = e->seq+n*(byte - e->min); + STRLEN n = *out++; + if (dptr+n <= dend) + { + if (dst) + Copy(out,dptr,n,U8); + dptr += n; + } + else + { + /* No room */ + } + } + enc = e->next; + } + else + { + /* Cannot represent */ + } + } + return dptr-dst; +} + + |