summaryrefslogtreecommitdiff
path: root/ext/Encode/encengine.c
diff options
context:
space:
mode:
authorNick Ing-Simmons <nik@tiuk.ti.com>2000-12-13 23:16:13 +0000
committerNick Ing-Simmons <nik@tiuk.ti.com>2000-12-13 23:16:13 +0000
commit017e2addf6da99b3f648d9518de5a848be394ab8 (patch)
treea616df4fe9e14299b549f5c1ee0dd5d027258431 /ext/Encode/encengine.c
parent511c2ff04fc070a9b9389f53ec595d85ce870c80 (diff)
downloadperl-017e2addf6da99b3f648d9518de5a848be394ab8.tar.gz
Beginings of compiled encodings - checked in as a snapshot of thoughts
so far and so it does not get lost. p4raw-id: //depot/perlio@8103
Diffstat (limited to 'ext/Encode/encengine.c')
-rw-r--r--ext/Encode/encengine.c134
1 files changed, 134 insertions, 0 deletions
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c
new file mode 100644
index 0000000000..a73be737e2
--- /dev/null
+++ b/ext/Encode/encengine.c
@@ -0,0 +1,134 @@
+/*
+Data structures for encoding transformations.
+
+Perl works internally in either a native 'byte' encoding or
+in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
+representation. When we do we can use utf8_to_uv().
+
+Most character encodings are either simple byte mappings or
+variable length multi-byte encodings. UTF-8 can be viewed as a
+rather extreme case of the latter.
+
+So to solve an important part of perl's encode needs we need to solve the
+"multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
+case. (One of the two multi-byte encodings will usually be UTF-8.)
+
+The other type of encoding is a shift encoding where a prefix sequence
+determines what subsequent bytes mean. Such encodings have state.
+
+We also need to handle the case where a character in one encoding has to be
+represented as multiple characters in the other. e.g. letter+diacritic.
+
+The process can be considered as pseudo perl:
+
+my $dst = '';
+while (length($src))
+ {
+ my $size = src_count($src);
+ my $in_seq = substr($src,0,$size,'');
+ my $out_seq = $s2d_hash{$in_seq};
+ if (defined $out_seq)
+ {
+ $dst .= $out_seq;
+ }
+ else
+ {
+ # an error condition
+ }
+ }
+return $dst;
+
+That has the following components:
+ &src_count - a "rule" for how many bytes make up the next character in the
+ source.
+ %s2d_hash - a mapping from input sequences to output sequences
+
+The problem with that scheme is that it does not allow the output
+character repertoire to affect the characters considered from the
+input.
+
+So we use a "trie" representation which can also be considered
+a state machine:
+
+my $dst = '';
+my $seq = \@s2d_seq;
+my $next = \@s2d_next;
+while (length($src))
+ {
+ my $byte = substr($src,0,1,'');
+ my $out_seq = $seq->[$byte];
+ if (defined $out_seq)
+ {
+ $dst .= $out_seq;
+ }
+ else
+ {
+ # an error condition
+ }
+ ($next,$seq) = @$next->[$byte] if $next;
+ }
+return $dst;
+
+There is now a pair of data structures to represent everything.
+It is valid for the output sequence at a particular point to
+be defined but zero length; that just means "don't know yet".
+For the single byte case there is no 'next' so new tables will be the same as
+the original tables. For a multi-byte case a prefix byte will flip to the tables
+for the next page (adding nothing to the output), then the tables for the page
+will provide the actual output and set tables back to original base page.
+
+This scheme can also handle shift encodings.
+
+A slight enhancement to the scheme also allows for look-ahead - if
+we add a flag to re-add the removed byte to the source we could handle
+ a" -> ä
+ ab -> a (and take b back please)
+
+*/
+
+#include <EXTERN.h>
+#include <perl.h>
+#define U8 U8
+#include "encode.h"
+
+STRLEN /* Feed the slen bytes at src, one at a time, through the encpage_t
+ * tables starting at enc, appending each byte's output sequence to dst.
+ *
+ * enc  - table used for the first input byte; it is replaced by e->next
+ *        after every byte that lands inside an entry's [min,max] range.
+ * src  - input bytes; slen is how many of them there are.
+ * dst  - output buffer of dlen bytes. When dst is NULL nothing is copied
+ *        but the write position still advances. NOTE(review): this looks
+ *        like a "measure only" mode - confirm; dst+dlen is then computed
+ *        from a NULL pointer.
+ *
+ * Returns dptr-dst, the number of output bytes accounted for.
+ * Neither "no room" nor "cannot represent" is reported to the caller;
+ * both branches below are empty placeholders.
+ */
+translate(encpage_t *enc, const U8 *src, STRLEN slen, U8 *dst, STRLEN dlen)
+{
+ const U8 *send = src+slen; /* one past the last input byte */
+ U8 *dend = dst+dlen; /* one past the end of the output buffer */
+ U8 *dptr = dst; /* next output position */
+ while (src < send)
+ {
+ encpage_t *e = enc; /* scan from the first entry of the current table */
+ U8 byte = *src++; /* consume one input byte */
+ while (byte > e->max) /* skip entries whose range ends below byte */
+ e++; /* no bound check: presumably the last entry has max 0xFF - TODO confirm */
+ if (byte >= e->min) /* byte lies inside this entry's [min,max] */
+ {
+ STRLEN n = e->dlen; /* used below as the size of one slot in e->seq; 0 means no output here */
+ if (n)
+ {
+ const U8 *out = e->seq+n*(byte - e->min); /* slot belonging to this byte */
+ STRLEN n = *out++; /* shadows the outer n: first slot byte is the output length, data follows */
+ if (dptr+n <= dend) /* does the sequence fit in what is left of dst? */
+ {
+ if (dst) /* copy only when a buffer was supplied */
+ Copy(out,dptr,n,U8);
+ dptr += n; /* advanced even when dst is NULL */
+ }
+ else
+ {
+ /* No room: the sequence is dropped, but the input byte stays
+ * consumed and enc is still advanced below */
+ }
+ }
+ enc = e->next; /* table for the next input byte; followed without a NULL check */
+ }
+ else
+ {
+ /* Cannot represent: byte falls below e->min (a gap between ranges);
+ * it is skipped silently and enc is left unchanged */
+ }
+ }
+ return dptr-dst;
+}
+
+