Beginnings of compiled encodings - checked in as a snapshot of thoughts

so far and so it does not get lost. p4raw-id: //depot/perlio@8103
author: Nick Ing-Simmons <nik@tiuk.ti.com> 2000-12-13 23:16:13 +0000
committer: Nick Ing-Simmons <nik@tiuk.ti.com> 2000-12-13 23:16:13 +0000
commit: 017e2addf6da99b3f648d9518de5a848be394ab8 (patch)
tree: a616df4fe9e14299b549f5c1ee0dd5d027258431 /ext/Encode/encengine.c
parent: 511c2ff04fc070a9b9389f53ec595d85ce870c80 (diff)
download: perl-017e2addf6da99b3f648d9518de5a848be394ab8.tar.gz
1 files changed, 134 insertions, 0 deletions
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c
new file mode 100644
index 0000000000..a73be737e2
--- /dev/null
+++ b/ext/Encode/encengine.c
@@ -0,0 +1,134 @@
+/*
+Data structures for encoding transformations.
+
+Perl works internally in either a native 'byte' encoding or
+in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
+representation. When we do we can use utf8_to_uv().
+
+Most character encodings are either simple byte mappings or
+variable length multi-byte encodings. UTF-8 can be viewed as a
+rather extreme case of the latter.
+
+So to solve an important part of perl's encode needs we need to solve the
+"multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
+case. (Where one of the multi-byte encodings will usually be UTF-8.)
+
+The other type of encoding is a shift encoding where a prefix sequence
+determines what subsequent bytes mean. Such encodings have state.
+
+We also need to handle the case where a character in one encoding has to be
+represented as multiple characters in the other, e.g. letter+diacritic.
+
+The process can be considered as pseudo perl:
+
+my $dst = '';
+while (length($src))
+ {
+  my $size    = src_count($src);
+  my $in_seq  = substr($src,0,$size,'');
+  my $out_seq = $s2d_hash{$in_seq};
+  if (defined $out_seq)
+   {
+    $dst .= $out_seq;
+   }
+  else
+   {
+    # an error condition
+   }
+ }
+return $dst;
+
+That has the following components:
+ &src_count - a "rule" for how many bytes make up the next character in the
+              source.
+ %s2d_hash  - a mapping from input sequences to output sequences
+
+The problem with that scheme is that it does not allow the output
+character repertoire to affect the characters considered from the
+input.
+
+So we use a "trie" representation which can also be considered
+a state machine:
+
+my $dst   = '';
+my $seq   = \@s2d_seq;
+my $next  = \@s2d_next;
+while (length($src))
+ {
+  my $byte    = substr($src,0,1,'');
+  my $out_seq = $seq->[$byte];
+  if (defined $out_seq)
+   {
+    $dst .= $out_seq;
+   }
+  else
+   {
+    # an error condition
+   }
+  ($next,$seq) = @{$next->[$byte]} if $next;
+ }
+return $dst;
+
+There is now a pair of data structures to represent everything.
+It is valid for the output sequence at a particular point to
+be defined but zero length; that just means "don't know yet".
+For the single byte case there is no 'next' so new tables will be the same as
+the original tables. For a multi-byte case a prefix byte will flip to the tables
+for the next page (adding nothing to the output), then the tables for the page
+will provide the actual output and set tables back to the original base page.
+
+This scheme can also handle shift encodings.
+
+A slight enhancement to the scheme also allows for look-ahead - if
+we add a flag to re-add the removed byte to the source we could handle
+  a" -> ä
+  ab -> a (and take b back please)
+
+*/
+
+#include <EXTERN.h>
+#include <perl.h>
+#define U8 U8
+#include "encode.h"
+
+/*
+ * translate - push the bytes of src through the encpage_t state machine.
+ *
+ *   enc   first entry of the range table for the starting state
+ *   src   input bytes
+ *   slen  number of input bytes
+ *   dst   output buffer, or NULL to only count the bytes that would be
+ *         produced (nothing is copied, but dlen still limits the count)
+ *   dlen  capacity of dst in bytes
+ *
+ * Returns the number of bytes written to (or counted for) dst.
+ *
+ * For every input byte the current table is scanned for the first entry
+ * whose max is >= the byte.  If the byte is also >= that entry's min, the
+ * entry's output sequence (if any) is emitted and the machine moves to
+ * e->next; otherwise the byte is skipped and the state is left unchanged.
+ *
+ * NOTE(review): the scan has no other stop condition, so every table
+ * presumably ends with an entry whose max covers the largest byte value
+ * (0xFF) - confirm against the table generator.
+ *
+ * NOTE(review): "no room" and "cannot represent" are silently skipped and
+ * processing continues; the signature has no way to report either, so a
+ * caller cannot tell a complete translation from a lossy one.
+ */
+STRLEN
+translate(encpage_t *enc, const U8 *src, STRLEN slen, U8 *dst, STRLEN dlen)
+{
+ const U8 *send = src+slen;
+ /* Output position is kept as an offset rather than a pointer so that
+    no pointer past the end of dst, and no arithmetic on a NULL dst,
+    is ever formed.  Invariant: used <= dlen. */
+ STRLEN used = 0;
+ while (src < send)
+  {
+   encpage_t *e = enc;
+   U8 byte = *src++;
+   /* Locate the range entry that could contain this byte */
+   while (byte > e->max)
+    e++;
+   if (byte >= e->min)
+    {
+     /* e->dlen is the size of one slot in e->seq; zero means this
+        entry produces no output (e.g. a prefix byte) */
+     STRLEN stride = e->dlen;
+     if (stride)
+      {
+       /* Each slot starts with a length byte followed by the data */
+       const U8 *out = e->seq+stride*(byte - e->min);
+       STRLEN len = *out++;
+       if (len <= dlen - used)
+        {
+         if (dst)
+          Copy(out,dst+used,len,U8);
+         used += len;
+        }
+       else
+        {
+         /* No room */
+        }
+      }
+     /* Switch to the tables for the following byte */
+     enc = e->next;
+    }
+   else
+    {
+     /* Cannot represent */
+    }
+  }
+ return used;
+}
+
+
author	Nick Ing-Simmons <nik@tiuk.ti.com>	2000-12-13 23:16:13 +0000
committer	Nick Ing-Simmons <nik@tiuk.ti.com>	2000-12-13 23:16:13 +0000
commit	017e2addf6da99b3f648d9518de5a848be394ab8 (patch)
tree	a616df4fe9e14299b549f5c1ee0dd5d027258431 /ext/Encode/encengine.c
parent	511c2ff04fc070a9b9389f53ec595d85ce870c80 (diff)
download	perl-017e2addf6da99b3f648d9518de5a848be394ab8.tar.gz