1 files changed, 109 insertions, 36 deletions
diff --git a/src/encoding.c b/src/encoding.c
index 6c1567a..c8b3d1a 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -1,4 +1,12 @@
-/* Copyright (c) 1993-2003
+/* Copyright (c) 2010
+ *      Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de)
+ *      Sadrul Habib Chowdhury (sadrul@users.sourceforge.net)
+ * Copyright (c) 2008, 2009
+ *      Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de)
+ *      Michael Schroeder (mlschroe@immd4.informatik.uni-erlangen.de)
+ *      Micah Cowan (micah@cowan.name)
+ *      Sadrul Habib Chowdhury (sadrul@users.sourceforge.net)
+ * Copyright (c) 1993-2003
  *      Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de)
  *      Michael Schroeder (mlschroe@immd4.informatik.uni-erlangen.de)
  * Copyright (c) 1987 Oliver Laumann 
@@ -542,20 +550,20 @@ int from, to;
 
   if (from == to || (from != UTF8 && to != UTF8) || w == 0)
     return ml;
-  if (ml->font == null && encodings[from].deffont == 0)
+  if (ml->font == (unsigned int *)null && encodings[from].deffont == 0)
     return ml;
   if (w > maxlen)
     {
       for (i = 0; i < 2; i++)
 	{
 	  if (rml[i].image == 0)
-	    rml[i].image = malloc(w);
+	    rml[i].image = malloc(w * sizeof(int));
 	  else
-	    rml[i].image = realloc(rml[i].image, w);
+	    rml[i].image = realloc(rml[i].image, w * sizeof(int));
 	  if (rml[i].font == 0)
-	    rml[i].font = malloc(w);
+	    rml[i].font = malloc(w * sizeof(int));
 	  else
-	    rml[i].font = realloc(rml[i].font, w);
+	    rml[i].font = realloc(rml[i].font, w * sizeof(int));
 	  if (rml[i].image == 0 || rml[i].font == 0)
 	    {
 	      maxlen = 0;
@@ -639,29 +647,93 @@ struct combchar {
 };
 struct combchar **combchars;
 
+/** Thank you Ken! http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt */
+typedef struct
+{
+  int	cmask;
+  int	cval;
+  int	shift;
+  long	lmask;
+  long	lval;
+} Tab;
+
+static Tab	tab[] =
+{
+  0x80,	0x00,	0*6,	0x7F,		0,		/* 1 byte sequence */
+  0xE0,	0xC0,	1*6,	0x7FF,		0x80,		/* 2 byte sequence */
+  0xF0,	0xE0,	2*6,	0xFFFF,		0x800,		/* 3 byte sequence */
+  0xF8,	0xF0,	3*6,	0x1FFFFF,	0x10000,	/* 4 byte sequence */
+  0xFC,	0xF8,	4*6,	0x3FFFFFF,	0x200000,	/* 5 byte sequence */
+  0xFE,	0xFC,	5*6,	0x7FFFFFFF,	0x4000000,	/* 6 byte sequence */
+  0,							/* end of table */
+};
+
+#define FOR_EACH_BYTE_IN_UTF8(c, bytefn, combfn) do	\
+  {	\
+    int byte = 0;	\
+    if (c >= 0xe000)	\
+      {	\
+	int _tm = 0;	\
+	Tab *_t;	\
+	for (_t = tab; _t->cmask; _t++)	\
+	  {	\
+	    if (c <= _t->lmask)	\
+	      {	\
+		_tm = _t->shift;	\
+		byte = _t->cval | (c>>_tm);	\
+		bytefn	\
+		while (_tm > 0)	\
+		  {	\
+		    _tm -= 6;	\
+		    byte = 0x80 | ((c>>_tm) & 0x3F);	\
+		    bytefn	\
+		  }	\
+		break;	\
+	      }	\
+	  }	\
+	break;	\
+      }	\
+	\
+    if (c >= 0xd800 && c < 0xe000 && combchars && combchars[c - 0xd800])	\
+      {	\
+	combfn	\
+      }	\
+    if (c >= 0x800)	\
+      {	\
+	byte = (c & 0xf000) >> 12 | 0xe0;	\
+	bytefn	\
+	c = (c & 0x0fff) | 0x1000;	\
+      }	\
+    if (c >= 0x80)	\
+      {	\
+	byte = (c & 0x1fc0) >> 6 ^ 0xc0;	\
+	bytefn	\
+	c = (c & 0x3f) | 0x80;	\
+      }	\
+    byte = c;	\
+    bytefn	\
+  } while (0)
+
 void
 AddUtf8(c)
 int c;
 {
   ASSERT(D_encoding == UTF8);
-  if (c >= 0xd800 && c < 0xe000 && combchars && combchars[c - 0xd800])
+
+  FOR_EACH_BYTE_IN_UTF8(c,
+    {
+      AddChar(byte);
+    },
     {
       AddUtf8(combchars[c - 0xd800]->c1);
       c = combchars[c - 0xd800]->c2;
     }
-  if (c >= 0x800)
-    {
-      AddChar((c & 0xf000) >> 12 | 0xe0);
-      c = (c & 0x0fff) | 0x1000; 
-    }
-  if (c >= 0x80)
-    {
-      AddChar((c & 0x1fc0) >> 6 ^ 0xc0);
-      c = (c & 0x3f) | 0x80; 
-    }
-  AddChar(c);
+  );
 }
 
+#if 0
+/* It feels like a good idea to simply use one ToUtf8, instead of having both
+ * ToUtf8_comb and ToUtf8. */
 int
 ToUtf8_comb(p, c)
 char *p;
@@ -676,29 +748,27 @@ int c;
     }
   return ToUtf8(p, c);
 }
+#endif
 
 int
 ToUtf8(p, c)
 char *p;
 int c;
 {
-  int l = 1;
-  if (c >= 0x800)
+  int l = 0;
+  FOR_EACH_BYTE_IN_UTF8(c,
     {
       if (p)
-	*p++ = (c & 0xf000) >> 12 | 0xe0; 
+	*p++ = byte;
       l++;
-      c = (c & 0x0fff) | 0x1000; 
-    }
-  if (c >= 0x80)
+    },
     {
+      l += ToUtf8(p, combchars[c - 0xd800]->c1);
+      c = combchars[c - 0xd800]->c2;
       if (p)
-	*p++ = (c & 0x1fc0) >> 6 ^ 0xc0; 
-      l++;
-      c = (c & 0x3f) | 0x80; 
+	p += l;
     }
-  if (p)
-    *p++ = c;
+  );
   return l;
 }
 
@@ -758,8 +828,6 @@ int c, *utf8charp;
   *utf8charp = utf8char = (c & 0x80000000) ? c : 0;
   if (utf8char)
     return -1;
-  if (c & 0xffff0000)
-    c = UCS_REPL;	/* sorry, only know 16bit Unicode */
   if (c >= 0xd800 && (c <= 0xdfff || c == 0xfffe || c == 0xffff))
     c = UCS_REPL;	/* illegal code */
   return c;
@@ -803,7 +871,7 @@ int encoding;
 #else
       ml = &p->w_mlines[j];
 #endif
-      if (ml->font == null && encodings[p->w_encoding].deffont == 0)
+      if (ml->font == (unsigned int *)null && encodings[p->w_encoding].deffont == 0)
 	continue;
       for (i = 0; i < p->w_width; i++)
 	{
@@ -812,11 +880,11 @@ int encoding;
 	    c |= encodings[p->w_encoding].deffont << 8;
 	  if (c < 256)
 	    continue;
-	  if (ml->font == null)
+	  if (ml->font == (unsigned int *)null)
 	    {
-	      if ((ml->font = (unsigned char *)calloc(p->w_width + 1, 1)) == 0)
+	      if ((ml->font = (unsigned int *)calloc(p->w_width + 1, sizeof(int))) == 0)
 		{
-		  ml->font = null;
+		  ml->font = (unsigned int *)null;
 		  break;
 		}
 	    }
@@ -1350,6 +1418,9 @@ int *fontp;
 #ifdef UTF8
   if (encoding == UTF8)
     {
+#if 0
+      /* We didn't use to handle 16+bit unicode correctly. But since now we do (in ToUtf8),
+       * do we need this? */
       if (f)
 	{
 # ifdef DW_CHARS
@@ -1366,6 +1437,7 @@ int *fontp;
 	      c = recode_char_to_encoding(c, encoding);
 	    }
         }
+#endif
       return ToUtf8(bp, c);
     }
   if ((c & 0xff00) && f == 0)	/* is_utf8? */
@@ -1612,7 +1684,8 @@ struct mline *ml;
 int xs, xe;
 int encoding;
 {
-  unsigned char *f, *i;
+  unsigned int *f;
+  unsigned int *i;
   int c, x, dx;
 
   if (encoding == UTF8 || encodings[encoding].deffont == 0)