summaryrefslogtreecommitdiff
path: root/src/encoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/encoding.c')
-rw-r--r--src/encoding.c145
1 files changed, 109 insertions, 36 deletions
diff --git a/src/encoding.c b/src/encoding.c
index 6c1567a..c8b3d1a 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -1,4 +1,12 @@
-/* Copyright (c) 1993-2003
+/* Copyright (c) 2010
+ * Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de)
+ * Sadrul Habib Chowdhury (sadrul@users.sourceforge.net)
+ * Copyright (c) 2008, 2009
+ * Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de)
+ * Michael Schroeder (mlschroe@immd4.informatik.uni-erlangen.de)
+ * Micah Cowan (micah@cowan.name)
+ * Sadrul Habib Chowdhury (sadrul@users.sourceforge.net)
+ * Copyright (c) 1993-2003
* Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de)
* Michael Schroeder (mlschroe@immd4.informatik.uni-erlangen.de)
* Copyright (c) 1987 Oliver Laumann
@@ -542,20 +550,20 @@ int from, to;
if (from == to || (from != UTF8 && to != UTF8) || w == 0)
return ml;
- if (ml->font == null && encodings[from].deffont == 0)
+ if (ml->font == (unsigned int *)null && encodings[from].deffont == 0)
return ml;
if (w > maxlen)
{
for (i = 0; i < 2; i++)
{
if (rml[i].image == 0)
- rml[i].image = malloc(w);
+ rml[i].image = malloc(w * sizeof(int));
else
- rml[i].image = realloc(rml[i].image, w);
+ rml[i].image = realloc(rml[i].image, w * sizeof(int));
if (rml[i].font == 0)
- rml[i].font = malloc(w);
+ rml[i].font = malloc(w * sizeof(int));
else
- rml[i].font = realloc(rml[i].font, w);
+ rml[i].font = realloc(rml[i].font, w * sizeof(int));
if (rml[i].image == 0 || rml[i].font == 0)
{
maxlen = 0;
@@ -639,29 +647,93 @@ struct combchar {
};
struct combchar **combchars;
+/** Thank you Ken! http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt
+ *
+ * One row of the UTF-8 encoding table; each row describes the sequences
+ * of one particular byte length.
+ */
+typedef struct
+{
+ int cmask; /* NOTE(review): presumably the lead-byte mask from the text cited above; the encoder here only tests it for non-zero (end-of-table check) */
+ int cval; /* fixed high bits OR'ed into the lead byte */
+ int shift; /* right shift that yields the lead byte's payload bits (6 per continuation byte) */
+ long lmask; /* largest value that fits in a sequence of this length */
+ long lval; /* NOTE(review): presumably the smallest value that needs this length; not referenced by the encoder macro */
+} Tab;
+
+static Tab tab[] = /* rows ordered by sequence length, shortest first; five values per row: cmask, cval, shift, lmask, lval */
+{
+ 0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */
+ 0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */
+ 0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */
+ 0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */
+ 0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */
+ 0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */
+ 0, /* end of table: cmask == 0 stops the scan; the row's other fields are implicitly zero */
+};
+/*
+ * FOR_EACH_BYTE_IN_UTF8(c, bytefn, combfn)
+ *
+ * Expands to one statement that encodes the value c as UTF-8 and runs
+ * the statement block bytefn once for every output byte, in order, with
+ * the byte value available in the local variable `byte'.
+ *
+ * c      - must be a modifiable lvalue: it is evaluated many times and
+ *          is overwritten while the short (1-3 byte) path runs.
+ * bytefn - statement block consuming `byte'.
+ * combfn - statement block run before encoding when c lies in
+ *          0xd800..0xdfff and combchars[c - 0xd800] is set; it may
+ *          assign a new value to c, which is then encoded.
+ *
+ * Values >= 0xe000 are encoded from tab[]; everything else goes through
+ * the hand-written path at the end of the macro.
+ */
+#define FOR_EACH_BYTE_IN_UTF8(c, bytefn, combfn) do \
+ { \
+ int byte = 0; \
+ if (c >= 0xe000) \
+ { \
+ int _tm = 0; \
+ Tab *_t; \
+ for (_t = tab; _t->cmask; _t++) /* scan rows until the zero sentinel */ \
+ { \
+ if (c <= _t->lmask) /* first row whose range can hold c */ \
+ { \
+ _tm = _t->shift; \
+ byte = _t->cval | (c>>_tm); /* lead byte: length prefix plus top payload bits */ \
+ bytefn \
+ while (_tm > 0) \
+ { \
+ _tm -= 6; \
+ byte = 0x80 | ((c>>_tm) & 0x3F); /* continuation byte: 10xxxxxx */ \
+ bytefn \
+ } \
+ break; /* leaves the for loop */ \
+ } \
+ } \
+ break; /* leaves the do { } while (0): the short path below is skipped */ \
+ } \
+ \
+ if (c >= 0xd800 && c < 0xe000 && combchars && combchars[c - 0xd800]) \
+ { \
+ combfn \
+ } \
+ if (c >= 0x800) \
+ { \
+ byte = (c & 0xf000) >> 12 | 0xe0; /* lead byte of a 3 byte sequence */ \
+ bytefn \
+ c = (c & 0x0fff) | 0x1000; /* bit 12 forces the next test and turns the 0xc0 xor below into a 0x80 prefix */ \
+ } \
+ if (c >= 0x80) \
+ { \
+ byte = (c & 0x1fc0) >> 6 ^ 0xc0; /* 2 byte lead (0xc0 | bits) or, with bit 12 set, a continuation byte */ \
+ bytefn \
+ c = (c & 0x3f) | 0x80; /* what remains is the final continuation byte */ \
+ } \
+ byte = c; \
+ bytefn \
+ } while (0)
+
void
AddUtf8(c)
int c;
{
ASSERT(D_encoding == UTF8);
- if (c >= 0xd800 && c < 0xe000 && combchars && combchars[c - 0xd800])
+ /* Emit c as UTF-8: every byte produced by the macro is handed to
+  * AddChar().  When c selects an entry in combchars[], that entry's c1
+  * is emitted first through a recursive call and c is replaced by its
+  * c2, which the macro then encodes. */
+ FOR_EACH_BYTE_IN_UTF8(c,
+ {
+ AddChar(byte);
+ },
{
AddUtf8(combchars[c - 0xd800]->c1);
c = combchars[c - 0xd800]->c2;
}
- if (c >= 0x800)
- {
- AddChar((c & 0xf000) >> 12 | 0xe0);
- c = (c & 0x0fff) | 0x1000;
- }
- if (c >= 0x80)
- {
- AddChar((c & 0x1fc0) >> 6 ^ 0xc0);
- c = (c & 0x3f) | 0x80;
- }
- AddChar(c);
+ );
}
+#if 0
+/* It feels like a good idea to simply use one ToUtf8, instead of having both
+ * ToUtf8_comb and ToUtf8. */
int
ToUtf8_comb(p, c)
char *p;
@@ -676,29 +748,27 @@ int c;
}
return ToUtf8(p, c);
}
+#endif
int
ToUtf8(p, c)
char *p;
int c;
{
- int l = 1;
- if (c >= 0x800)
+ int l = 0; /* bytes produced so far; returned as the UTF-8 length of c */
+ FOR_EACH_BYTE_IN_UTF8(c,
{
if (p)
- *p++ = (c & 0xf000) >> 12 | 0xe0;
+ *p++ = byte; /* store only when a buffer was given; with p == 0 the call just counts */
l++;
- c = (c & 0x0fff) | 0x1000;
- }
- if (c >= 0x80)
+ },
{
+ l += ToUtf8(p, combchars[c - 0xd800]->c1); /* encode the entry's c1 first */
+ c = combchars[c - 0xd800]->c2; /* then let the macro encode its c2 */
if (p)
- *p++ = (c & 0x1fc0) >> 6 ^ 0xc0;
- l++;
- c = (c & 0x3f) | 0x80;
+ p += l; /* l still holds only the bytes written for c1, so this skips past them */
}
- if (p)
- *p++ = c;
+ );
return l;
}
@@ -758,8 +828,6 @@ int c, *utf8charp;
*utf8charp = utf8char = (c & 0x80000000) ? c : 0;
if (utf8char)
return -1;
- if (c & 0xffff0000)
- c = UCS_REPL; /* sorry, only know 16bit Unicode */
if (c >= 0xd800 && (c <= 0xdfff || c == 0xfffe || c == 0xffff))
c = UCS_REPL; /* illegal code */
return c;
@@ -803,7 +871,7 @@ int encoding;
#else
ml = &p->w_mlines[j];
#endif
- if (ml->font == null && encodings[p->w_encoding].deffont == 0)
+ if (ml->font == (unsigned int *)null && encodings[p->w_encoding].deffont == 0)
continue;
for (i = 0; i < p->w_width; i++)
{
@@ -812,11 +880,11 @@ int encoding;
c |= encodings[p->w_encoding].deffont << 8;
if (c < 256)
continue;
- if (ml->font == null)
+ if (ml->font == (unsigned int *)null)
{
- if ((ml->font = (unsigned char *)calloc(p->w_width + 1, 1)) == 0)
+ if ((ml->font = (unsigned int *)calloc(p->w_width + 1, sizeof(int))) == 0)
{
- ml->font = null;
+ ml->font = (unsigned int *)null;
break;
}
}
@@ -1350,6 +1418,9 @@ int *fontp;
#ifdef UTF8
if (encoding == UTF8)
{
+#if 0
+ /* We did not use to handle Unicode values wider than 16 bits correctly. Since we
+ * now do (in ToUtf8), do we still need this? */
if (f)
{
# ifdef DW_CHARS
@@ -1366,6 +1437,7 @@ int *fontp;
c = recode_char_to_encoding(c, encoding);
}
}
+#endif
return ToUtf8(bp, c);
}
if ((c & 0xff00) && f == 0) /* is_utf8? */
@@ -1612,7 +1684,8 @@ struct mline *ml;
int xs, xe;
int encoding;
{
- unsigned char *f, *i;
+ unsigned int *f;
+ unsigned int *i;
int c, x, dx;
if (encoding == UTF8 || encodings[encoding].deffont == 0)