diff options
Diffstat (limited to 'src/encoding.c')
-rw-r--r-- | src/encoding.c | 145 |
1 file changed, 109 insertions, 36 deletions
diff --git a/src/encoding.c b/src/encoding.c index 6c1567a..c8b3d1a 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,4 +1,12 @@ -/* Copyright (c) 1993-2003 +/* Copyright (c) 2010 + * Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de) + * Sadrul Habib Chowdhury (sadrul@users.sourceforge.net) + * Copyright (c) 2008, 2009 + * Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de) + * Michael Schroeder (mlschroe@immd4.informatik.uni-erlangen.de) + * Micah Cowan (micah@cowan.name) + * Sadrul Habib Chowdhury (sadrul@users.sourceforge.net) + * Copyright (c) 1993-2003 * Juergen Weigert (jnweiger@immd4.informatik.uni-erlangen.de) * Michael Schroeder (mlschroe@immd4.informatik.uni-erlangen.de) * Copyright (c) 1987 Oliver Laumann @@ -542,20 +550,20 @@ int from, to; if (from == to || (from != UTF8 && to != UTF8) || w == 0) return ml; - if (ml->font == null && encodings[from].deffont == 0) + if (ml->font == (unsigned int *)null && encodings[from].deffont == 0) return ml; if (w > maxlen) { for (i = 0; i < 2; i++) { if (rml[i].image == 0) - rml[i].image = malloc(w); + rml[i].image = malloc(w * sizeof(int)); else - rml[i].image = realloc(rml[i].image, w); + rml[i].image = realloc(rml[i].image, w * sizeof(int)); if (rml[i].font == 0) - rml[i].font = malloc(w); + rml[i].font = malloc(w * sizeof(int)); else - rml[i].font = realloc(rml[i].font, w); + rml[i].font = realloc(rml[i].font, w * sizeof(int)); if (rml[i].image == 0 || rml[i].font == 0) { maxlen = 0; @@ -639,29 +647,93 @@ struct combchar { }; struct combchar **combchars; +/** Thank you Ken! 
http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt */ +typedef struct +{ + int cmask; + int cval; + int shift; + long lmask; + long lval; +} Tab; + +static Tab tab[] = +{ + 0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */ + 0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */ + 0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */ + 0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */ + 0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */ + 0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */ + 0, /* end of table */ +}; + +#define FOR_EACH_BYTE_IN_UTF8(c, bytefn, combfn) do \ + { \ + int byte = 0; \ + if (c >= 0xe000) \ + { \ + int _tm = 0; \ + Tab *_t; \ + for (_t = tab; _t->cmask; _t++) \ + { \ + if (c <= _t->lmask) \ + { \ + _tm = _t->shift; \ + byte = _t->cval | (c>>_tm); \ + bytefn \ + while (_tm > 0) \ + { \ + _tm -= 6; \ + byte = 0x80 | ((c>>_tm) & 0x3F); \ + bytefn \ + } \ + break; \ + } \ + } \ + break; \ + } \ + \ + if (c >= 0xd800 && c < 0xe000 && combchars && combchars[c - 0xd800]) \ + { \ + combfn \ + } \ + if (c >= 0x800) \ + { \ + byte = (c & 0xf000) >> 12 | 0xe0; \ + bytefn \ + c = (c & 0x0fff) | 0x1000; \ + } \ + if (c >= 0x80) \ + { \ + byte = (c & 0x1fc0) >> 6 ^ 0xc0; \ + bytefn \ + c = (c & 0x3f) | 0x80; \ + } \ + byte = c; \ + bytefn \ + } while (0) + void AddUtf8(c) int c; { ASSERT(D_encoding == UTF8); - if (c >= 0xd800 && c < 0xe000 && combchars && combchars[c - 0xd800]) + + FOR_EACH_BYTE_IN_UTF8(c, + { + AddChar(byte); + }, { AddUtf8(combchars[c - 0xd800]->c1); c = combchars[c - 0xd800]->c2; } - if (c >= 0x800) - { - AddChar((c & 0xf000) >> 12 | 0xe0); - c = (c & 0x0fff) | 0x1000; - } - if (c >= 0x80) - { - AddChar((c & 0x1fc0) >> 6 ^ 0xc0); - c = (c & 0x3f) | 0x80; - } - AddChar(c); + ); } +#if 0 +/* It feels like a good idea to simply use one ToUtf8, instead of having both + * ToUtf8_comb and ToUtf8. 
*/ int ToUtf8_comb(p, c) char *p; @@ -676,29 +748,27 @@ int c; } return ToUtf8(p, c); } +#endif int ToUtf8(p, c) char *p; int c; { - int l = 1; - if (c >= 0x800) + int l = 0; + FOR_EACH_BYTE_IN_UTF8(c, { if (p) - *p++ = (c & 0xf000) >> 12 | 0xe0; + *p++ = byte; l++; - c = (c & 0x0fff) | 0x1000; - } - if (c >= 0x80) + }, { + l += ToUtf8(p, combchars[c - 0xd800]->c1); + c = combchars[c - 0xd800]->c2; if (p) - *p++ = (c & 0x1fc0) >> 6 ^ 0xc0; - l++; - c = (c & 0x3f) | 0x80; + p += l; } - if (p) - *p++ = c; + ); return l; } @@ -758,8 +828,6 @@ int c, *utf8charp; *utf8charp = utf8char = (c & 0x80000000) ? c : 0; if (utf8char) return -1; - if (c & 0xffff0000) - c = UCS_REPL; /* sorry, only know 16bit Unicode */ if (c >= 0xd800 && (c <= 0xdfff || c == 0xfffe || c == 0xffff)) c = UCS_REPL; /* illegal code */ return c; @@ -803,7 +871,7 @@ int encoding; #else ml = &p->w_mlines[j]; #endif - if (ml->font == null && encodings[p->w_encoding].deffont == 0) + if (ml->font == (unsigned int *)null && encodings[p->w_encoding].deffont == 0) continue; for (i = 0; i < p->w_width; i++) { @@ -812,11 +880,11 @@ int encoding; c |= encodings[p->w_encoding].deffont << 8; if (c < 256) continue; - if (ml->font == null) + if (ml->font == (unsigned int *)null) { - if ((ml->font = (unsigned char *)calloc(p->w_width + 1, 1)) == 0) + if ((ml->font = (unsigned int *)calloc(p->w_width + 1, sizeof(int))) == 0) { - ml->font = null; + ml->font = (unsigned int *)null; break; } } @@ -1350,6 +1418,9 @@ int *fontp; #ifdef UTF8 if (encoding == UTF8) { +#if 0 + /* We didn't use to handle 16+bit unicode correctly. But since now we do (in ToUtf8), + * do we need this? */ if (f) { # ifdef DW_CHARS @@ -1366,6 +1437,7 @@ int *fontp; c = recode_char_to_encoding(c, encoding); } } +#endif return ToUtf8(bp, c); } if ((c & 0xff00) && f == 0) /* is_utf8? 
*/ @@ -1612,7 +1684,8 @@ struct mline *ml; int xs, xe; int encoding; { - unsigned char *f, *i; + unsigned int *f; + unsigned int *i; int c, x, dx; if (encoding == UTF8 || encodings[encoding].deffont == 0) |