From c01dbb8d33281a5cdea16525904ff28ca61620fd Mon Sep 17 00:00:00 2001 From: Owen Taylor Date: Fri, 10 Mar 2000 15:58:38 +0000 Subject: Added arabic shaper from Karl Koehler * modules/Makefile.am modules/arabic/*: Added arabic shaper from Karl Koehler --- modules/arabic/Makefile.am | 14 ++ modules/arabic/arabic-x.c | 335 +++++++++++++++++++++++++++++++++++++++++++++ modules/arabic/arabic.c | 335 +++++++++++++++++++++++++++++++++++++++++++++ modules/arabic/arconv.c | 136 ++++++++++++++++++ modules/arabic/arconv.h | 8 ++ 5 files changed, 828 insertions(+) create mode 100644 modules/arabic/Makefile.am create mode 100644 modules/arabic/arabic-x.c create mode 100644 modules/arabic/arabic.c create mode 100644 modules/arabic/arconv.c create mode 100644 modules/arabic/arconv.h (limited to 'modules') diff --git a/modules/arabic/Makefile.am b/modules/arabic/Makefile.am new file mode 100644 index 00000000..550668cb --- /dev/null +++ b/modules/arabic/Makefile.am @@ -0,0 +1,14 @@ +## Process this file with automake to create Makefile.in. + +noinst_LTLIBRARIES = pango-arabic.la + +INCLUDES = -I$(top_srcdir)/libpango/ + +pango_arabic_la_SOURCES = \ + arabic.c \ + arconv.c \ + arconv.h + +pango_arabic_la_LDFLAGS = -rpath $(libdir) -export-dynamic -avoid-version -module + +EXTRA_DIST= diff --git a/modules/arabic/arabic-x.c b/modules/arabic/arabic-x.c new file mode 100644 index 00000000..58875d58 --- /dev/null +++ b/modules/arabic/arabic-x.c @@ -0,0 +1,335 @@ +/* Pango - Arabic module + * arabic module + * + * (C) 2000 K. Koehler + * + */ + +#include +#include +#include "pango.h" +#include "pangox.h" +#include "utils.h" +#include + +#include "arconv.h" + +/* +** I hope thing's work easily ... 
+*/
+
+/* Code-point range handled by both engines declared below. */
+static PangoEngineRange arabic_range[] = {
+  { 0x060B, 0x067F, "*" },
+};
+
+/* Engines exported through script_engine_list(): a language engine
+ * (PANGO_RENDER_TYPE_NONE) and an X shaping engine, both registered
+ * for arabic_range.
+ */
+static PangoEngineInfo script_engines[] = {
+  {
+    "ArabicScriptEngineLang",
+    PANGO_ENGINE_TYPE_LANG,
+    PANGO_RENDER_TYPE_NONE,
+    arabic_range, G_N_ELEMENTS(arabic_range)
+  },
+  {
+    "ArabicScriptEngineX",
+    PANGO_ENGINE_TYPE_SHAPE,
+    PANGO_RENDER_TYPE_X,
+    arabic_range, G_N_ELEMENTS(arabic_range)
+  }
+};
+
+static gint n_script_engines = G_N_ELEMENTS (script_engines);
+
+
+/*
+ * Language script engine
+ */
+
+/* arabic_engine_break:
+ * @text:     UTF-8 text, decoded one character at a time with
+ *            _pango_utf8_iterate()
+ * @len:      number of bytes of @text to examine
+ * @analysis: not used
+ * @attrs:    output array; entry i describes the i-th character.
+ *            NOTE(review): assumed to have room for every character
+ *            in @text -- nothing here checks its size.
+ *
+ * Marks white space, break opportunities and word starts; every
+ * character is a char stop.  Scanning ends at a NUL byte, after @len
+ * bytes, or as soon as _pango_utf8_iterate() returns a false value.
+ *
+ * (Most of the code comes from tamil_engine_break.)
+ */
+static void
+arabic_engine_break (const char    *text,
+                     int            len,
+                     PangoAnalysis *analysis,
+                     PangoLogAttr  *attrs)
+{
+  const char *cur = text;
+  const char *next;
+  gint i = 0;
+  GUChar4 wc;
+
+  /* Test the byte budget before dereferencing, so a buffer that is not
+   * NUL-terminated at text[len] is never read past its end.
+   */
+  while ((cur - text) < len && *cur)
+    {
+      if (!_pango_utf8_iterate (cur, &next, &wc))
+        return;
+      if (cur == next)
+        break;
+      if ((next - text) > len)
+        break;
+      cur = next;
+
+      /* Newline is '\n'; the original compared against the letter 'n',
+       * which turned every Latin 'n' into white space.
+       */
+      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == '\n') ? 1 : 0;
+      attrs[i].is_break = (i > 0 && attrs[i-1].is_white) || attrs[i].is_white;
+      attrs[i].is_char_stop = 1;
+      attrs[i].is_word_stop = (i == 0) || attrs[i-1].is_white;
+
+      i++;
+    }
+}
+
+/* Allocate the language engine handed out for "ArabicScriptEngineLang".
+ * The result is g_new()-allocated; script_engine_unload() in this file
+ * is empty, so nothing here releases it.
+ */
+static PangoEngine *
+arabic_engine_lang_new (void)
+{
+  PangoEngineLang *result;
+
+  result = g_new (PangoEngineLang, 1);
+
+  result->engine.id = "ArabicScriptEngine";
+  result->engine.type = PANGO_ENGINE_TYPE_LANG;
+  /* Size of the engine structure, not of the pointer to it. */
+  result->engine.length = sizeof (*result);
+  result->script_break = arabic_engine_break;
+
+  return (PangoEngine *)result;
+}
+
+/*
+ * X window system script engine portion
+ */
+
+/* Return the first subfont of @font that pango_x_list_subfonts() reports
+ * for charsets[0] (only one charset is passed on), or 0 when there is
+ * none.  Both arrays produced by the lister are freed before returning.
+ */
+static PangoXSubfont
+find_unic_font (PangoFont *font,char* charsets[])
+{
+  /* Start from NULL so the g_free() calls below are harmless even if
+   * the lister leaves the output pointers untouched.
+   */
+  PangoXSubfont *subfonts = NULL;
+  int *subfont_charsets = NULL;
+  int n_subfonts;
+  PangoXSubfont result = 0;
+
+  n_subfonts = pango_x_list_subfonts (font, charsets, 1, &subfonts, &subfont_charsets);
+
+  if (n_subfonts > 0)
+    result = subfonts[0];
+
+  g_free (subfonts);
+  g_free (subfont_charsets);
+
+  return result;
+}
+
+/* Charset asked for when looking up the subfont used for shaping. */
+static char *default_charset[] = {
+  "iso10646-1"
+};
+
+
+/* Store glyph number @glyph of @subfont in slot @i of @glyphs: zero
+ * offsets, log_clusters[i] = @cluster_start, and the width taken from
+ * the logical rectangle reported by pango_font_get_glyph_extents().
+ */
+static void
+set_glyph (PangoGlyphString *glyphs,
+           PangoFont *font, PangoXSubfont subfont,
+           int i, int cluster_start, int glyph)
+{
+  PangoRectangle logical_rect;
+
+  glyphs->glyphs[i].glyph = PANGO_X_MAKE_GLYPH (subfont, glyph);
+
+  glyphs->glyphs[i].geometry.x_offset = 0;
+  glyphs->glyphs[i].geometry.y_offset = 0;
+
+  glyphs->log_clusters[i] = cluster_start;
+
+  pango_font_get_glyph_extents (font, glyphs->glyphs[i].glyph, NULL, &logical_rect);
+  glyphs->glyphs[i].geometry.width = logical_rect.width;
+}
+
+
+/* The following thing is actually critical ...
*/ + +static void +arabic_engine_shape (PangoFont *font, + const char *text, + int length, + PangoAnalysis *analysis, + PangoGlyphString *glyphs) +{ + PangoXSubfont subfont; + + int n_chars, n_glyph; + int i; + const char *p; + const char *next; + GUChar4 *wc; + + g_return_if_fail (font != NULL); + g_return_if_fail (text != NULL); + g_return_if_fail (length >= 0); + g_return_if_fail (analysis != NULL); + + /* We assume we have an unicode-font like 10x20 which containes + ** the needed chars + */ + + n_chars = n_glyph = unicode_strlen (text, length); + + if (!(subfont = find_unic_font (font, default_charset))) + { + PangoGlyph unknown_glyph = pango_x_get_unknown_glyph (font); + + pango_glyph_string_set_size (glyphs, n_chars); + + p = text; + for (i=0; ilevel % 2 == 0) + { + /* We somehow were called on a LTR directional run; fallback as simply as possible */ + + pango_glyph_string_set_size (glyphs, n_chars); + + p = text; + for (i=0; i < n_chars; i++) + { + GUChar4 tmp_char; + + _pango_utf8_iterate (p, &next, &tmp_char); + set_glyph (glyphs, font, subfont, i, p - text, tmp_char); + p = next; + } + + return; + } + + wc = (GUChar4 *)g_malloc(sizeof(GUChar4)*n_chars); + + p = text; + for (i=0; i < n_chars; i++) + { + _pango_utf8_iterate (p, &next, &wc[n_chars - i - 1]); + p = next; + } + + reshape(n_chars,wc); + + /* The following support for ALIF.LAM ligatures is hackish and should + * be moved into arconv.c. (As a rough guess, arconv.c should fill + * in a log_clusters[] type array with character offets, that would + * then convert to the byte offsets needed for the real log_clusters[]) + * + * The ligature detection below is written in terms of the Unicode + * presentation forms, which is almost certainly not the right way + * to do things. 
+   */
+
+#define ALIF_R     0xFE8E
+#define LAM_L      0xFEDF
+#define LAM_M      0xFEE0
+#define ALIF_LAM_R 0xFEFC
+#define ALIF_LAM_N 0xFEFB
+
+  for (i = n_chars - 1; i >= 1; i--)
+    {
+      if (wc[i-1] == ALIF_R && (wc[i] == LAM_M || wc[i] == LAM_L))
+        n_glyph--;
+    }
+
+  pango_glyph_string_set_size (glyphs, n_glyph);
+
+  p = text;
+  for (i = n_chars - 1; i >= 0; i--)
+    {
+      int ligature = 0;
+
+      if (i > 0)
+        {
+          if (wc[i-1] == ALIF_R && wc[i] == LAM_M)
+            {
+              ligature = ALIF_LAM_R;
+            }
+          else if (wc[i-1] == ALIF_R && wc[i] == LAM_L)
+            {
+              ligature = ALIF_LAM_N;
+            }
+        }
+
+      if (ligature != 0)
+        {
+          set_glyph (glyphs, font, subfont, n_glyph - 1, p - text, ligature);
+          p = unicode_next_utf8 (p);
+          i--;
+        }
+      else
+        set_glyph (glyphs, font, subfont, n_glyph - 1, p - text, wc[i]);
+
+      p = unicode_next_utf8 (p);
+      n_glyph--;
+    };
+
+  g_free(wc);
+}
+
+
+/* Build a coverage object that marks U+060B..U+067E as
+ * PANGO_COVERAGE_EXACT.  @font and @lang are not consulted.
+ *
+ * NOTE(review): arabic_range ends at 0x067F while this loop stops at
+ * 0x067E -- confirm which bound is intended.
+ */
+static PangoCoverage *
+arabic_engine_get_coverage (PangoFont  *font,
+                            const char *lang)
+{
+  GUChar4 i;
+  PangoCoverage *result = pango_coverage_new ();
+
+  for (i = 0x60B; i <= 0x67E; i++)
+    pango_coverage_set (result, i, PANGO_COVERAGE_EXACT);
+
+  return result;
+}
+
+/* Allocate the X shaping engine handed out for "ArabicScriptEngineX".
+ * The result is g_new()-allocated; script_engine_unload() in this file
+ * is empty, so nothing here releases it.
+ */
+static PangoEngine *
+arabic_engine_x_new (void)
+{
+  PangoEngineShape *result;
+
+  result = g_new (PangoEngineShape, 1);
+
+  result->engine.id = "ArabicScriptEngine";
+  /* script_engines[] registers this engine as PANGO_ENGINE_TYPE_SHAPE;
+   * the LANG type previously stored here was copied from the language
+   * engine constructor.
+   */
+  result->engine.type = PANGO_ENGINE_TYPE_SHAPE;
+  /* Size of the engine structure, not of the pointer to it. */
+  result->engine.length = sizeof (*result);
+  result->script_shape = arabic_engine_shape;
+  result->get_coverage = arabic_engine_get_coverage;
+
+  return (PangoEngine *)result;
+}
+
+
+/* The following three functions provide the public module API for
+ * Pango
+ */
+
+/* Report the static script_engines[] table and its element count. */
+void
+script_engine_list (PangoEngineInfo **engines, int *n_engines)
+{
+  *engines = script_engines;
+  *n_engines = n_script_engines;
+}
+
+/* Create the engine whose id (as listed in script_engines[]) equals
+ * @id; returns NULL for any other id.
+ */
+PangoEngine *
+script_engine_load (const char *id)
+{
+  if (!strcmp (id, "ArabicScriptEngineLang"))
+    return arabic_engine_lang_new ();
+  else if (!strcmp (id, "ArabicScriptEngineX"))
+    return arabic_engine_x_new ();
+  else
+    return NULL;
+}
+
+void
+script_engine_unload (PangoEngine *engine)
+{
+}
diff --git a/modules/arabic/arabic.c b/modules/arabic/arabic.c
new file mode 100644
index 00000000..58875d58
--- /dev/null
+++ b/modules/arabic/arabic.c
@@ -0,0 +1,335 @@
+/* Pango - Arabic module
+ * arabic module
+ *
+ * (C) 2000 K. Koehler
+ *
+ */
+
+#include
+#include
+#include "pango.h"
+#include "pangox.h"
+#include "utils.h"
+#include
+
+#include "arconv.h"
+
+/*
+** I hope things work easily ...
+*/
+
+/* Code-point range handled by both engines declared below. */
+static PangoEngineRange arabic_range[] = {
+  { 0x060B, 0x067F, "*" },
+};
+
+/* Engines exported through script_engine_list(): a language engine
+ * (PANGO_RENDER_TYPE_NONE) and an X shaping engine, both registered
+ * for arabic_range.
+ */
+static PangoEngineInfo script_engines[] = {
+  {
+    "ArabicScriptEngineLang",
+    PANGO_ENGINE_TYPE_LANG,
+    PANGO_RENDER_TYPE_NONE,
+    arabic_range, G_N_ELEMENTS(arabic_range)
+  },
+  {
+    "ArabicScriptEngineX",
+    PANGO_ENGINE_TYPE_SHAPE,
+    PANGO_RENDER_TYPE_X,
+    arabic_range, G_N_ELEMENTS(arabic_range)
+  }
+};
+
+static gint n_script_engines = G_N_ELEMENTS (script_engines);
+
+
+/*
+ * Language script engine
+ */
+
+/* arabic_engine_break:
+ * @text:     UTF-8 text, decoded one character at a time with
+ *            _pango_utf8_iterate()
+ * @len:      number of bytes of @text to examine
+ * @analysis: not used
+ * @attrs:    output array; entry i describes the i-th character.
+ *            NOTE(review): assumed to have room for every character
+ *            in @text -- nothing here checks its size.
+ *
+ * Marks white space, break opportunities and word starts; every
+ * character is a char stop.  Scanning ends at a NUL byte, after @len
+ * bytes, or as soon as _pango_utf8_iterate() returns a false value.
+ *
+ * (Most of the code comes from tamil_engine_break.)
+ */
+static void
+arabic_engine_break (const char    *text,
+                     int            len,
+                     PangoAnalysis *analysis,
+                     PangoLogAttr  *attrs)
+{
+  const char *cur = text;
+  const char *next;
+  gint i = 0;
+  GUChar4 wc;
+
+  /* Test the byte budget before dereferencing, so a buffer that is not
+   * NUL-terminated at text[len] is never read past its end.
+   */
+  while ((cur - text) < len && *cur)
+    {
+      if (!_pango_utf8_iterate (cur, &next, &wc))
+        return;
+      if (cur == next)
+        break;
+      if ((next - text) > len)
+        break;
+      cur = next;
+
+      /* Newline is '\n'; the original compared against the letter 'n',
+       * which turned every Latin 'n' into white space.
+       */
+      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == '\n') ? 1 : 0;
+      attrs[i].is_break = (i > 0 && attrs[i-1].is_white) || attrs[i].is_white;
+      attrs[i].is_char_stop = 1;
+      attrs[i].is_word_stop = (i == 0) || attrs[i-1].is_white;
+
+      i++;
+    }
+}
+
+/* Allocate the language engine handed out for "ArabicScriptEngineLang".
+ * The result is g_new()-allocated; script_engine_unload() in this file
+ * is empty, so nothing here releases it.
+ */
+static PangoEngine *
+arabic_engine_lang_new (void)
+{
+  PangoEngineLang *result;
+
+  result = g_new (PangoEngineLang, 1);
+
+  result->engine.id = "ArabicScriptEngine";
+  result->engine.type = PANGO_ENGINE_TYPE_LANG;
+  /* Size of the engine structure, not of the pointer to it. */
+  result->engine.length = sizeof (*result);
+  result->script_break = arabic_engine_break;
+
+  return (PangoEngine *)result;
+}
+
+/*
+ * X window system script engine portion
+ */
+
+/* Return the first subfont of @font that pango_x_list_subfonts() reports
+ * for charsets[0] (only one charset is passed on), or 0 when there is
+ * none.  Both arrays produced by the lister are freed before returning.
+ */
+static PangoXSubfont
+find_unic_font (PangoFont *font,char* charsets[])
+{
+  /* Start from NULL so the g_free() calls below are harmless even if
+   * the lister leaves the output pointers untouched.
+   */
+  PangoXSubfont *subfonts = NULL;
+  int *subfont_charsets = NULL;
+  int n_subfonts;
+  PangoXSubfont result = 0;
+
+  n_subfonts = pango_x_list_subfonts (font, charsets, 1, &subfonts, &subfont_charsets);
+
+  if (n_subfonts > 0)
+    result = subfonts[0];
+
+  g_free (subfonts);
+  g_free (subfont_charsets);
+
+  return result;
+}
+
+/* Charset asked for when looking up the subfont used for shaping. */
+static char *default_charset[] = {
+  "iso10646-1"
+};
+
+
+/* Store glyph number @glyph of @subfont in slot @i of @glyphs: zero
+ * offsets, log_clusters[i] = @cluster_start, and the width taken from
+ * the logical rectangle reported by pango_font_get_glyph_extents().
+ */
+static void
+set_glyph (PangoGlyphString *glyphs,
+           PangoFont *font, PangoXSubfont subfont,
+           int i, int cluster_start, int glyph)
+{
+  PangoRectangle logical_rect;
+
+  glyphs->glyphs[i].glyph = PANGO_X_MAKE_GLYPH (subfont, glyph);
+
+  glyphs->glyphs[i].geometry.x_offset = 0;
+  glyphs->glyphs[i].geometry.y_offset = 0;
+
+  glyphs->log_clusters[i] = cluster_start;
+
+  pango_font_get_glyph_extents (font, glyphs->glyphs[i].glyph, NULL, &logical_rect);
+  glyphs->glyphs[i].geometry.width = logical_rect.width;
+}
+
+
+/* The following thing is actually critical ...
*/ + +static void +arabic_engine_shape (PangoFont *font, + const char *text, + int length, + PangoAnalysis *analysis, + PangoGlyphString *glyphs) +{ + PangoXSubfont subfont; + + int n_chars, n_glyph; + int i; + const char *p; + const char *next; + GUChar4 *wc; + + g_return_if_fail (font != NULL); + g_return_if_fail (text != NULL); + g_return_if_fail (length >= 0); + g_return_if_fail (analysis != NULL); + + /* We assume we have an unicode-font like 10x20 which containes + ** the needed chars + */ + + n_chars = n_glyph = unicode_strlen (text, length); + + if (!(subfont = find_unic_font (font, default_charset))) + { + PangoGlyph unknown_glyph = pango_x_get_unknown_glyph (font); + + pango_glyph_string_set_size (glyphs, n_chars); + + p = text; + for (i=0; ilevel % 2 == 0) + { + /* We somehow were called on a LTR directional run; fallback as simply as possible */ + + pango_glyph_string_set_size (glyphs, n_chars); + + p = text; + for (i=0; i < n_chars; i++) + { + GUChar4 tmp_char; + + _pango_utf8_iterate (p, &next, &tmp_char); + set_glyph (glyphs, font, subfont, i, p - text, tmp_char); + p = next; + } + + return; + } + + wc = (GUChar4 *)g_malloc(sizeof(GUChar4)*n_chars); + + p = text; + for (i=0; i < n_chars; i++) + { + _pango_utf8_iterate (p, &next, &wc[n_chars - i - 1]); + p = next; + } + + reshape(n_chars,wc); + + /* The following support for ALIF.LAM ligatures is hackish and should + * be moved into arconv.c. (As a rough guess, arconv.c should fill + * in a log_clusters[] type array with character offets, that would + * then convert to the byte offsets needed for the real log_clusters[]) + * + * The ligature detection below is written in terms of the Unicode + * presentation forms, which is almost certainly not the right way + * to do things. 
+   */
+
+#define ALIF_R     0xFE8E
+#define LAM_L      0xFEDF
+#define LAM_M      0xFEE0
+#define ALIF_LAM_R 0xFEFC
+#define ALIF_LAM_N 0xFEFB
+
+  for (i = n_chars - 1; i >= 1; i--)
+    {
+      if (wc[i-1] == ALIF_R && (wc[i] == LAM_M || wc[i] == LAM_L))
+        n_glyph--;
+    }
+
+  pango_glyph_string_set_size (glyphs, n_glyph);
+
+  p = text;
+  for (i = n_chars - 1; i >= 0; i--)
+    {
+      int ligature = 0;
+
+      if (i > 0)
+        {
+          if (wc[i-1] == ALIF_R && wc[i] == LAM_M)
+            {
+              ligature = ALIF_LAM_R;
+            }
+          else if (wc[i-1] == ALIF_R && wc[i] == LAM_L)
+            {
+              ligature = ALIF_LAM_N;
+            }
+        }
+
+      if (ligature != 0)
+        {
+          set_glyph (glyphs, font, subfont, n_glyph - 1, p - text, ligature);
+          p = unicode_next_utf8 (p);
+          i--;
+        }
+      else
+        set_glyph (glyphs, font, subfont, n_glyph - 1, p - text, wc[i]);
+
+      p = unicode_next_utf8 (p);
+      n_glyph--;
+    };
+
+  g_free(wc);
+}
+
+
+/* Build a coverage object that marks U+060B..U+067E as
+ * PANGO_COVERAGE_EXACT.  @font and @lang are not consulted.
+ *
+ * NOTE(review): arabic_range ends at 0x067F while this loop stops at
+ * 0x067E -- confirm which bound is intended.
+ */
+static PangoCoverage *
+arabic_engine_get_coverage (PangoFont  *font,
+                            const char *lang)
+{
+  GUChar4 i;
+  PangoCoverage *result = pango_coverage_new ();
+
+  for (i = 0x60B; i <= 0x67E; i++)
+    pango_coverage_set (result, i, PANGO_COVERAGE_EXACT);
+
+  return result;
+}
+
+/* Allocate the X shaping engine handed out for "ArabicScriptEngineX".
+ * The result is g_new()-allocated; script_engine_unload() in this file
+ * is empty, so nothing here releases it.
+ */
+static PangoEngine *
+arabic_engine_x_new (void)
+{
+  PangoEngineShape *result;
+
+  result = g_new (PangoEngineShape, 1);
+
+  result->engine.id = "ArabicScriptEngine";
+  /* script_engines[] registers this engine as PANGO_ENGINE_TYPE_SHAPE;
+   * the LANG type previously stored here was copied from the language
+   * engine constructor.
+   */
+  result->engine.type = PANGO_ENGINE_TYPE_SHAPE;
+  /* Size of the engine structure, not of the pointer to it. */
+  result->engine.length = sizeof (*result);
+  result->script_shape = arabic_engine_shape;
+  result->get_coverage = arabic_engine_get_coverage;
+
+  return (PangoEngine *)result;
+}
+
+
+/* The following three functions provide the public module API for
+ * Pango
+ */
+
+/* Report the static script_engines[] table and its element count. */
+void
+script_engine_list (PangoEngineInfo **engines, int *n_engines)
+{
+  *engines = script_engines;
+  *n_engines = n_script_engines;
+}
+
+/* Create the engine whose id (as listed in script_engines[]) equals
+ * @id; returns NULL for any other id.
+ */
+PangoEngine *
+script_engine_load (const char *id)
+{
+  if (!strcmp (id, "ArabicScriptEngineLang"))
+    return arabic_engine_lang_new ();
+  else if (!strcmp (id, "ArabicScriptEngineX"))
+    return arabic_engine_x_new ();
+  else
+    return NULL;
+}
+
+void
+script_engine_unload (PangoEngine *engine)
+{
+}
diff --git a/modules/arabic/arconv.c b/modules/arabic/arconv.c
new file mode 100644
index 00000000..aa612353
--- /dev/null
+++ b/modules/arabic/arconv.c
@@ -0,0 +1,136 @@
+
+#include "arconv.h"
+
+/* This belongs to my arabic-shaping-module */
+
+/* One row per Arabic base letter:
+ *   basechar  - the base (unshaped) code point
+ *   charstart - first of its consecutive presentation-form code points
+ *   count     - how many presentation forms it has (2 or 4)
+ */
+typedef struct {
+  GUChar4 basechar;
+  GUChar4 charstart;
+  int count;
+} shapestruct;
+
+/* The Unicode order is always: Standalone End Beginning Middle */
+
+/* Sorted by basechar and by charstart.  There is no row for 0x63B..0x640,
+ * and the last row only marks where the LAM-ALIF ligatures start.
+ */
+static const shapestruct chartable [] =
+{
+  {0x622, 0xFE81,2}, /* ALIF MADDA */
+  {0x623, 0xFE83,2}, /* ALIF HAMZA */
+  {0x624, 0xFE85,2}, /* WAW HAMZA */
+  {0x625, 0xFE87,2}, /* ALIF IHAMZA */
+  {0x626, 0xFE89,4}, /* YA HAMZA */
+  {0x627, 0xFE8D,2}, /* ALIF */
+  {0x628, 0xFE8F,4}, /* BA */
+  {0x629, 0xFE93,2}, /* TA MARBUTA */
+  {0x62A, 0xFE95,4}, /* TA */
+  {0x62B, 0xFE99,4}, /* THA */
+  {0x62C, 0xFE9D,4}, /* DJIM */
+  {0x62D, 0xFEA1,4}, /* HA */
+  {0x62E, 0xFEA5,4}, /* CHA */
+  {0x62F, 0xFEA9,2}, /* DAL */
+  {0x630, 0xFEAB,2}, /* THAL */
+  {0x631, 0xFEAD,2}, /* RA */
+  {0x632, 0xFEAF,2}, /* ZAY */
+  {0x633, 0xFEB1,4}, /* SIN */
+  {0x634, 0xFEB5,4}, /* SHIN */
+  {0x635, 0xFEB9,4}, /* SAAD */
+  {0x636, 0xFEBD,4}, /* DAAD */
+  {0x637, 0xFEC1,4}, /* .TA */
+  {0x638, 0xFEC5,4}, /* .ZA */
+  {0x639, 0xFEC9,4}, /* AIN */
+  {0x63A, 0xFECD,4}, /* RAIN */
+  {0x641, 0xFED1,4}, /* FA */
+  {0x642, 0xFED5,4}, /* QAF */
+  {0x643, 0xFED9,4}, /* KAF */
+  {0x644, 0xFEDD,4}, /* LAM */
+  {0x645, 0xFEE1,4}, /* MIM */
+  {0x646, 0xFEE5,4}, /* NUN */
+  {0x647, 0xFEE9,4}, /* HA */
+  {0x648, 0xFEED,2}, /* WAW */
+  {0x649, 0xFEEF,2}, /* ALIF MAQSORA */
+  {0x64A, 0xFEF1,4}, /* YA */
+  {0x66D, 0xFEF5,2}  /* star,LAM-ALIF */
+};
+
+#define N_CHARTABLE ((int) (sizeof (chartable) / sizeof (chartable[0])))
+
+/* Map a presentation form in 0xFE81..0xFEF4 back to its base letter;
+ * any other code point is returned unchanged.
+ *
+ * The LAM-ALIF ligatures (0xFEF5 and up) are deliberately left alone:
+ * one ligature stands for two letters, so it cannot be unshaped in
+ * place, and the original lookup ran off the end of chartable[] for
+ * them.
+ */
+static GUChar4 unshape(GUChar4 s)
+{
+  int j = 0;
+
+  if (s < 0xFE81 || s > 0xFEF4)
+    return s;
+
+  /* Find the last row whose first presentation form is <= s. */
+  while (j + 1 < N_CHARTABLE && chartable[j+1].charstart <= s)
+    j++;
+  return chartable[j].basechar;
+}
+
+/* Return presentation form number `which` of base letter s.
+ * which: 0=alone 1=end 2=start 3=middle.
+ * Code points without a row in chartable[] are returned unchanged.
+ */
+static GUChar4 charshape(GUChar4 s,short which)
+{
+  int j = 0;
+
+  if (s < 0x622 || s > 0x64A)
+    return s;
+
+  /* Terminates: s <= 0x64A and chartable[] has a row for 0x64A. */
+  while (chartable[j].basechar < s)
+    j++;
+
+  /* 0x63B..0x640 (TATWEEL among them) have no row; without this check
+   * they were drawn with the forms of the next row, FA.
+   */
+  if (chartable[j].basechar != s)
+    return s;
+
+  return chartable[j].charstart + which;
+}
+
+
+/* Number of presentation forms of s: the `count` of its row for
+ * 0x622..0x64A, otherwise 1.
+ *
+ * A code point in the 0x63B..0x640 gap gets the count of the next row
+ * (0x641, i.e. 4), so shape() lets its neighbours join to it on both
+ * sides; charshape() still leaves the code point itself unchanged.
+ */
+static short shapecount(GUChar4 s)
+{
+  int j = 0;
+
+  if (s < 0x622 || s > 0x64A)
+    return 1;
+
+  while (chartable[j].basechar < s)
+    j++;
+  return chartable[j].count;
+}
+
+/* Replace every base letter in string[0..len-1] by the presentation
+ * form that fits its neighbours.
+ *
+ * The string must be in visual order already: the walk starts at
+ * string[len-1] and treats string[j-1] as the character following
+ * string[j].  Nothing is done for len <= 0.
+ */
+void shape(int len,GUChar4* string)
+{
+  int j;
+  short nc;          /* #shapes of next char */
+  short sc;          /* #shapes of current char */
+  int prevstate = 0; /* form chosen for the previous char; bit 1 set
+                      * (start or middle) means it joins onwards */
+  int which;
+
+  /* An empty string has no string[len-1] to look at. */
+  if (len <= 0)
+    return;
+
+  sc = shapecount(string[len-1]);
+  for ( j = len-1; j >= 0; j-- ){
+    if (j > 0){
+      nc = shapecount(string[j-1]);
+    } else {
+      nc = 1;
+    }
+
+    if ( prevstate & 2 ){  /* use an end- or middle-form */
+      if (nc == 1){
+        which = 1;         /* end */
+      } else {
+        which = 3;         /* middle */
+      }
+    } else {               /* use a stand-alone or beginning-form */
+      if (nc == 1){
+        which = 0;         /* basic */
+      } else {
+        which = 2;         /* beginning */
+      }
+    }
+    which = which % sc;    /* middle->end,beginning->basic */
+    string[j] = charshape(string[j],which);
+    prevstate = which;
+    sc = nc;               /* no need to recalculate */
+  }
+}
+
+/* Normalise string[0..len-1] to base letters with unshape(), then
+ * shape() it again in place.
+ */
+void reshape(int len,GUChar4* string)
+{
+  int i;
+
+  for ( i = 0; i < len; i++){
+    string[i] = unshape(string[i]);
+  }
+  shape(len,string);
+}
+
+/* Lam-Alif begins at F5,F7,F9,FB ( MADDA,HAMZA,IHAMZA, . respectively ) */
+
diff --git a/modules/arabic/arconv.h b/modules/arabic/arconv.h
new file mode 100644
index 00000000..bc3d3300
--- /dev/null
+++ b/modules/arabic/arconv.h
@@ -0,0 +1,8 @@
+
+#ifndef __arconv_h_
+#define __arconv_h_
+#include "../../libpango/utils.h"
+
+void reshape(int len,GUChar4* string);
+
+#endif
-- 
cgit v1.2.1