summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBehdad Esfahbod <behdad@gnome.org>2008-04-24 20:00:41 +0000
committerBehdad Esfahbod <behdad@src.gnome.org>2008-04-24 20:00:41 +0000
commitf6b1fef713a035e4abcbe0d0fda54721a3560d5e (patch)
tree4e2590bc28e59fc971cc481de2920db32ee536ff
parent0c950b1d5c2a51ec8d238ae1399d1f71b132fa6e (diff)
downloadpango-f6b1fef713a035e4abcbe0d0fda54721a3560d5e.tar.gz
Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 Patch
2008-04-24 Behdad Esfahbod <behdad@gnome.org> Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 Patch from Noah Levitt * tests/Makefile.am: * tests/runtests.sh.in: * tests/testboundaries_ucd.c (count_attrs), (parse_line), (attrs_equal), (make_test_string), (do_test), (main): Add test driver for UAX#14 and UAX#29 test data from Unicode Character Database. Just drop the following four files in pango/tests for it to use them: GraphemeBreakTest.txt LineBreakTest.txt SentenceBreakTest.txt WordBreakTest.txt svn path=/trunk/; revision=2617
-rw-r--r--ChangeLog18
-rw-r--r--tests/Makefile.am8
-rwxr-xr-xtests/runtests.sh.in2
-rw-r--r--tests/testboundaries_ucd.c364
4 files changed, 389 insertions, 3 deletions
diff --git a/ChangeLog b/ChangeLog
index dc9a8b82..6c912546 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,24 @@
2008-04-24 Behdad Esfahbod <behdad@gnome.org>
Part of Bug 97545 – Make pango_default_break follow Unicode TR #29
+ Patch from Noah Levitt
+
+ * tests/Makefile.am:
+ * tests/runtests.sh.in:
+ * tests/testboundaries_ucd.c (count_attrs), (parse_line),
+ (attrs_equal), (make_test_string), (do_test), (main):
+ Add test driver for UAX#14 and UAX#29 test data from Unicode Character
+ Database. Just drop the following four files in pango/tests for it to
+ use them:
+
+ GraphemeBreakTest.txt
+ LineBreakTest.txt
+ SentenceBreakTest.txt
+ WordBreakTest.txt
+
+2008-04-24 Behdad Esfahbod <behdad@gnome.org>
+
+ Part of Bug 97545 – Make pango_default_break follow Unicode TR #29
* pango/break.c (pango_default_break): Make Grapheme Boundary code
exactly follow UAX#29 of Unicode 5.1.0
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f1b24a42..40c86780 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -3,7 +3,10 @@
EXTRA_DIST = \
all-unicode.txt \
boundaries.utf8 \
- runtests.sh
+ runtests.sh \
+ GraphemeClusterBreakTest.txt \
+ SentenceBreakTest.txt \
+ WordBreakTest.txt
CLEANFILES = pangorc
DISTCLEANFILES = all-unicode.txt runtests.log
@@ -43,7 +46,7 @@ TESTS_ENVIRONMENT = \
noinst_PROGRAMS = gen-all-unicode dump-boundaries
-check_PROGRAMS = testboundaries testcolor testscript
+check_PROGRAMS = testboundaries testboundaries_ucd testcolor testscript
if HAVE_CAIRO
check_PROGRAMS += testiter
@@ -54,6 +57,7 @@ endif
gen_all_unicode_LDADD = $(GLIB_LIBS)
testboundaries_LDADD = ../pango/libpango-$(PANGO_API_VERSION).la
+testboundaries_ucd_LDADD = ../pango/libpango-$(PANGO_API_VERSION).la
testcolor_LDADD = ../pango/libpango-$(PANGO_API_VERSION).la
testiter_LDADD = ../pango/libpango-$(PANGO_API_VERSION).la ../pango/libpangocairo-$(PANGO_API_VERSION).la
testscript_LDADD = ../pango/libpango-$(PANGO_API_VERSION).la
diff --git a/tests/runtests.sh.in b/tests/runtests.sh.in
index 0c3466e2..a6d60007 100755
--- a/tests/runtests.sh.in
+++ b/tests/runtests.sh.in
@@ -1,7 +1,7 @@
#! @SHELL@
LOGFILE=runtests.log
-POTENTIAL_TESTS='testboundaries testcolor'
+POTENTIAL_TESTS='testboundaries testcolor testboundaries_ucd'
ECHO_C='@ECHO_C@'
ECHO_N='@ECHO_N@'
diff --git a/tests/testboundaries_ucd.c b/tests/testboundaries_ucd.c
new file mode 100644
index 00000000..7e266f88
--- /dev/null
+++ b/tests/testboundaries_ucd.c
@@ -0,0 +1,364 @@
+/* Pango
+ * testboundaries_ucd.c: Test text boundary algorithms with test data from
+ * Unicode Character Database.
+ *
+ * Copyright (C) 2003 Noah Levitt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <pango/pango.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+
+static gboolean failed = FALSE;
+
+/* Overlays a PangoLogAttr with a plain guint so that a set of attribute
+ * flags can be built, masked and compared with integer bit operations.
+ * Users of this union zero .bits before storing into .attr.
+ *
+ * PangoLogAttr has to be the same size as guint or this hack breaks */
+typedef union
+{
+ PangoLogAttr attr;
+ guint bits;
+}
+AttrBits;
+
+/* Returns how many boundary markers -- MULTIPLICATION SIGN (U+00D7) or
+ * DIVISION SIGN (U+00F7) -- occur in @line before the first NUMBER SIGN
+ * ('#') or the terminating null character, whichever comes first. */
+static gint
+count_attrs (gchar *line)
+{
+  const gchar *cursor;
+  gint n_markers = 0;
+
+  for (cursor = line; ; cursor = g_utf8_next_char (cursor))
+    {
+      gunichar c = g_utf8_get_char (cursor);
+
+      /* null char or NUMBER SIGN ends the data part of the line */
+      if (c == 0x0000 || c == 0x0023)
+        return n_markers;
+
+      /* MULTIPLICATION SIGN or DIVISION SIGN */
+      if (c == 0x00d7 || c == 0x00f7)
+        n_markers++;
+    }
+  /* not reached */
+}
+
+/* Parses one line of a Unicode break-test data file.
+ *
+ * The data part of a line (everything before a '#' or the end of the
+ * string) is a sequence of boundary markers separated by code points
+ * written as 4 to 6 hexadecimal digits, e.g. "÷ 0020 × 0308 ÷".
+ * DIVISION SIGN (U+00F7) means "boundary here", MULTIPLICATION SIGN
+ * (U+00D7) means "no boundary here".
+ *
+ * line:        the line to parse; not modified.
+ * bits:        the attribute bits to set in an entry for every DIVISION SIGN.
+ * str_return:  receives a newly allocated UTF-8 string built from the
+ *              code points; free with g_free().
+ * attr_return: receives a newly allocated array with one entry per
+ *              marker; free with g_free().
+ * num_attrs:   receives the number of markers, as computed by count_attrs().
+ *
+ * Returns TRUE on success.  On a malformed line returns FALSE; in that
+ * case nothing is left allocated, *str_return and *attr_return are set
+ * to NULL and *num_attrs to 0.
+ */
+static gboolean
+parse_line (gchar *line,
+            AttrBits bits,
+            gchar **str_return,
+            PangoLogAttr **attr_return,
+            gint *num_attrs)
+{
+  GString *gs;
+  gunichar ch, character;
+  gchar *p, *q;
+  gint i;
+  AttrBits temp_attr;
+
+  *num_attrs = count_attrs (line);
+  *attr_return = g_new (PangoLogAttr, *num_attrs);
+
+  p = line;
+  i = 0;
+  gs = g_string_new (NULL);
+
+  for (;;)
+    {
+      temp_attr.bits = 0;
+
+      /* skip white space */
+      do
+        {
+          ch = g_utf8_get_char (p);
+          p = g_utf8_next_char (p);
+        }
+      while (g_unichar_isspace (ch));
+
+      switch (ch)
+        {
+        case 0x00f7: /* DIVISION SIGN: boundary here */
+          temp_attr.bits |= bits.bits;
+          break;
+
+        case 0x00d7: /* MULTIPLICATION SIGN: no boundary here */
+          break;
+
+        case 0x0000: /* end of line */
+        case 0x0023: /* NUMBER SIGN: start of the trailing comment */
+          goto finished;
+
+        default: /* unexpected character */
+          goto fail;
+        }
+
+      (*attr_return)[i] = temp_attr.attr;
+
+      /* skip white space */
+      do
+        {
+          ch = g_utf8_get_char (p);
+          p = g_utf8_next_char (p);
+        }
+      while (g_unichar_isspace (ch));
+      /* the loop stepped past the first non-space character; go back to it */
+      p = g_utf8_prev_char (p);
+
+      if (ch == 0x0023 || ch == 0x0000)
+        goto finished;
+
+      /* a code point: 4 to 6 hex digits, at most U+10FFFF */
+      character = strtoul (p, &q, 16);
+      if (q < p + 4 || q > p + 6 || character > 0x10ffff)
+        goto fail;
+
+      p = q;
+
+      gs = g_string_append_unichar (gs, character);
+
+      i++;
+    }
+
+finished:
+  *str_return = g_string_free (gs, FALSE);
+  return TRUE;
+
+fail:
+  /* release everything built so far so that a failed parse leaks
+   * nothing and leaves no dangling pointers behind */
+  g_string_free (gs, TRUE);
+  g_free (*attr_return);
+  *attr_return = NULL;
+  *str_return = NULL;
+  *num_attrs = 0;
+  return FALSE;
+}
+
+/* Compares the first @len entries of @attrs1 and @attrs2 with respect to
+ * the attribute mask @bits.  Two entries agree when both or neither have
+ * at least one of the masked bits set.  Returns TRUE if all entries
+ * agree, FALSE at the first disagreement. */
+static gboolean
+attrs_equal (PangoLogAttr *attrs1,
+             PangoLogAttr *attrs2,
+             gint len,
+             AttrBits bits)
+{
+  gint idx = 0;
+
+  while (idx < len)
+    {
+      AttrBits lhs, rhs;
+      gboolean lhs_set, rhs_set;
+
+      lhs.bits = 0;
+      lhs.attr = attrs1[idx];
+
+      rhs.bits = 0;
+      rhs.attr = attrs2[idx];
+
+      /* a plain equality test on the masked values would be wrong: the
+       * mask may cover several bits and any one of them being set counts
+       * as "set" (see word boundaries) */
+      lhs_set = (lhs.bits & bits.bits) != 0;
+      rhs_set = (rhs.bits & bits.bits) != 0;
+
+      if (lhs_set != rhs_set)
+        return FALSE;
+
+      idx++;
+    }
+
+  return TRUE;
+}
+
+/* Renders @string and its log attributes in the notation of the Unicode
+ * test files: a DIVISION SIGN (masked attribute set) or MULTIPLICATION
+ * SIGN (not set) and a space before every character and once more after
+ * the last one, with each character printed as "%04X ".  @attrs is read
+ * for one entry more than @string has characters.  The result is newly
+ * allocated; free with g_free(). */
+static gchar *
+make_test_string (gchar *string,
+                  PangoLogAttr *attrs,
+                  AttrBits bits)
+{
+  GString *out = g_string_new (NULL);
+  const gchar *cursor = string;
+  gint idx;
+
+  for (idx = 0; ; idx++)
+    {
+      AttrBits cur;
+
+      cur.bits = 0;
+      cur.attr = attrs[idx];
+
+      g_string_append_unichar (out, (cur.bits & bits.bits) ? 0x00f7 : 0x00d7);
+      g_string_append_c (out, ' ');
+
+      /* the marker after the last character has been written: done */
+      if (*cursor == '\0')
+        break;
+
+      g_string_append_printf (out, "%04X ", g_utf8_get_char (cursor));
+      cursor = g_utf8_next_char (cursor);
+    }
+
+  return g_string_free (out, FALSE);
+}
+
+/* Runs every test line of one Unicode break-test data file.
+ *
+ * Each line is parsed with parse_line(), the resulting string is passed
+ * to pango_get_log_attrs() and the returned attributes are compared with
+ * the expected ones under the mask @bits.  A mismatch is reported on
+ * stderr and sets the file-level "failed" flag.
+ *
+ * filename: path of the data file.  Ownership is taken: it is freed with
+ *           g_free() before returning.
+ * bits:     attribute bits that represent a boundary for this file.
+ * fixup_broken_linebreaktest: if TRUE, the expected attributes of the
+ *           first position of every line are cleared before comparing.
+ *
+ * A missing file is not an error: a message is printed and the test is
+ * skipped.  Any other open/read error or an unparsable line terminates
+ * the program with exit status 1.
+ */
+static void
+do_test (gchar *filename,
+         AttrBits bits,
+         gboolean fixup_broken_linebreaktest)
+{
+  GIOChannel *channel;
+  GIOStatus status;
+  gchar *line = NULL;
+  gsize length, terminator_pos;
+  GError *error;
+  gchar *string;
+  PangoLogAttr *expected_attrs;
+  gint num_attrs;
+  gint i;
+
+  error = NULL;
+  channel = g_io_channel_new_file (filename, "r", &error);
+  if (!channel)
+    {
+      if (g_error_matches (error, G_FILE_ERROR, G_FILE_ERROR_NOENT))
+        {
+          g_print ("%s not found. Skipping test.\n", filename);
+          goto done;
+        }
+      else
+        {
+          g_printerr ("%s: %s\n", filename, error->message);
+          exit (1);
+        }
+    }
+
+  i = 1;
+  for (;;)
+    {
+      error = NULL;
+      status = g_io_channel_read_line (channel, &line, &length, &terminator_pos, &error);
+
+      switch (status)
+        {
+        case G_IO_STATUS_ERROR:
+          g_printerr ("%s: %s\n", filename, error->message);
+          exit (1);
+
+        case G_IO_STATUS_EOF:
+          goto done;
+
+        case G_IO_STATUS_AGAIN:
+          continue;
+
+        case G_IO_STATUS_NORMAL:
+          /* chop off the line terminator */
+          line[terminator_pos] = '\0';
+          break;
+        }
+
+      if (! parse_line (line, bits, &string, &expected_attrs, &num_attrs))
+        {
+          g_printerr ("%s: error parsing line %d: %s\n", filename, i, line);
+          exit (1);
+        }
+
+      if (num_attrs > 0)
+        {
+          PangoLogAttr *attrs = g_new (PangoLogAttr, num_attrs);
+          pango_get_log_attrs (string, -1, 0, pango_language_from_string ("C"), attrs, num_attrs);
+
+          /* LineBreakTest.txt from Unicode 5.1.0 has this bug that it says
+           * breaking is allowed at the beginning of the strings, while the
+           * algorithm says it's not.  Fix that up. */
+          if (fixup_broken_linebreaktest)
+            memset (expected_attrs, 0, sizeof (expected_attrs[0]));
+
+          if (! attrs_equal (attrs, expected_attrs, num_attrs, bits))
+            {
+              gchar *str = make_test_string (string, attrs, bits);
+              gchar *comments = strchr (line, '#');
+              if (comments) /* don't print the # comment in the error message.  print it separately */
+                {
+                  *comments = '\0';
+                  comments++;
+                }
+              else
+                {
+                  comments = "";
+                }
+
+              g_printerr ("%s: line %d failed\n"
+                          " expected: %s\n"
+                          " returned: %s\n"
+                          " comments: %s\n\n",
+                          filename, i, line, str, comments);
+
+              g_free (str);
+              failed = TRUE;
+            }
+          g_free (attrs);
+        }
+      g_free (string);
+      g_free (expected_attrs);
+
+      /* the line buffer is allocated by g_io_channel_read_line() and owned
+       * by us; release it only now because "comments" points into it */
+      g_free (line);
+      line = NULL;
+
+      i++;
+    }
+
+done:
+  if (channel)
+    g_io_channel_unref (channel);
+  if (error)
+    g_error_free (error);
+  g_free (filename);
+}
+
+/* Test driver: runs do_test() on the four Unicode break-test data files,
+ * looked up in the directory named by the "srcdir" environment variable
+ * (or "." if it is unset).  The exit status is the "failed" flag. */
+gint
+main (gint argc,
+      gchar **argv)
+{
+  const gchar *dir;
+  AttrBits mask;
+
+  setlocale (LC_ALL, "");
+
+  dir = getenv ("srcdir");
+  if (dir == NULL)
+    dir = ".";
+
+  /* each do_test() call takes ownership of the file name it is given */
+
+  mask.bits = 0;
+  mask.attr.is_cursor_position = 1;
+  do_test (g_strdup_printf ("%s/GraphemeBreakTest.txt", dir), mask, FALSE);
+
+  mask.bits = 0;
+  mask.attr.is_word_start = 1;  /* either word start or end */
+  mask.attr.is_word_end = 1;    /* (is this right?) */
+  do_test (g_strdup_printf ("%s/WordBreakTest.txt", dir), mask, FALSE);
+
+  mask.bits = 0;
+  mask.attr.is_sentence_boundary = 1;
+  do_test (g_strdup_printf ("%s/SentenceBreakTest.txt", dir), mask, FALSE);
+
+  mask.bits = 0;
+  mask.attr.is_line_break = 1;
+  mask.attr.is_mandatory_break = 1;
+  do_test (g_strdup_printf ("%s/LineBreakTest.txt", dir), mask, TRUE);
+
+  return failed;
+}