diff options
Diffstat (limited to 'tests/testboundaries.c')
-rw-r--r-- | tests/testboundaries.c | 356 |
1 files changed, 356 insertions, 0 deletions
diff --git a/tests/testboundaries.c b/tests/testboundaries.c new file mode 100644 index 00000000..c05bc837 --- /dev/null +++ b/tests/testboundaries.c @@ -0,0 +1,356 @@ +/* Pango + * testboundaries.c: Test text boundary algorithms + * + * Copyright (C) 1999-2000 Red Hat Software + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> + +#include <pango/pango.h> + +#define CHFORMAT "%0#6x" + +/* FIXME for now this just tests that the breaking of some sample + * text conforms to certain rules and invariants. But eventually + * we should also have test-result pairs, i.e. a string and some + * encoding of the correct way to break the string, to check + * more precisely that things worked + */ + + +/* Keep these in sync with the same macros in break.c */ + +#define LEADING_JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x115F) +#define VOWEL_JAMO(wc) ((wc) >= 0x1160 && (wc) <= 0x11A2) +#define TRAILING_JAMO(wc) ((wc) >= 0x11A8 && (wc) <= 0x11F9) +#define JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x11FF) +/* "virama script" is just an optimization; it includes a bunch of + * scripts without viramas in them + */ +#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF) +#define VIRAMA(wc) ((wc) == 0x094D || \ + (wc) == 0x09CD || \ + (wc) == 0x0A4D || \ + (wc) == 0x0ACD || \ + (wc) == 0x0B4D || \ + (wc) == 0x0BCD || \ + (wc) == 0x0C4D || \ + (wc) == 0x0CCD || \ + (wc) == 0x0D4D || \ + (wc) == 0x0DCA || \ + (wc) == 0x0E3A || \ + (wc) == 0x0F84 || \ + (wc) == 0x1039 || \ + (wc) == 0x17D2) +/* Types of Japanese characters */ +#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF) +#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF) +#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F) +#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF) + +static int offset = 0; +static int line = 0; +static gunichar current_wc = 0; +static const char *line_start = NULL; +static const char *line_end = NULL; + +static void +fail (const char *format, + ...) +{ + char *str; + char *line_text; + + va_list args; + + va_start (args, format); + str = g_strdup_vprintf (format, args); + va_end (args); + + line_text = g_strndup (line_start, line_end - line_start); + + fprintf (stderr, "line %d offset %d char is " CHFORMAT ": %s\n (line is '%s')\n", line, offset, current_wc, str, line_text); + g_free (str); + g_free (line_text); + + exit (1); +} + +typedef void (* CharForeachFunc) (gunichar wc, + gunichar prev_wc, + gunichar next_wc, + GUnicodeType type, + GUnicodeType prev_type, + GUnicodeType next_type, + PangoLogAttr *attr, + PangoLogAttr *prev_attr, + PangoLogAttr *next_attr, + gpointer data); + +static void +log_attr_foreach (const char *text, + PangoLogAttr *attrs, + CharForeachFunc func, + gpointer data) +{ + const gchar *next = text; + gint length = strlen (text); + const gchar *end = text + length; + gint i = 0; + gunichar prev_wc; + gunichar next_wc; + GUnicodeType prev_type; + GUnicodeType next_type; + + if (next == end) + return; + + offset = 0; + line = 0; + + prev_type = (GUnicodeType) -1; + prev_wc = 0; + + next_wc = g_utf8_get_char (next); + next_type = g_unichar_type (next_wc); + + line_start = text; + line_end = text; + + while (next_wc != 0) + { + GUnicodeType type; + gunichar wc; + + wc = next_wc; + type = next_type; + + current_wc = wc; + + next = g_utf8_next_char (next); + line_end = next; + + if (next >= end) + next_wc = 0; + else + next_wc = g_utf8_get_char (next); + + if (next_wc) + next_type = g_unichar_type (next_wc); + + (* func) (wc, prev_wc, next_wc, + type, prev_type, next_type, + &attrs[i], + i != 0 ? &attrs[i-1] : NULL, + next_wc != 0 ? &attrs[i+1] : NULL, + data); + + prev_type = type; + prev_wc = wc; + ++i; + ++offset; + if (wc == '\n') + { + ++line; + offset = 0; + line_start = next; + line_end = next; + } + } +} + +static void +check_line_char (gunichar wc, + gunichar prev_wc, + gunichar next_wc, + GUnicodeType type, + GUnicodeType prev_type, + GUnicodeType next_type, + PangoLogAttr *attr, + PangoLogAttr *prev_attr, + PangoLogAttr *next_attr, + gpointer data) +{ + GUnicodeBreakType break_type; + GUnicodeBreakType prev_break_type; + + break_type = g_unichar_break_type (wc); + if (prev_wc) + prev_break_type = g_unichar_break_type (prev_wc); + else + prev_break_type = G_UNICODE_BREAK_UNKNOWN; + + if (wc == '\n') + { + if (prev_wc == '\r') + { + if (attr->is_break) + fail ("line break between \\r and \\n"); + } + + if (next_attr && !next_attr->is_break) + fail ("no line break after \\n"); + } + + if (attr->is_break && prev_wc == 0) + fail ("first char in string should not be marked as a line break"); + + if (break_type == G_UNICODE_BREAK_SPACE) + { + if (attr->is_break && prev_attr != NULL && + !attr->is_mandatory_break) + fail ("can't break lines before a space unless a mandatory break char precedes it; prev char was " CHFORMAT, prev_wc); + } + + if (attr->is_mandatory_break && !attr->is_break) + fail ("mandatory breaks must also be marked as regular breaks"); + + + + /* FIXME use the break tables from break.c to automatically + * check invariants for each cell in the table. Shouldn't + * be that hard to do. + */ + + if (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION && + prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION && + attr->is_break && + !attr->is_mandatory_break) + fail ("can't break between two open punctuation chars"); + + if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION && + prev_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION && + attr->is_break && + !attr->is_mandatory_break) + fail ("can't break between two close punctuation chars"); + + if (break_type == G_UNICODE_BREAK_QUOTATION && + prev_break_type == G_UNICODE_BREAK_ALPHABETIC && + attr->is_break && + !attr->is_mandatory_break) + fail ("can't break letter-quotemark sequence"); +} + +static void +check_line_invariants (const char *text, + PangoLogAttr *attrs) +{ + log_attr_foreach (text, attrs, check_line_char, NULL); +} + +static void +check_word_invariants (const char *text, + PangoLogAttr *attrs) +{ + + +} + +static void +check_sentence_invariants (const char *text, + PangoLogAttr *attrs) +{ + + +} + +static void +check_grapheme_invariants (const char *text, + PangoLogAttr *attrs) +{ + + +} + +static void +print_sentences (const char *text, + PangoLogAttr *attrs) +{ + const char *p; + const char *last; + int i = 0; + + last = text; + p = text; + + while (*p) + { + if (attrs[i].is_sentence_boundary) + { + char *s = g_strndup (last, p - last); + printf ("%s\n", s); + g_free (s); + last = p; + } + + p = g_utf8_next_char (p); + ++i; + } +} + +static void +check_invariants (const char *text) +{ + int len; + PangoLogAttr *attrs; + + if (!g_utf8_validate (text, -1, NULL)) + fail ("Invalid UTF-8 in test text"); + + len = g_utf8_strlen (text, -1); + attrs = g_new0 (PangoLogAttr, len); + + pango_get_log_attrs (text, + -1, + 0, + "C", + attrs); + + check_line_invariants (text, attrs); + check_sentence_invariants (text, attrs); + check_grapheme_invariants (text, attrs); + check_word_invariants (text, attrs); + +#if 0 + print_sentences (text, attrs); +#endif + + g_free (attrs); +} + +int +main (int argc, + char **argv) +{ + gchar *text; + + if (!g_file_get_contents ("boundaries.utf8", &text, NULL, NULL)) + fail ("Couldn't open sample text file"); + + check_invariants (text); + + g_free (text); + + printf ("testboundaries passed\n"); + + return 0; +} + |