5 files changed, 549 insertions, 0 deletions
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 00000000..e15157ab
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,14 @@
+## Process this file with automake to create Makefile.in.
+
+INCLUDES = -I$(top_srcdir) $(GLIB_CFLAGS)
+
+TESTS=runtests.sh
+
+check_PROGRAMS =	testboundaries
+
+testboundaries_SOURCES = testboundaries.c
+
+## FIXME: linking this test against pangox should not actually be required; this is broken.
+testboundaries_LDADD = ../pango/libpango.la ../pango/libpangox.la $(X_LIBS)
+
+
diff --git a/tests/boundaries.utf8 b/tests/boundaries.utf8
new file mode 100644
index 00000000..f70bd0fc
--- /dev/null
+++ b/tests/boundaries.utf8
@@ -0,0 +1,69 @@
+Testing sentence boundaries - this is a sentence ending in several exclamation points!!!    Several spaces there. Abbreviations such as Mr. or Mrs. should not result in sentence breaks, should they?! (Parentheses should be included in a sentence.) (((Even nested parentheses, with funny punctuation inside!!?!!...))) Anyhow, this should be enough testing.
+
+This text has carriage returns
+all over the 
+ freaking place 
+
+ such as here
+here
+and 
+here
+
+     
+but not at the end of this line.
+
+This is some "quoted" text e.g. "this is some stuff in quotes" and 
+'this is some other stuff in single quotes' and ""this is some stuff with 
+two levels of double quotes"" and so on.
+
+Big string of Arabic:
+وقد بدأ ثلاث من أكثر المؤسسات تقدما في شبكة اكسيون برامجها كمنظمات لا تسعى للربح، ثم تحولت في السنوات الخمس الماضية إلى مؤسسات مالية منظمة، وباتت جزءا من النظام المالي في بلدانها، ولكنها تتخصص في خدمة قطاع المشروعات الصغيرة. وأحد أكثر هذه المؤسسات نجاحا هو »بانكوسول« في بوليفيا.
+
+
+Παν語
+
+This is a list of ways to say hello in various languages. Its purpose is to illustrate a number of scripts.
+
+(Converted into UTF-8)
+
+---------------------------------------------------------
+Arabic	السلام عليكم 
+Bengali (বাঙ্লা)	ষাগতোম
+Burmese (မ္ရန္မာ)
+Cherokee	(ᏣᎳᎩ)	ᎣᏏᏲ
+Czech	(česky)	Dobrý den
+Danish	(Dansk)	Hej, Goddag
+English	Hello
+Esperanto	Saluton
+Estonian	Tere, Tervist
+FORTRAN	PROGRAM
+Finnish	(Suomi)	Hei
+French	(Français)	Bonjour, Salut
+German	(Deutsch Nord)	Guten Tag
+German	(Deutsch Süd)	Grüß Gott
+Georgian	(ქართველი)	გამარჯობა
+Gujarati     (ગુજરાતિ)
+Greek	(Ελληνικά)	Γειά σας
+Hebrew	שלום
+Hindi	नमस्ते, नमस्कार।
+Italiano	Ciao, Buon giorno
+ɪŋglɪʃ       hɛləʊ
+Maltese	Ċaw, Saħħa
+Nederlands, Vlaams	Hallo, Dag
+Norwegian	(Norsk)	Hei, God dag
+Punjabi     (ੁਪੁਂਜਾਬਿ)
+Polish	Dzień dobry, Hej
+Russian	(Русский)	Здравствуйте!
+Slovak	Dobrý deň
+Spanish	(Español)	‎¡Hola!‎
+Swedish	(Svenska)	Hej, Goddag
+Thai	(ภาษาไทย)	สวัสดีครับ, สวัสดีค่ะ
+Turkish	(Türkçe)	Merhaba
+Vietnamese	(Tiếng Việt)	Xin Chào
+Yiddish	(ײַדישע) דאָס הײַזעלע 
+
+Japanese	(日本語)	こんにちは, ｺﾝﾆﾁﾊ
+Chinese	(中文,普通话,汉语)	你好
+Cantonese	(粵語,廣東話)	早晨, 你好
+Korean	(한글)	안녕하세요, 안녕하십니까
+
+Difference among chinese characters in GB, JIS, KSC, BIG5:‎
+ GB	--	元气	开发
+ JIS	--	元気	開発
+ KSC	--	元氣	開發
+ BIG5	--	元氣	開發
+
diff --git a/tests/runtests.sh b/tests/runtests.sh
new file mode 100755
index 00000000..aa67ccde
--- /dev/null
+++ b/tests/runtests.sh
@@ -0,0 +1,55 @@
+#! /bin/sh
+# Run the Pango test programs found in the current directory; log to $LOGFILE.
+LOGFILE=runtests.log
+POTENTIAL_TESTS='testboundaries'
+TESTS=    # start empty so a TESTS value inherited from the environment cannot leak in
+for I in $POTENTIAL_TESTS
+do
+    GOOD=yes
+    test -f $I || {
+        echo "WARNING: test program $I not found, not running"
+        GOOD=no
+    }
+
+    if test x$GOOD = xyes; then
+        test -x $I || {
+            echo "WARNING: test program $I is not executable, not running"
+            GOOD=no
+        }
+    fi
+
+    if test x$GOOD = xyes; then
+        TESTS="$TESTS$I "
+    fi
+done
+
+echo "Logging to $LOGFILE"
+
+echo "Log file for Pango test programs." > $LOGFILE
+echo "" >> $LOGFILE
+echo "Tests are: "$TESTS >> $LOGFILE
+echo "" >> $LOGFILE
+# Run each test in turn; the first failure aborts the run with exit status 1.
+for I in $TESTS
+do
+    printf 'Running test program "%s", please wait:' "$I"   # printf: "echo -n" is not portable
+    echo "" >> $LOGFILE
+    echo "Output of $I:" >> $LOGFILE
+    if ./$I >>$LOGFILE 2>&1; then
+        echo " passed"
+    else
+        echo
+        echo
+        echo '***'
+        echo " Test failed: $I"
+        echo " See $LOGFILE for errors"
+        echo
+        exit 1
+    fi
+done
+
+echo
+echo "All tests passed."
+
+
+
diff --git a/tests/runtests.sh.in b/tests/runtests.sh.in
new file mode 100755
index 00000000..aa67ccde
--- /dev/null
+++ b/tests/runtests.sh.in
@@ -0,0 +1,55 @@
+#! /bin/sh
+# Run the Pango test programs found in the current directory; log to $LOGFILE.
+LOGFILE=runtests.log
+POTENTIAL_TESTS='testboundaries'
+TESTS=    # start empty so a TESTS value inherited from the environment cannot leak in
+for I in $POTENTIAL_TESTS
+do
+    GOOD=yes
+    test -f $I || {
+        echo "WARNING: test program $I not found, not running"
+        GOOD=no
+    }
+
+    if test x$GOOD = xyes; then
+        test -x $I || {
+            echo "WARNING: test program $I is not executable, not running"
+            GOOD=no
+        }
+    fi
+
+    if test x$GOOD = xyes; then
+        TESTS="$TESTS$I "
+    fi
+done
+
+echo "Logging to $LOGFILE"
+
+echo "Log file for Pango test programs." > $LOGFILE
+echo "" >> $LOGFILE
+echo "Tests are: "$TESTS >> $LOGFILE
+echo "" >> $LOGFILE
+# Run each test in turn; the first failure aborts the run with exit status 1.
+for I in $TESTS
+do
+    printf 'Running test program "%s", please wait:' "$I"   # printf: "echo -n" is not portable
+    echo "" >> $LOGFILE
+    echo "Output of $I:" >> $LOGFILE
+    if ./$I >>$LOGFILE 2>&1; then
+        echo " passed"
+    else
+        echo
+        echo
+        echo '***'
+        echo " Test failed: $I"
+        echo " See $LOGFILE for errors"
+        echo
+        exit 1
+    fi
+done
+
+echo
+echo "All tests passed."
+
+
+
diff --git a/tests/testboundaries.c b/tests/testboundaries.c
new file mode 100644
index 00000000..c05bc837
--- /dev/null
+++ b/tests/testboundaries.c
@@ -0,0 +1,356 @@
+/* Pango
+ * testboundaries.c: Test text boundary algorithms
+ *
+ * Copyright (C) 1999-2000 Red Hat Software
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <pango/pango.h>
+
+#define CHFORMAT "%0#6x"
+
+/* FIXME for now this just tests that the breaking of some sample
+ * text conforms to certain rules and invariants. But eventually
+ * we should also have test-result pairs, i.e. a string and some
+ * encoding of the correct way to break the string, to check
+ * more precisely that things worked
+ */
+
+
+/* Code-point range tests. Keep these in sync with the same macros in break.c;
+ * none of them is referenced elsewhere in this file as of this patch. */
+#define LEADING_JAMO(wc)  ((wc) >= 0x1100 && (wc) <= 0x115F)
+#define VOWEL_JAMO(wc)    ((wc) >= 0x1160 && (wc) <= 0x11A2)
+#define TRAILING_JAMO(wc) ((wc) >= 0x11A8 && (wc) <= 0x11F9)
+#define JAMO(wc)          ((wc) >= 0x1100 && (wc) <= 0x11FF)
+/* VIRAMA_SCRIPT is a coarse range kept only as an optimization; it also
+ * includes a bunch of scripts without viramas. VIRAMA tests the exact code points.
+ */
+#define VIRAMA_SCRIPT(wc)        ((wc) >= 0x0901 && (wc) <= 0x17FF)
+#define VIRAMA(wc) ((wc) == 0x094D || \
+                    (wc) == 0x09CD || \
+                    (wc) == 0x0A4D || \
+                    (wc) == 0x0ACD || \
+                    (wc) == 0x0B4D || \
+                    (wc) == 0x0BCD || \
+                    (wc) == 0x0C4D || \
+                    (wc) == 0x0CCD || \
+                    (wc) == 0x0D4D || \
+                    (wc) == 0x0DCA || \
+                    (wc) == 0x0E3A || \
+                    (wc) == 0x0F84 || \
+                    (wc) == 0x1039 || \
+                    (wc) == 0x17D2)
+/* Types of Japanese characters. NOTE(review): 0x2F00-0x2FDF is the Kangxi Radicals block, not the CJK ideographs; confirm KANJI is intended. */
+#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
+#define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
+#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
+#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
+
+static int offset = 0;                 /* characters into the current line; reset to 0 after each '\n' */
+static int line = 0;                   /* number of '\n' characters passed so far in the current scan */
+static gunichar current_wc = 0;        /* character currently being visited */
+static const char *line_start = NULL;  /* start of the current line within the text; NULL until a scan begins */
+static const char *line_end = NULL;    /* just past the current character; fail() prints [line_start, line_end) */
+
+/* Print the printf-style failure message plus the current scan position to
+ * stderr and exit with status 1. Never returns. */
+static void
+fail (const char *format,
+      ...)
+{
+  char *str;
+  char *line_text;
+  va_list args;
+
+  va_start (args, format);
+  str = g_strdup_vprintf (format, args);
+  va_end (args);
+
+  /* May be called before any text is scanned, when line_start is still NULL. */
+  line_text = line_start ? g_strndup (line_start, line_end - line_start) : g_strdup ("");
+  fprintf (stderr, "line %d offset %d char is " CHFORMAT ": %s\n (line is '%s')\n", line, offset, current_wc, str, line_text);
+  g_free (str);
+  g_free (line_text);
+  exit (1);
+}
+
+typedef void (* CharForeachFunc) (gunichar      wc,         /* character being visited */
+                                  gunichar      prev_wc,    /* previous character, 0 for the first one */
+                                  gunichar      next_wc,    /* following character, 0 at the end of the text */
+                                  GUnicodeType  type,       /* g_unichar_type() of wc */
+                                  GUnicodeType  prev_type,  /* type of prev_wc, (GUnicodeType) -1 for the first character */
+                                  GUnicodeType  next_type,  /* type of next_wc; not refreshed when next_wc is 0 */
+                                  PangoLogAttr *attr,       /* attributes of wc */
+                                  PangoLogAttr *prev_attr,  /* attributes of prev_wc, NULL for the first character */
+                                  PangoLogAttr *next_attr,  /* attributes of next_wc, NULL at the end of the text */
+                                  gpointer      data);      /* caller-supplied pointer, passed through unchanged */
+
+static void
+log_attr_foreach (const char     *text,   /* NUL-terminated UTF-8 text to walk */
+                  PangoLogAttr   *attrs,  /* indexed by character position within text */
+                  CharForeachFunc func,   /* called once for every character */
+                  gpointer        data)   /* handed to func unchanged */
+{
+  const gchar *next = text;
+  gint length = strlen (text);
+  const gchar *end = text + length;
+  gint i = 0;
+  gunichar prev_wc;
+  gunichar next_wc;
+  GUnicodeType prev_type;
+  GUnicodeType next_type;
+  /* Walk text one UTF-8 character at a time, calling func on each with its neighbours. */
+  if (next == end)
+    return;  /* empty text: nothing to visit */
+
+  offset = 0;
+  line = 0;
+  /* Sentinels standing for "no previous character" on the first iteration. */
+  prev_type = (GUnicodeType) -1;
+  prev_wc = 0;
+
+  next_wc = g_utf8_get_char (next);
+  next_type = g_unichar_type (next_wc);
+
+  line_start = text;
+  line_end = text;
+  /* next_wc always holds the character about to be visited; 0 ends the walk. */
+  while (next_wc != 0)
+    {
+      GUnicodeType type;
+      gunichar wc;
+
+      wc = next_wc;
+      type = next_type;
+
+      current_wc = wc;
+      /* Advance first so that fail() can print the line up to and including wc. */
+      next = g_utf8_next_char (next);
+      line_end = next;
+
+      if (next >= end)
+        next_wc = 0;
+      else
+        next_wc = g_utf8_get_char (next);
+
+      if (next_wc)
+        next_type = g_unichar_type (next_wc);
+
+      (* func) (wc, prev_wc, next_wc,
+                type, prev_type, next_type,
+                &attrs[i],
+                i != 0 ? &attrs[i-1] : NULL,
+                next_wc != 0 ? &attrs[i+1] : NULL,
+                data);
+      /* Position globals change only after func, so a failure reports wc's own position. */
+      prev_type = type;
+      prev_wc = wc;
+      ++i;
+      ++offset;
+      if (wc == '\n')
+        {
+          ++line;
+          offset = 0;
+          line_start = next;
+          line_end = next;
+        }
+    }
+}
+
+/* CharForeachFunc that checks line-break invariants for one character.
+ * attr->is_break marks a break opportunity before wc. Any violated rule
+ * is reported through fail(), which ends the test run.
+ */
+static void
+check_line_char (gunichar      wc,
+                 gunichar      prev_wc,
+                 gunichar      next_wc,
+                 GUnicodeType  type,
+                 GUnicodeType  prev_type,
+                 GUnicodeType  next_type,
+                 PangoLogAttr *attr,
+                 PangoLogAttr *prev_attr,
+                 PangoLogAttr *next_attr,
+                 gpointer      data)
+{
+  GUnicodeBreakType cur_class = g_unichar_break_type (wc);
+  GUnicodeBreakType prev_class;
+  gboolean optional_break;
+
+  /* The first character has no predecessor to classify. */
+  prev_class = prev_wc ? g_unichar_break_type (prev_wc)
+                       : G_UNICODE_BREAK_UNKNOWN;
+
+  /* A break that is permitted here without being forced. */
+  optional_break = attr->is_break && !attr->is_mandatory_break;
+
+  if (wc == '\n')
+    {
+      /* \r\n must not be split ... */
+      if (prev_wc == '\r' && attr->is_break)
+        fail ("line break between \\r and \\n");
+
+      /* ... and whatever follows the newline must start a new line. */
+      if (next_attr != NULL && !next_attr->is_break)
+        fail ("no line break after \\n");
+    }
+
+  /* Nothing precedes the first character, so no break can sit before it. */
+  if (prev_wc == 0 && attr->is_break)
+    fail ("first char in string should not be marked as a line break");
+
+  /* prev_attr is NULL only for the first character, handled just above. */
+  if (cur_class == G_UNICODE_BREAK_SPACE &&
+      prev_attr != NULL && optional_break)
+    fail ("can't break lines before a space unless a mandatory break char precedes it; prev char was " CHFORMAT, prev_wc);
+
+  if (attr->is_mandatory_break && !attr->is_break)
+    fail ("mandatory breaks must also be marked as regular breaks");
+
+  /* FIXME use the break tables from break.c to automatically
+   * check invariants for each cell in the table. Shouldn't
+   * be that hard to do.
+   */
+
+  /* The pair rules below only restrict optional breaks. */
+  if (!optional_break)
+    return;
+
+  if (cur_class == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
+      prev_class == G_UNICODE_BREAK_OPEN_PUNCTUATION)
+    fail ("can't break between two open punctuation chars");
+
+  if (cur_class == G_UNICODE_BREAK_CLOSE_PUNCTUATION &&
+      prev_class == G_UNICODE_BREAK_CLOSE_PUNCTUATION)
+    fail ("can't break between two close punctuation chars");
+
+  if (cur_class == G_UNICODE_BREAK_QUOTATION &&
+      prev_class == G_UNICODE_BREAK_ALPHABETIC)
+    fail ("can't break letter-quotemark sequence");
+}
+
+static void
+check_line_invariants (const char   *text,
+                       PangoLogAttr *attrs)
+{
+  log_attr_foreach (text, attrs, check_line_char, NULL);  /* run check_line_char on every character; no user data is passed */
+}
+
+static void
+check_word_invariants (const char   *text,
+                       PangoLogAttr *attrs)
+{
+  /* Placeholder: no word-boundary invariants are checked yet. */
+
+}
+
+static void
+check_sentence_invariants (const char   *text,
+                           PangoLogAttr *attrs)
+{
+  /* Placeholder: no sentence-boundary invariants are checked yet. */
+
+}
+
+static void
+check_grapheme_invariants (const char   *text,
+                           PangoLogAttr *attrs)
+{
+  /* Placeholder: no grapheme-boundary invariants are checked yet. */
+
+}
+
+/* Debugging aid: print text to stdout one sentence per line, ending the
+ * current line before every character whose attr has is_sentence_boundary
+ * set. attrs must hold one entry per character of text. */
+static void
+print_sentences (const char   *text,
+                 PangoLogAttr *attrs)
+{
+  const char *last = text;
+  const char *p;
+  int i = 0;
+
+  for (p = text; *p; p = g_utf8_next_char (p), ++i)
+    {
+      if (attrs[i].is_sentence_boundary)
+        {
+          printf ("%.*s\n", (int) (p - last), last);
+          last = p;
+        }
+    }
+
+  /* No boundary is ever seen after the final sentence, so flush
+   * whatever is left once the loop reaches the terminating NUL. */
+  if (*last)
+    printf ("%s\n", last);
+}
+
+static void
+check_invariants (const char *text)
+{
+  int len;
+  PangoLogAttr *attrs;
+  /* Compute the log attrs for text and run every family of invariant checks on them. */
+  if (!g_utf8_validate (text, -1, NULL))
+    fail ("Invalid UTF-8 in test text");
+  /* One zero-initialised PangoLogAttr per character (not per byte) of the text. */
+  len = g_utf8_strlen (text, -1);
+  attrs = g_new0 (PangoLogAttr, len);
+
+  pango_get_log_attrs (text,
+                       -1,   /* presumably "text is NUL-terminated"; confirm against the prototype */
+                       0,    /* NOTE(review): looks like the embedding level; confirm */
+                       "C",  /* presumably the language tag; confirm */
+                       attrs);
+  /* Only the line check does real work so far; the other three are empty stubs. */
+  check_line_invariants (text, attrs);
+  check_sentence_invariants (text, attrs);
+  check_grapheme_invariants (text, attrs);
+  check_word_invariants (text, attrs);
+
+#if 0  /* debugging aid, disabled */
+  print_sentences (text, attrs);
+#endif
+
+  g_free (attrs);
+}
+
+/* Entry point: check the break invariants on boundaries.utf8, read from the current directory. */
+int
+main (int    argc,
+      char **argv)
+{
+  GError *error = NULL;
+  gchar *text;
+
+  /* Pass a GError so the log says why the file could not be read. */
+  if (!g_file_get_contents ("boundaries.utf8", &text, NULL, &error))
+    fail ("Couldn't open sample text file: %s", error->message);
+
+  check_invariants (text);
+  g_free (text);
+  printf ("testboundaries passed\n");
+  return 0;
+}
+