Part of Bug 97545 – Make pango_default_break follow Unicode TR #29

2008-04-24 Behdad Esfahbod <behdad@gnome.org> Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 * docs/tmpl/main.sgml: * pango/break.c (pango_default_break): * pango/pango-break.h: * tests/testboundaries_ucd.c (main): Add new PangoLogAttr member is_word_boundary, that implements UAX#29's Word Boundaries semantics. Test fully passes for it. svn path=/trunk/; revision=2618
author: Behdad Esfahbod <behdad@gnome.org> 2008-04-25 00:33:29 +0000
committer: Behdad Esfahbod <behdad@src.gnome.org> 2008-04-25 00:33:29 +0000
commit: 8cb6363b7e598b0e4d4a4ef4229bed3d7a5d46c8 (patch)
tree: 53d111625613ddf6b7f50cc0bbde17e630c236a7
parent: f6b1fef713a035e4abcbe0d0fda54721a3560d5e (diff)
download: pango-8cb6363b7e598b0e4d4a4ef4229bed3d7a5d46c8.tar.gz
5 files changed, 247 insertions, 25 deletions
diff --git a/ChangeLog b/ChangeLog
index 6c912546..c33813dc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,17 @@
 2008-04-24  Behdad Esfahbod  <behdad@gnome.org>
 
 	Part of Bug 97545 – Make pango_default_break follow Unicode TR #29
+
+	* docs/tmpl/main.sgml:
+	* pango/break.c (pango_default_break):
+	* pango/pango-break.h:
+	* tests/testboundaries_ucd.c (main):
+	Add new PangoLogAttr member is_word_boundary, that implements UAX#29's
+	Word Boundaries semantics.  Test fully passes for it.
+
+2008-04-24  Behdad Esfahbod  <behdad@gnome.org>
+
+	Part of Bug 97545 – Make pango_default_break follow Unicode TR #29
 	Patch from Noah Levitt
 
 	* tests/Makefile.am:
diff --git a/docs/tmpl/main.sgml b/docs/tmpl/main.sgml
index fe37ff91..f5cff8e0 100644
--- a/docs/tmpl/main.sgml
+++ b/docs/tmpl/main.sgml
@@ -393,6 +393,10 @@ about the attributes of a single character.
 @is_cursor_position: if set, cursor can appear in front of character.
 	i.e. this is a grapheme boundary, or the first character
         in the text.
+	This flag implements Unicode's
+	<ulink
+	url="http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries">Grapheme
+	Cluster Boundaries</ulink> semantics.
 @is_word_start: is first character in a word
 @is_word_end: is first non-word char after a word
 	Note that in degenerate cases, you could have both @is_word_start
@@ -424,6 +428,15 @@ about the attributes of a single character.
                               characters.
 @is_expandable_space: is a whitespace character that can possibly be
                       expanded for justification purposes. (Since: 1.18)
+@is_word_boundary: is a word boundary.
+	More specifically, means that this is not a position in the middle
+	of a word.  For example, both sides of a punctuation mark are
+	considered word boundaries.  This flag is particularly useful when
+	selecting text word-by-word.
+	This flag implements Unicode's
+	<ulink url="http://www.unicode.org/reports/tr29/#Word_Boundaries">Word
+	Boundaries</ulink> semantics.
+	(Since: 1.22)
 
 <!-- ##### FUNCTION pango_shape ##### -->
 <para>
diff --git a/pango/break.c b/pango/break.c
index 9e63f063..72d5d541 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -522,6 +522,23 @@ pango_default_break (const gchar   *text,
   } GraphemeBreakType;
   GraphemeBreakType prev_GB_type = GB_Other;
 
+  /* See Word_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    WB_Other,
+    WB_NewlineCRLF,
+    WB_ExtendFormat,
+    WB_Katakana,
+    WB_ALetter,
+    WB_MidNumLet,
+    WB_MidLetter,
+    WB_MidNum,
+    WB_Numeric,
+    WB_ExtendNumLet,
+  } WordBreakType;
+  WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
+  gint prev_WB_i = -1;
+
   WordType current_word_type = WordNone;
   gunichar last_word_letter = 0;
   gunichar base_character = 0;
@@ -567,6 +584,11 @@ pango_default_break (const gchar   *text,
       JamoType jamo;
       gboolean makes_hangul_syllable;
 
+      /* UAX#29 boundaries */
+      gboolean is_grapheme_boundary;
+      gboolean is_word_boundary;
+
+
       wc = next_wc;
       break_type = next_break_type;
 
@@ -624,23 +646,24 @@ pango_default_break (const gchar   *text,
        */
       attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
 
-      /* ---- Cursor position breaks (Grapheme breaks) ---- */
-
+      /* ---- UAX#29 Grapheme Boundaries ---- */
       {
 	GraphemeBreakType GB_type;
-	gboolean is_grapheme_boundary = FALSE;
         /* Find the GraphemeBreakType of wc */
 	GB_type = GB_Other;
 	switch (type)
 	  {
-	  case G_UNICODE_CONTROL:
 	  case G_UNICODE_FORMAT:
+	    if (wc == 0x200C || wc == 0x200D) /* ZWNJ or ZWJ: either one, never both */
+	      {
+		GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
+		break;
+	      }
+	    /* fall through */
+	  case G_UNICODE_CONTROL:
 	  case G_UNICODE_LINE_SEPARATOR:
 	  case G_UNICODE_PARAGRAPH_SEPARATOR:
-	    if (wc != 0x200C && wc != 0x200D)
-	      GB_type = GB_ControlCRLF;
-	    else
-	      GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
+	    GB_type = GB_ControlCRLF;
 	    break;
 
 	  case G_UNICODE_OTHER_LETTER:
@@ -679,16 +702,13 @@ pango_default_break (const gchar   *text,
 	  case G_UNICODE_NON_SPACING_MARK:
 	    GB_type = GB_Extend; /* Grapheme_Extend */
 	    break;
-
-	  default:
-	    break;
 	  }
 
 	/* Grapheme Cluster Boundary Rules */
 	/* We apply Rules GB1 and GB2 at the end of the function */
 	if (wc == '\n' && prev_wc == '\r')
 	  is_grapheme_boundary = FALSE; /* Rule GB3 */
-	else if (GB_type == GB_ControlCRLF || prev_GB_type == GB_ControlCRLF)
+	else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
 	  is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
 	else if (GB_type == GB_InHangulSyllable)
 	  is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
@@ -704,14 +724,189 @@ pango_default_break (const gchar   *text,
 	prev_GB_type = GB_type;
 
 	attrs[i].is_cursor_position = is_grapheme_boundary;
+	/* If this is a grapheme boundary, we have to decide if backspace
+	 * deletes a character or the whole grapheme cluster */
+	if (is_grapheme_boundary)
+	  attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
+	else
+	  attrs[i].backspace_deletes_character = FALSE;
+      }
+
+      /* ---- UAX#29 Word Boundaries ---- */
+      {
+	is_word_boundary = FALSE;
+	if (is_grapheme_boundary) /* Rules WB3 and WB4 */
+	  {
+	    PangoScript script;
+	    WordBreakType WB_type;
+
+	    script = pango_script_for_unichar (wc);
+
+	    /* Find the WordBreakType of wc */
+	    WB_type = WB_Other;
+
+	    if (script == PANGO_SCRIPT_KATAKANA)
+	      WB_type = WB_Katakana;
+
+	    if (WB_type == WB_Other)
+	      switch (wc >> 8)
+	        {
+		case 0x30:
+		  if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
+		      wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
+		    WB_type = WB_Katakana; /* Katakana exceptions */
+		  break;
+		case 0xFF:
+		  if (wc == 0xFF70)
+		    WB_type = WB_Katakana; /* Katakana exceptions */
+		  else if (wc >= 0xFF9E && wc <= 0xFF9F) /* range test: both bounds must hold */
+		    WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend: U+FF9E..U+FF9F */
+		  break;
+		case 0x05:
+		  if (wc == 0x05F3)
+		    WB_type = WB_ALetter; /* ALetter exceptions */
+		  break;
+		}
+
+	    if (WB_type == WB_Other)
+	      switch (break_type)
+	        {
+		case G_UNICODE_BREAK_NUMERIC:
+		  if (wc != 0x066C)
+		    WB_type = WB_Numeric; /* Numeric */
+		  break;
+		case G_UNICODE_BREAK_INFIX_SEPARATOR:
+		  if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
+		    WB_type = WB_MidNum; /* MidNum */
+		  break;
+		}
+
+	    if (WB_type == WB_Other)
+	      switch (type)
+		{
+		case G_UNICODE_CONTROL:
+		  if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
+		    break;
+		  /* fall through */
+		case G_UNICODE_LINE_SEPARATOR:
+		case G_UNICODE_PARAGRAPH_SEPARATOR:
+		  WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
+		  break;
+
+		case G_UNICODE_FORMAT:
+		case G_UNICODE_COMBINING_MARK:
+		case G_UNICODE_ENCLOSING_MARK:
+		case G_UNICODE_NON_SPACING_MARK:
+		  WB_type = WB_ExtendFormat; /* Extend, Format */
+		  break;
+
+		case G_UNICODE_CONNECT_PUNCTUATION:
+		  WB_type = WB_ExtendNumLet; /* ExtendNumLet */
+		  break;
+
+		case G_UNICODE_INITIAL_PUNCTUATION:
+		case G_UNICODE_FINAL_PUNCTUATION:
+		  if (wc == 0x2018 || wc == 0x2019)
+		    WB_type = WB_MidNumLet; /* MidNumLet */
+		  break;
+		case G_UNICODE_OTHER_PUNCTUATION:
+		  if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
+		      wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
+		    WB_type = WB_MidNumLet; /* MidNumLet */
+		  else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 ||
+			   wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
+		    WB_type = WB_MidLetter; /* MidLetter */
+		  else if (wc == 0x066c ||
+			   wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
+		    WB_type = WB_MidNum; /* MidNum */
+		  break;
+
+		case G_UNICODE_OTHER_SYMBOL:
+		  if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
+		    goto Alphabetic;
+		  break;
+
+		case G_UNICODE_OTHER_LETTER:
+		case G_UNICODE_LETTER_NUMBER:
+		  if (wc == 0x3006 || wc == 0x3007 ||
+		      (wc >= 0x3021 && wc <= 0x3029) ||
+		      (wc >= 0x3038 && wc <= 0x303A) ||
+		      (wc >= 0x3400 && wc <= 0x4DB5) ||
+		      (wc >= 0x4E00 && wc <= 0x9FC3) ||
+		      (wc >= 0xF900 && wc <= 0xFA2D) ||
+		      (wc >= 0xFA30 && wc <= 0xFA6A) ||
+		      (wc >= 0xFA70 && wc <= 0xFAD9) ||
+		      (wc >= 0x20000 && wc <= 0x2A6D6) ||
+		      (wc >= 0x2F800 && wc <= 0x2FA1D))
+		    break; /* ALetter exceptions: Ideographic */
+		  goto Alphabetic;
+
+		case G_UNICODE_LOWERCASE_LETTER:
+		case G_UNICODE_MODIFIER_LETTER:
+		case G_UNICODE_TITLECASE_LETTER:
+		case G_UNICODE_UPPERCASE_LETTER:
+		Alphabetic:
+		  if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
+		    WB_type = WB_ALetter; /* ALetter */
+		  break;
+		}
+
+	    /* Word Boundary Rules */
+
+	    /* We apply Rules WB1 and WB2 at the end of the function */
+
+	    if (prev_wc == 0x3031 && wc == 0x41)
+	      g_message ("Y %d %d", prev_WB_type, WB_type);
+	    if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
+	      {
+	        /* The extra check for prev_WB_i is to correctly handle sequences like
+		 * Newline ÷ Extend × Extend
+		 * since we have not skipped ExtendFormat yet.
+		 */
+	        is_word_boundary = TRUE; /* Rule WB3a */
+	      }
+	    else if (WB_type == WB_NewlineCRLF)
+	      is_word_boundary = TRUE; /* Rule WB3b */
+	    else if (WB_type == WB_ExtendFormat)
+	      is_word_boundary = FALSE; /* Rule WB4: no break before Extend/Format */
+	    else if ((prev_WB_type == WB_ALetter  ||
+		      prev_WB_type == WB_Numeric  ||
+		      prev_WB_type == WB_ExtendNumLet) &&
+		     (     WB_type == WB_ALetter  ||
+		           WB_type == WB_Numeric  ||
+		           WB_type == WB_ExtendNumLet))
+	      is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10, WB13a, WB13b */
+	    else if ((prev_WB_type == WB_Katakana ||
+		      prev_WB_type == WB_ExtendNumLet) &&
+		     (     WB_type == WB_Katakana ||
+		           WB_type == WB_ExtendNumLet))
+	      is_word_boundary = FALSE; /* Rules WB13, WB13a, WB13b */
+	    else if ((prev_prev_WB_type == WB_ALetter && WB_type == WB_ALetter) &&
+		     (prev_WB_type == WB_MidLetter || prev_WB_type == WB_MidNumLet))
+	      {
+		attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
+		is_word_boundary = FALSE; /* Rule WB7 */
+	      }
+	    else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
+		     (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet))
+	      {
+		is_word_boundary = FALSE; /* Rule WB11 */
+		attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
+	      }
+	    else
+	      is_word_boundary = TRUE; /* Rule WB14 */
+
+	    if (WB_type != WB_ExtendFormat)
+	      {
+		prev_prev_WB_type = prev_WB_type;
+		prev_WB_type = WB_type;
+		prev_WB_i = i;
+	      }
+	  }
+
+	attrs[i].is_word_boundary = is_word_boundary;
       }
 
-      /* If this is a grapheme boundary, we have to decide if backspace
-       * deletes a character or the whole grapheme cluster */
-      if (attrs[i].is_cursor_position)
-	attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
-      else
-	attrs[i].backspace_deletes_character = FALSE;
 
       /* ---- Line breaking ---- */
 
@@ -1446,14 +1641,15 @@ pango_default_break (const gchar   *text,
     }
   i--;
 
+  attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
+  attrs[0].is_cursor_position = TRUE;  /* Rule GB1 */
+
+  attrs[i].is_word_boundary = TRUE;  /* Rule WB2 */
+  attrs[0].is_word_boundary = TRUE;  /* Rule WB1 */
+
   attrs[i].is_line_break = TRUE;  /* Rule LB3 */
   attrs[0].is_line_break = FALSE; /* Rule LB2 */
 
-  attrs[i].is_word_end   = TRUE;  /* Rule WB2 */
-  attrs[0].is_word_start = TRUE;  /* Rule WB1 */
-
-  attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
-  attrs[0].is_cursor_position = TRUE;  /* Rule GB1 */
 }
 
 static gboolean
diff --git a/pango/pango-break.h b/pango/pango-break.h
index fe22acec..5c326dd4 100644
--- a/pango/pango-break.h
+++ b/pango/pango-break.h
@@ -74,6 +74,9 @@ struct _PangoLogAttr
    * width during justification.
    */
   guint is_expandable_space : 1;
+
+  /* Word boundary as defined by UAX#29 */
+  guint is_word_boundary : 1;	/* is NOT in the middle of a word */
 };
 
 /* Determine information about cluster/word/line breaks in a string
diff --git a/tests/testboundaries_ucd.c b/tests/testboundaries_ucd.c
index 7e266f88..1e6d7d01 100644
--- a/tests/testboundaries_ucd.c
+++ b/tests/testboundaries_ucd.c
@@ -345,8 +345,7 @@ main (gint argc,
 
   filename = g_strdup_printf ("%s/WordBreakTest.txt", srcdir);
   bits.bits = 0;
-  bits.attr.is_word_start = 1; /* either word start or end */
-  bits.attr.is_word_end = 1;   /* (is this right?) */
+  bits.attr.is_word_boundary = 1;
   do_test (filename, bits, FALSE);
 
   filename = g_strdup_printf ("%s/SentenceBreakTest.txt", srcdir);
author	Behdad Esfahbod <behdad@gnome.org>	2008-04-25 00:33:29 +0000
committer	Behdad Esfahbod <behdad@src.gnome.org>	2008-04-25 00:33:29 +0000
commit	8cb6363b7e598b0e4d4a4ef4229bed3d7a5d46c8 (patch)
tree	53d111625613ddf6b7f50cc0bbde17e630c236a7
parent	f6b1fef713a035e4abcbe0d0fda54721a3560d5e (diff)
download	pango-8cb6363b7e598b0e4d4a4ef4229bed3d7a5d46c8.tar.gz