unicode: make the tables smaller.

By splitting the ranges into 16-bit values and 32-bit values, we can reduce about 3000 entries by 48 bits per entry, or about 16KB, at the cost of a little more complexity in the code. R=iant, bradfitz, rsc, r CC=golang-dev http://codereview.appspot.com/4547066
author: Rob Pike <r@golang.org> 2011-05-31 09:58:07 +1000
committer: Rob Pike <r@golang.org> 2011-05-31 09:58:07 +1000
commit: 6c221227cb3f5bdcbc90eeaf230566a50341e40f (patch)
tree: c3f9f8b2209cb71616dc285d638e9279385fd7c2 /src/pkg/unicode/letter.go
parent: d08d87e4c5167dc5f0a3a2adf0eb28e73dcc20e2 (diff)
download: go-6c221227cb3f5bdcbc90eeaf230566a50341e40f.tar.gz
1 files changed, 77 insertions, 17 deletions
diff --git a/src/pkg/unicode/letter.go b/src/pkg/unicode/letter.go
index 382c6eb3f..06cb67e51 100644
--- a/src/pkg/unicode/letter.go
+++ b/src/pkg/unicode/letter.go
@@ -11,13 +11,30 @@ const (
 	ReplacementChar = 0xFFFD   // Represents invalid code points.
 )
 
+// RangeTable defines a set of Unicode code points by listing the ranges of
+// code points within the set. The ranges are listed in two slices
+// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
+// The two slices must be in sorted order and non-overlapping.
+type RangeTable struct {
+	R16 []Range16
+	R32 []Range32
+}
+
+// Range16 represents of a range of 16-bit Unicode code points.  The range runs from Lo to Hi
+// inclusive and has the specified stride.
+type Range16 struct {
+	Lo     uint16
+	Hi     uint16
+	Stride uint16
+}
 
-// The representation of a range of Unicode code points.  The range runs from Lo to Hi
+// Range32 represents of a range of Unicode code points and is used when one or
+//  more of the values will not fit in 16 bits.  The range runs from Lo to Hi
 // inclusive and has the specified stride.
-type Range struct {
-	Lo     int
-	Hi     int
-	Stride int
+type Range32 struct {
+	Lo     uint32
+	Hi     uint32
+	Stride uint32
 }
 
 // CaseRange represents a range of Unicode code points for simple (one
@@ -60,22 +77,28 @@ const (
 	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
 )
 
-// Is tests whether rune is in the specified table of ranges.
-func Is(ranges []Range, rune int) bool {
-	// common case: rune is ASCII or Latin-1
-	if rune < 0x100 {
-		for _, r := range ranges {
-			if rune > r.Hi {
-				continue
-			}
-			if rune < r.Lo {
-				return false
-			}
+// is16 uses binary search to test whether rune is in the specified slice of 16-bit ranges.
+func is16(ranges []Range16, rune uint16) bool {
+	// binary search over ranges
+	lo := 0
+	hi := len(ranges)
+	for lo < hi {
+		m := lo + (hi-lo)/2
+		r := ranges[m]
+		if r.Lo <= rune && rune <= r.Hi {
 			return (rune-r.Lo)%r.Stride == 0
 		}
-		return false
+		if rune < r.Lo {
+			hi = m
+		} else {
+			lo = m + 1
+		}
 	}
+	return false
+}
 
+// is32 uses binary search to test whether rune is in the specified slice of 32-bit ranges.
+func is32(ranges []Range32, rune uint32) bool {
 	// binary search over ranges
 	lo := 0
 	hi := len(ranges)
@@ -94,6 +117,43 @@ func Is(ranges []Range, rune int) bool {
 	return false
 }
 
+// Is tests whether rune is in the specified table of ranges.
+func Is(rangeTab *RangeTable, rune int) bool {
+	// common case: rune is ASCII or Latin-1.
+	if rune < 0x100 {
+		r16 := uint16(rune)
+		for _, r := range rangeTab.R16 {
+			if r16 > r.Hi {
+				continue
+			}
+			if r16 < r.Lo {
+				return false
+			}
+			return (r16-r.Lo)%r.Stride == 0
+		}
+		r32 := uint32(rune)
+		for _, r := range rangeTab.R32 {
+			if r32 > r.Hi {
+				continue
+			}
+			if r32 < r.Lo {
+				return false
+			}
+			return (r32-r.Lo)%r.Stride == 0
+		}
+		return false
+	}
+	r16 := rangeTab.R16
+	if len(r16) > 0 && rune <= int(r16[len(r16)-1].Hi) {
+		return is16(r16, uint16(rune))
+	}
+	r32 := rangeTab.R32
+	if len(r32) > 0 && rune >= int(r32[0].Lo) {
+		return is32(r32, uint32(rune))
+	}
+	return false
+}
+
 // IsUpper reports whether the rune is an upper case letter.
 func IsUpper(rune int) bool {
 	if rune < 0x80 { // quick ASCII check
author	Rob Pike <r@golang.org>	2011-05-31 09:58:07 +1000
committer	Rob Pike <r@golang.org>	2011-05-31 09:58:07 +1000
commit	6c221227cb3f5bdcbc90eeaf230566a50341e40f (patch)
tree	c3f9f8b2209cb71616dc285d638e9279385fd7c2 /src/pkg/unicode/letter.go
parent	d08d87e4c5167dc5f0a3a2adf0eb28e73dcc20e2 (diff)
download	go-6c221227cb3f5bdcbc90eeaf230566a50341e40f.tar.gz