summaryrefslogtreecommitdiff
path: root/libgo/go/exp/locale/collate/collate.go
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/exp/locale/collate/collate.go')
-rw-r--r--libgo/go/exp/locale/collate/collate.go586
1 files changed, 0 insertions, 586 deletions
diff --git a/libgo/go/exp/locale/collate/collate.go b/libgo/go/exp/locale/collate/collate.go
deleted file mode 100644
index 2cb29f24b74..00000000000
--- a/libgo/go/exp/locale/collate/collate.go
+++ /dev/null
@@ -1,586 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package collate contains types for comparing and sorting Unicode strings
-// according to a given collation order. Package locale provides a high-level
-// interface to collation. Users should typically use that package instead.
-package collate
-
-import (
- "bytes"
- "exp/norm"
-)
-
-// AlternateHandling identifies the various ways in which variables are handled.
-// A rune with a primary weight lower than the variable top is considered a
-// variable.
-// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details.
-type AlternateHandling int
-
-const (
- // AltNonIgnorable turns off special handling of variables.
- AltNonIgnorable AlternateHandling = iota
-
- // AltBlanked sets variables and all subsequent primary ignorables to be
- // ignorable at all levels. This is identical to removing all variables
- // and subsequent primary ignorables from the input.
- AltBlanked
-
- // AltShifted sets variables to be ignorable for levels one through three and
- // adds a fourth level based on the values of the ignored levels.
- AltShifted
-
- // AltShiftTrimmed is a slight variant of AltShifted that is used to
- // emulate POSIX.
- AltShiftTrimmed
-)
-
-// Collator provides functionality for comparing strings for a given
-// collation order.
-type Collator struct {
- // TODO: hide most of these options. Low-level options are set through the locale
- // identifier (as defined by LDML) while high-level options are set through SetOptions.
- // Using high-level options allows us to be more flexible (such as not ignoring
- // Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing
- // diacritical marks to be ignored but not case without having to fiddle with levels).
-
- // Strength sets the maximum level to use in comparison.
- Strength Level
-
- // Alternate specifies an alternative handling of variables.
- Alternate AlternateHandling
-
- // Backwards specifies the order of sorting at the secondary level.
- // This option exists predominantly to support reverse sorting of accents in French.
- Backwards bool
-
- // TODO: implement:
- // With HiraganaQuaternary enabled, Hiragana codepoints will get lower values
- // than all the other non-variable code points. Strength must be greater or
- // equal to Quaternary for this to take effect.
- HiraganaQuaternary bool
-
- // If CaseLevel is true, a level consisting only of case characteristics will
- // be inserted in front of the tertiary level. To ignore accents but take
- // cases into account, set Strength to Primary and CaseLevel to true.
- CaseLevel bool
-
- // If Numeric is true, any sequence of decimal digits (category is Nd) is sorted
- // at a primary level with its numeric value. For example, "A-21" < "A-123".
- Numeric bool
-
- // The largest primary value that is considered to be variable.
- variableTop uint32
-
- f norm.Form
-
- t Weigher
-
- sorter sorter
-
- _iter [2]iter
-}
-
-// An Option is used to change the behavior of Collator. They override the
-// settings passed through the locale identifier.
-type Option int
-
-const (
- Numeric Option = 1 << iota // Sort numbers numerically ("2" < "12").
- IgnoreCase // Case-insensitive search.
- IgnoreDiacritics // Ignore diacritical marks. ("o" == "รถ").
- IgnoreWidth // Ignore full versus normal width.
- UpperFirst // Sort upper case before lower case.
- LowerFirst // Sort lower case before upper case.
- Force // Force ordering if strings are equivalent but not equal.
-
- Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase
-)
-
-// SetOptions accepts a Options or-ed together. All previous calls to SetOptions are ignored.
-func (c *Collator) SetOptions(o Option) {
- // TODO: implement
-}
-
-func (c *Collator) iter(i int) *iter {
- // TODO: evaluate performance for making the second iterator optional.
- return &c._iter[i]
-}
-
-// Locales returns the list of locales for which collating differs from its parent locale.
-// The returned value should not be modified.
-func Locales() []string {
- return availableLocales
-}
-
-// New returns a new Collator initialized for the given locale.
-func New(loc string) *Collator {
- // TODO: handle locale selection according to spec.
- var t tableIndex
- if loc != "" {
- if idx, ok := locales[loc]; ok {
- t = idx
- } else {
- t = locales["root"]
- }
- }
- return NewFromTable(Init(t))
-}
-
-func NewFromTable(t Weigher) *Collator {
- c := &Collator{
- Strength: Tertiary,
- f: norm.NFD,
- t: t,
- }
- c._iter[0].init(c)
- c._iter[1].init(c)
- return c
-}
-
-// Buffer holds keys generated by Key and KeyString.
-type Buffer struct {
- buf [4096]byte
- key []byte
-}
-
-func (b *Buffer) init() {
- if b.key == nil {
- b.key = b.buf[:0]
- }
-}
-
-// Reset clears the buffer from previous results generated by Key and KeyString.
-func (b *Buffer) Reset() {
- b.key = b.key[:0]
-}
-
-// Compare returns an integer comparing the two byte slices.
-// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
-func (c *Collator) Compare(a, b []byte) int {
- // TODO: skip identical prefixes once we have a fast way to detect if a rune is
- // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
- c.iter(0).setInput(a)
- c.iter(1).setInput(b)
- if res := c.compare(); res != 0 {
- return res
- }
- if Identity == c.Strength {
- return bytes.Compare(a, b)
- }
- return 0
-}
-
-// CompareString returns an integer comparing the two strings.
-// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
-func (c *Collator) CompareString(a, b string) int {
- // TODO: skip identical prefixes once we have a fast way to detect if a rune is
- // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
- c.iter(0).setInputString(a)
- c.iter(1).setInputString(b)
- if res := c.compare(); res != 0 {
- return res
- }
- if Identity == c.Strength {
- if a < b {
- return -1
- } else if a > b {
- return 1
- }
- }
- return 0
-}
-
-func compareLevel(f func(i *iter) int, a, b *iter) int {
- a.pce = 0
- b.pce = 0
- for {
- va := f(a)
- vb := f(b)
- if va != vb {
- if va < vb {
- return -1
- }
- return 1
- } else if va == 0 {
- break
- }
- }
- return 0
-}
-
-func (c *Collator) compare() int {
- ia, ib := c.iter(0), c.iter(1)
- // Process primary level
- if c.Alternate != AltShifted {
- // TODO: implement script reordering
- // TODO: special hiragana handling
- if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
- return res
- }
- } else {
- // TODO: handle shifted
- }
- if Secondary <= c.Strength {
- f := (*iter).nextSecondary
- if c.Backwards {
- f = (*iter).prevSecondary
- }
- if res := compareLevel(f, ia, ib); res != 0 {
- return res
- }
- }
- // TODO: special case handling (Danish?)
- if Tertiary <= c.Strength || c.CaseLevel {
- if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
- return res
- }
- // TODO: Not needed for the default value of AltNonIgnorable?
- if Quaternary <= c.Strength {
- if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
- return res
- }
- }
- }
- return 0
-}
-
-// Key returns the collation key for str.
-// Passing the buffer buf may avoid memory allocations.
-// The returned slice will point to an allocation in Buffer and will remain
-// valid until the next call to buf.Reset().
-func (c *Collator) Key(buf *Buffer, str []byte) []byte {
- // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
- buf.init()
- return c.key(buf, c.getColElems(str))
-}
-
-// KeyFromString returns the collation key for str.
-// Passing the buffer buf may avoid memory allocations.
-// The returned slice will point to an allocation in Buffer and will retain
-// valid until the next call to buf.ResetKeys().
-func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
- // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
- buf.init()
- return c.key(buf, c.getColElemsString(str))
-}
-
-func (c *Collator) key(buf *Buffer, w []Elem) []byte {
- processWeights(c.Alternate, c.variableTop, w)
- kn := len(buf.key)
- c.keyFromElems(buf, w)
- return buf.key[kn:]
-}
-
-func (c *Collator) getColElems(str []byte) []Elem {
- i := c.iter(0)
- i.setInput(str)
- for i.next() {
- }
- return i.ce
-}
-
-func (c *Collator) getColElemsString(str string) []Elem {
- i := c.iter(0)
- i.setInputString(str)
- for i.next() {
- }
- return i.ce
-}
-
-type iter struct {
- bytes []byte
- str string
-
- wa [512]Elem
- ce []Elem
- pce int
- nce int // nce <= len(nce)
-
- prevCCC uint8
- pStarter int
-
- t Weigher
-}
-
-func (i *iter) init(c *Collator) {
- i.t = c.t
- i.ce = i.wa[:0]
-}
-
-func (i *iter) reset() {
- i.ce = i.ce[:0]
- i.nce = 0
- i.prevCCC = 0
- i.pStarter = 0
-}
-
-func (i *iter) setInput(s []byte) *iter {
- i.bytes = s
- i.str = ""
- i.reset()
- return i
-}
-
-func (i *iter) setInputString(s string) *iter {
- i.str = s
- i.bytes = nil
- i.reset()
- return i
-}
-
-func (i *iter) done() bool {
- return len(i.str) == 0 && len(i.bytes) == 0
-}
-
-func (i *iter) tail(n int) {
- if i.bytes == nil {
- i.str = i.str[n:]
- } else {
- i.bytes = i.bytes[n:]
- }
-}
-
-func (i *iter) appendNext() int {
- var sz int
- if i.bytes == nil {
- i.ce, sz = i.t.AppendNextString(i.ce, i.str)
- } else {
- i.ce, sz = i.t.AppendNext(i.ce, i.bytes)
- }
- return sz
-}
-
-// next appends Elems to the internal array until it adds an element with CCC=0.
-// In the majority of cases, a Elem with a primary value > 0 will have
-// a CCC of 0. The CCC values of colation elements are also used to detect if the
-// input string was not normalized and to adjust the result accordingly.
-func (i *iter) next() bool {
- for !i.done() {
- p0 := len(i.ce)
- sz := i.appendNext()
- i.tail(sz)
- last := len(i.ce) - 1
- if ccc := i.ce[last].CCC(); ccc == 0 {
- i.nce = len(i.ce)
- i.pStarter = last
- i.prevCCC = 0
- return true
- } else if p0 < last && i.ce[p0].CCC() == 0 {
- // set i.nce to only cover part of i.ce for which ccc == 0 and
- // use rest the next call to next.
- for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ {
- }
- i.nce = p0
- i.pStarter = p0 - 1
- i.prevCCC = ccc
- return true
- } else if ccc < i.prevCCC {
- i.doNorm(p0, ccc) // should be rare for most common cases
- } else {
- i.prevCCC = ccc
- }
- }
- if len(i.ce) != i.nce {
- i.nce = len(i.ce)
- return true
- }
- return false
-}
-
-// nextPlain is the same as next, but does not "normalize" the collation
-// elements.
-// TODO: remove this function. Using this instead of next does not seem
-// to improve performance in any significant way. We retain this until
-// later for evaluation purposes.
-func (i *iter) nextPlain() bool {
- if i.done() {
- return false
- }
- sz := i.appendNext()
- i.tail(sz)
- i.nce = len(i.ce)
- return true
-}
-
-const maxCombiningCharacters = 30
-
-// doNorm reorders the collation elements in i.ce.
-// It assumes that blocks of collation elements added with appendNext
-// either start and end with the same CCC or start with CCC == 0.
-// This allows for a single insertion point for the entire block.
-// The correctness of this assumption is verified in builder.go.
-func (i *iter) doNorm(p int, ccc uint8) {
- if p-i.pStarter > maxCombiningCharacters {
- i.prevCCC = i.ce[len(i.ce)-1].CCC()
- i.pStarter = len(i.ce) - 1
- return
- }
- n := len(i.ce)
- k := p
- for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- {
- }
- i.ce = append(i.ce, i.ce[p:k]...)
- copy(i.ce[p:], i.ce[k:])
- i.ce = i.ce[:n]
-}
-
-func (i *iter) nextPrimary() int {
- for {
- for ; i.pce < i.nce; i.pce++ {
- if v := i.ce[i.pce].Primary(); v != 0 {
- i.pce++
- return v
- }
- }
- if !i.next() {
- return 0
- }
- }
- panic("should not reach here")
-}
-
-func (i *iter) nextSecondary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[i.pce].Secondary(); v != 0 {
- i.pce++
- return v
- }
- }
- return 0
-}
-
-func (i *iter) prevSecondary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 {
- i.pce++
- return v
- }
- }
- return 0
-}
-
-func (i *iter) nextTertiary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[i.pce].Tertiary(); v != 0 {
- i.pce++
- return int(v)
- }
- }
- return 0
-}
-
-func (i *iter) nextQuaternary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[i.pce].Quaternary(); v != 0 {
- i.pce++
- return v
- }
- }
- return 0
-}
-
-func appendPrimary(key []byte, p int) []byte {
- // Convert to variable length encoding; supports up to 23 bits.
- if p <= 0x7FFF {
- key = append(key, uint8(p>>8), uint8(p))
- } else {
- key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
- }
- return key
-}
-
-// keyFromElems converts the weights ws to a compact sequence of bytes.
-// The result will be appended to the byte buffer in buf.
-func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) {
- for _, v := range ws {
- if w := v.Primary(); w > 0 {
- buf.key = appendPrimary(buf.key, w)
- }
- }
- if Secondary <= c.Strength {
- buf.key = append(buf.key, 0, 0)
- // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
- if !c.Backwards {
- for _, v := range ws {
- if w := v.Secondary(); w > 0 {
- buf.key = append(buf.key, uint8(w>>8), uint8(w))
- }
- }
- } else {
- for i := len(ws) - 1; i >= 0; i-- {
- if w := ws[i].Secondary(); w > 0 {
- buf.key = append(buf.key, uint8(w>>8), uint8(w))
- }
- }
- }
- } else if c.CaseLevel {
- buf.key = append(buf.key, 0, 0)
- }
- if Tertiary <= c.Strength || c.CaseLevel {
- buf.key = append(buf.key, 0, 0)
- for _, v := range ws {
- if w := v.Tertiary(); w > 0 {
- buf.key = append(buf.key, uint8(w))
- }
- }
- // Derive the quaternary weights from the options and other levels.
- // Note that we represent MaxQuaternary as 0xFF. The first byte of the
- // representation of a primary weight is always smaller than 0xFF,
- // so using this single byte value will compare correctly.
- if Quaternary <= c.Strength && c.Alternate >= AltShifted {
- if c.Alternate == AltShiftTrimmed {
- lastNonFFFF := len(buf.key)
- buf.key = append(buf.key, 0)
- for _, v := range ws {
- if w := v.Quaternary(); w == MaxQuaternary {
- buf.key = append(buf.key, 0xFF)
- } else if w > 0 {
- buf.key = appendPrimary(buf.key, w)
- lastNonFFFF = len(buf.key)
- }
- }
- buf.key = buf.key[:lastNonFFFF]
- } else {
- buf.key = append(buf.key, 0)
- for _, v := range ws {
- if w := v.Quaternary(); w == MaxQuaternary {
- buf.key = append(buf.key, 0xFF)
- } else if w > 0 {
- buf.key = appendPrimary(buf.key, w)
- }
- }
- }
- }
- }
-}
-
-func processWeights(vw AlternateHandling, top uint32, wa []Elem) {
- ignore := false
- vtop := int(top)
- switch vw {
- case AltShifted, AltShiftTrimmed:
- for i := range wa {
- if p := wa[i].Primary(); p <= vtop && p != 0 {
- wa[i] = MakeQuaternary(p)
- ignore = true
- } else if p == 0 {
- if ignore {
- wa[i] = ceIgnore
- }
- } else {
- ignore = false
- }
- }
- case AltBlanked:
- for i := range wa {
- if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
- wa[i] = ceIgnore
- ignore = true
- } else {
- ignore = false
- }
- }
- }
-}