diff options
Diffstat (limited to 'libgo/go/exp/locale/collate/collate.go')
-rw-r--r-- | libgo/go/exp/locale/collate/collate.go | 586 |
1 files changed, 0 insertions, 586 deletions
diff --git a/libgo/go/exp/locale/collate/collate.go b/libgo/go/exp/locale/collate/collate.go deleted file mode 100644 index 2cb29f24b74..00000000000 --- a/libgo/go/exp/locale/collate/collate.go +++ /dev/null @@ -1,586 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package collate contains types for comparing and sorting Unicode strings -// according to a given collation order. Package locale provides a high-level -// interface to collation. Users should typically use that package instead. -package collate - -import ( - "bytes" - "exp/norm" -) - -// AlternateHandling identifies the various ways in which variables are handled. -// A rune with a primary weight lower than the variable top is considered a -// variable. -// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details. -type AlternateHandling int - -const ( - // AltNonIgnorable turns off special handling of variables. - AltNonIgnorable AlternateHandling = iota - - // AltBlanked sets variables and all subsequent primary ignorables to be - // ignorable at all levels. This is identical to removing all variables - // and subsequent primary ignorables from the input. - AltBlanked - - // AltShifted sets variables to be ignorable for levels one through three and - // adds a fourth level based on the values of the ignored levels. - AltShifted - - // AltShiftTrimmed is a slight variant of AltShifted that is used to - // emulate POSIX. - AltShiftTrimmed -) - -// Collator provides functionality for comparing strings for a given -// collation order. -type Collator struct { - // TODO: hide most of these options. Low-level options are set through the locale - // identifier (as defined by LDML) while high-level options are set through SetOptions. - // Using high-level options allows us to be more flexible (such as not ignoring - // Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing - // diacritical marks to be ignored but not case without having to fiddle with levels). - - // Strength sets the maximum level to use in comparison. - Strength Level - - // Alternate specifies an alternative handling of variables. - Alternate AlternateHandling - - // Backwards specifies the order of sorting at the secondary level. - // This option exists predominantly to support reverse sorting of accents in French. - Backwards bool - - // TODO: implement: - // With HiraganaQuaternary enabled, Hiragana codepoints will get lower values - // than all the other non-variable code points. Strength must be greater or - // equal to Quaternary for this to take effect. - HiraganaQuaternary bool - - // If CaseLevel is true, a level consisting only of case characteristics will - // be inserted in front of the tertiary level. To ignore accents but take - // cases into account, set Strength to Primary and CaseLevel to true. - CaseLevel bool - - // If Numeric is true, any sequence of decimal digits (category is Nd) is sorted - // at a primary level with its numeric value. For example, "A-21" < "A-123". - Numeric bool - - // The largest primary value that is considered to be variable. - variableTop uint32 - - f norm.Form - - t Weigher - - sorter sorter - - _iter [2]iter -} - -// An Option is used to change the behavior of Collator. They override the -// settings passed through the locale identifier. -type Option int - -const ( - Numeric Option = 1 << iota // Sort numbers numerically ("2" < "12"). - IgnoreCase // Case-insensitive search. - IgnoreDiacritics // Ignore diacritical marks. ("o" == "รถ"). - IgnoreWidth // Ignore full versus normal width. - UpperFirst // Sort upper case before lower case. - LowerFirst // Sort lower case before upper case. - Force // Force ordering if strings are equivalent but not equal. - - Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase -) - -// SetOptions accepts a Options or-ed together. All previous calls to SetOptions are ignored. -func (c *Collator) SetOptions(o Option) { - // TODO: implement -} - -func (c *Collator) iter(i int) *iter { - // TODO: evaluate performance for making the second iterator optional. - return &c._iter[i] -} - -// Locales returns the list of locales for which collating differs from its parent locale. -// The returned value should not be modified. -func Locales() []string { - return availableLocales -} - -// New returns a new Collator initialized for the given locale. -func New(loc string) *Collator { - // TODO: handle locale selection according to spec. - var t tableIndex - if loc != "" { - if idx, ok := locales[loc]; ok { - t = idx - } else { - t = locales["root"] - } - } - return NewFromTable(Init(t)) -} - -func NewFromTable(t Weigher) *Collator { - c := &Collator{ - Strength: Tertiary, - f: norm.NFD, - t: t, - } - c._iter[0].init(c) - c._iter[1].init(c) - return c -} - -// Buffer holds keys generated by Key and KeyString. -type Buffer struct { - buf [4096]byte - key []byte -} - -func (b *Buffer) init() { - if b.key == nil { - b.key = b.buf[:0] - } -} - -// Reset clears the buffer from previous results generated by Key and KeyString. -func (b *Buffer) Reset() { - b.key = b.key[:0] -} - -// Compare returns an integer comparing the two byte slices. -// The result will be 0 if a==b, -1 if a < b, and +1 if a > b. -func (c *Collator) Compare(a, b []byte) int { - // TODO: skip identical prefixes once we have a fast way to detect if a rune is - // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).setInput(a) - c.iter(1).setInput(b) - if res := c.compare(); res != 0 { - return res - } - if Identity == c.Strength { - return bytes.Compare(a, b) - } - return 0 -} - -// CompareString returns an integer comparing the two strings. -// The result will be 0 if a==b, -1 if a < b, and +1 if a > b. -func (c *Collator) CompareString(a, b string) int { - // TODO: skip identical prefixes once we have a fast way to detect if a rune is - // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).setInputString(a) - c.iter(1).setInputString(b) - if res := c.compare(); res != 0 { - return res - } - if Identity == c.Strength { - if a < b { - return -1 - } else if a > b { - return 1 - } - } - return 0 -} - -func compareLevel(f func(i *iter) int, a, b *iter) int { - a.pce = 0 - b.pce = 0 - for { - va := f(a) - vb := f(b) - if va != vb { - if va < vb { - return -1 - } - return 1 - } else if va == 0 { - break - } - } - return 0 -} - -func (c *Collator) compare() int { - ia, ib := c.iter(0), c.iter(1) - // Process primary level - if c.Alternate != AltShifted { - // TODO: implement script reordering - // TODO: special hiragana handling - if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { - return res - } - } else { - // TODO: handle shifted - } - if Secondary <= c.Strength { - f := (*iter).nextSecondary - if c.Backwards { - f = (*iter).prevSecondary - } - if res := compareLevel(f, ia, ib); res != 0 { - return res - } - } - // TODO: special case handling (Danish?) - if Tertiary <= c.Strength || c.CaseLevel { - if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { - return res - } - // TODO: Not needed for the default value of AltNonIgnorable? - if Quaternary <= c.Strength { - if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { - return res - } - } - } - return 0 -} - -// Key returns the collation key for str. -// Passing the buffer buf may avoid memory allocations. -// The returned slice will point to an allocation in Buffer and will remain -// valid until the next call to buf.Reset(). -func (c *Collator) Key(buf *Buffer, str []byte) []byte { - // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. - buf.init() - return c.key(buf, c.getColElems(str)) -} - -// KeyFromString returns the collation key for str. -// Passing the buffer buf may avoid memory allocations. -// The returned slice will point to an allocation in Buffer and will retain -// valid until the next call to buf.ResetKeys(). -func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { - // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. - buf.init() - return c.key(buf, c.getColElemsString(str)) -} - -func (c *Collator) key(buf *Buffer, w []Elem) []byte { - processWeights(c.Alternate, c.variableTop, w) - kn := len(buf.key) - c.keyFromElems(buf, w) - return buf.key[kn:] -} - -func (c *Collator) getColElems(str []byte) []Elem { - i := c.iter(0) - i.setInput(str) - for i.next() { - } - return i.ce -} - -func (c *Collator) getColElemsString(str string) []Elem { - i := c.iter(0) - i.setInputString(str) - for i.next() { - } - return i.ce -} - -type iter struct { - bytes []byte - str string - - wa [512]Elem - ce []Elem - pce int - nce int // nce <= len(nce) - - prevCCC uint8 - pStarter int - - t Weigher -} - -func (i *iter) init(c *Collator) { - i.t = c.t - i.ce = i.wa[:0] -} - -func (i *iter) reset() { - i.ce = i.ce[:0] - i.nce = 0 - i.prevCCC = 0 - i.pStarter = 0 -} - -func (i *iter) setInput(s []byte) *iter { - i.bytes = s - i.str = "" - i.reset() - return i -} - -func (i *iter) setInputString(s string) *iter { - i.str = s - i.bytes = nil - i.reset() - return i -} - -func (i *iter) done() bool { - return len(i.str) == 0 && len(i.bytes) == 0 -} - -func (i *iter) tail(n int) { - if i.bytes == nil { - i.str = i.str[n:] - } else { - i.bytes = i.bytes[n:] - } -} - -func (i *iter) appendNext() int { - var sz int - if i.bytes == nil { - i.ce, sz = i.t.AppendNextString(i.ce, i.str) - } else { - i.ce, sz = i.t.AppendNext(i.ce, i.bytes) - } - return sz -} - -// next appends Elems to the internal array until it adds an element with CCC=0. -// In the majority of cases, a Elem with a primary value > 0 will have -// a CCC of 0. The CCC values of colation elements are also used to detect if the -// input string was not normalized and to adjust the result accordingly. -func (i *iter) next() bool { - for !i.done() { - p0 := len(i.ce) - sz := i.appendNext() - i.tail(sz) - last := len(i.ce) - 1 - if ccc := i.ce[last].CCC(); ccc == 0 { - i.nce = len(i.ce) - i.pStarter = last - i.prevCCC = 0 - return true - } else if p0 < last && i.ce[p0].CCC() == 0 { - // set i.nce to only cover part of i.ce for which ccc == 0 and - // use rest the next call to next. - for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ { - } - i.nce = p0 - i.pStarter = p0 - 1 - i.prevCCC = ccc - return true - } else if ccc < i.prevCCC { - i.doNorm(p0, ccc) // should be rare for most common cases - } else { - i.prevCCC = ccc - } - } - if len(i.ce) != i.nce { - i.nce = len(i.ce) - return true - } - return false -} - -// nextPlain is the same as next, but does not "normalize" the collation -// elements. -// TODO: remove this function. Using this instead of next does not seem -// to improve performance in any significant way. We retain this until -// later for evaluation purposes. -func (i *iter) nextPlain() bool { - if i.done() { - return false - } - sz := i.appendNext() - i.tail(sz) - i.nce = len(i.ce) - return true -} - -const maxCombiningCharacters = 30 - -// doNorm reorders the collation elements in i.ce. -// It assumes that blocks of collation elements added with appendNext -// either start and end with the same CCC or start with CCC == 0. -// This allows for a single insertion point for the entire block. -// The correctness of this assumption is verified in builder.go. -func (i *iter) doNorm(p int, ccc uint8) { - if p-i.pStarter > maxCombiningCharacters { - i.prevCCC = i.ce[len(i.ce)-1].CCC() - i.pStarter = len(i.ce) - 1 - return - } - n := len(i.ce) - k := p - for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- { - } - i.ce = append(i.ce, i.ce[p:k]...) - copy(i.ce[p:], i.ce[k:]) - i.ce = i.ce[:n] -} - -func (i *iter) nextPrimary() int { - for { - for ; i.pce < i.nce; i.pce++ { - if v := i.ce[i.pce].Primary(); v != 0 { - i.pce++ - return v - } - } - if !i.next() { - return 0 - } - } - panic("should not reach here") -} - -func (i *iter) nextSecondary() int { - for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].Secondary(); v != 0 { - i.pce++ - return v - } - } - return 0 -} - -func (i *iter) prevSecondary() int { - for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 { - i.pce++ - return v - } - } - return 0 -} - -func (i *iter) nextTertiary() int { - for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].Tertiary(); v != 0 { - i.pce++ - return int(v) - } - } - return 0 -} - -func (i *iter) nextQuaternary() int { - for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].Quaternary(); v != 0 { - i.pce++ - return v - } - } - return 0 -} - -func appendPrimary(key []byte, p int) []byte { - // Convert to variable length encoding; supports up to 23 bits. - if p <= 0x7FFF { - key = append(key, uint8(p>>8), uint8(p)) - } else { - key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p)) - } - return key -} - -// keyFromElems converts the weights ws to a compact sequence of bytes. -// The result will be appended to the byte buffer in buf. -func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) { - for _, v := range ws { - if w := v.Primary(); w > 0 { - buf.key = appendPrimary(buf.key, w) - } - } - if Secondary <= c.Strength { - buf.key = append(buf.key, 0, 0) - // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. - if !c.Backwards { - for _, v := range ws { - if w := v.Secondary(); w > 0 { - buf.key = append(buf.key, uint8(w>>8), uint8(w)) - } - } - } else { - for i := len(ws) - 1; i >= 0; i-- { - if w := ws[i].Secondary(); w > 0 { - buf.key = append(buf.key, uint8(w>>8), uint8(w)) - } - } - } - } else if c.CaseLevel { - buf.key = append(buf.key, 0, 0) - } - if Tertiary <= c.Strength || c.CaseLevel { - buf.key = append(buf.key, 0, 0) - for _, v := range ws { - if w := v.Tertiary(); w > 0 { - buf.key = append(buf.key, uint8(w)) - } - } - // Derive the quaternary weights from the options and other levels. - // Note that we represent MaxQuaternary as 0xFF. The first byte of the - // representation of a primary weight is always smaller than 0xFF, - // so using this single byte value will compare correctly. - if Quaternary <= c.Strength && c.Alternate >= AltShifted { - if c.Alternate == AltShiftTrimmed { - lastNonFFFF := len(buf.key) - buf.key = append(buf.key, 0) - for _, v := range ws { - if w := v.Quaternary(); w == MaxQuaternary { - buf.key = append(buf.key, 0xFF) - } else if w > 0 { - buf.key = appendPrimary(buf.key, w) - lastNonFFFF = len(buf.key) - } - } - buf.key = buf.key[:lastNonFFFF] - } else { - buf.key = append(buf.key, 0) - for _, v := range ws { - if w := v.Quaternary(); w == MaxQuaternary { - buf.key = append(buf.key, 0xFF) - } else if w > 0 { - buf.key = appendPrimary(buf.key, w) - } - } - } - } - } -} - -func processWeights(vw AlternateHandling, top uint32, wa []Elem) { - ignore := false - vtop := int(top) - switch vw { - case AltShifted, AltShiftTrimmed: - for i := range wa { - if p := wa[i].Primary(); p <= vtop && p != 0 { - wa[i] = MakeQuaternary(p) - ignore = true - } else if p == 0 { - if ignore { - wa[i] = ceIgnore - } - } else { - ignore = false - } - } - case AltBlanked: - for i := range wa { - if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) { - wa[i] = ceIgnore - ignore = true - } else { - ignore = false - } - } - } -} |