1
// Copyright 2012 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
5
// The trie in this file is used to associate the first full character
6
// in a UTF-8 string to a collation element.
7
// All but the last byte in a UTF-8 byte sequence are
8
// used to look up offsets in the index table to be used for the next byte.
9
// The last byte is used to index into a table of collation elements.
10
// This file contains the code for the generation of the trie.
23
blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes.
26
type trieHandle struct {
27
lookupStart uint16 // offset in table for first byte
28
valueStart uint16 // offset in table for first byte
36
// trieNode is the intermediate trie structure used for generating a trie.
37
type trieNode struct {
45
func newNode() *trieNode {
47
index: make([]*trieNode, 64),
48
value: make([]uint32, 128), // root node size is 128 instead of 64
52
func (n *trieNode) isInternal() bool {
56
func (n *trieNode) insert(r rune, value uint32) {
57
const maskx = 0x3F // mask out two most-significant bits
60
n.value[str[0]] = value
63
for i := 0; i < len(str)-1; i++ {
66
n.index = make([]*trieNode, blockSize)
77
n.value = make([]uint32, blockSize)
79
b := str[len(str)-1] & maskx
83
type trieBuilder struct {
88
lookupBlocks []*trieNode
89
valueBlocks []*trieNode
91
lookupBlockIdx map[uint32]*trieNode
92
valueBlockIdx map[uint32]*trieNode
95
func newTrieBuilder() *trieBuilder {
96
index := &trieBuilder{}
97
index.lookupBlocks = make([]*trieNode, 0)
98
index.valueBlocks = make([]*trieNode, 0)
99
index.lookupBlockIdx = make(map[uint32]*trieNode)
100
index.valueBlockIdx = make(map[uint32]*trieNode)
101
// The third nil is the default null block. The other two blocks
102
// are used to guarantee an offset of at least 3 for each block.
103
index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil)
108
func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode {
109
hasher := fnv.New32()
111
for i, nn := range n.index {
114
nn = b.computeOffsets(nn)
119
hasher.Write([]byte{byte(vi >> 8), byte(vi)})
120
hasher.Write([]byte{byte(vv >> 8), byte(vv)})
123
nn, ok := b.lookupBlockIdx[h]
125
n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset
126
b.lookupBlocks = append(b.lookupBlocks, n)
127
b.lookupBlockIdx[h] = n
132
for _, v := range n.value {
133
hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
136
nn, ok := b.valueBlockIdx[h]
138
n.refValue = uint16(len(b.valueBlocks)) - blockOffset
139
n.refIndex = n.refValue
140
b.valueBlocks = append(b.valueBlocks, n)
141
b.valueBlockIdx[h] = n
149
func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 {
150
hasher := fnv.New32()
151
for _, v := range n.value[:2*blockSize] {
152
hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
155
nn, ok := b.valueBlockIdx[h]
157
n.refValue = uint16(len(b.valueBlocks))
158
n.refIndex = n.refValue
159
b.valueBlocks = append(b.valueBlocks, n)
160
// Add a dummy block to accommodate the double block size.
161
b.valueBlocks = append(b.valueBlocks, nil)
162
b.valueBlockIdx[h] = n
169
func genValueBlock(t *trie, n *trieNode) {
171
for _, v := range n.value {
172
t.values = append(t.values, v)
177
func genLookupBlock(t *trie, n *trieNode) {
178
for _, nn := range n.index {
187
t.index = append(t.index, v)
191
func (b *trieBuilder) addTrie(n *trieNode) *trieHandle {
193
b.roots = append(b.roots, h)
194
h.valueStart = b.addStartValueBlock(n)
195
if len(b.roots) == 1 {
196
// We insert a null block after the first start value block.
197
// This ensures that continuation bytes UTF-8 sequences of length
198
// greater than 2 will automatically hit a null block if there
199
// was an undefined entry.
200
b.valueBlocks = append(b.valueBlocks, nil)
202
n = b.computeOffsets(n)
203
// Offset by one extra block as the first byte starts at 0xC0 instead of 0x80.
204
h.lookupStart = n.refIndex - 1
208
// generate generates and returns the trie for n.
209
func (b *trieBuilder) generate() (t *trie, err error) {
211
if len(b.valueBlocks) >= 1<<16 {
212
return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16)
214
if len(b.lookupBlocks) >= 1<<16 {
215
return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16)
217
genValueBlock(t, b.valueBlocks[0])
218
genValueBlock(t, &trieNode{value: make([]uint32, 64)})
219
for i := 2; i < len(b.valueBlocks); i++ {
220
genValueBlock(t, b.valueBlocks[i])
222
n := &trieNode{index: make([]*trieNode, 64)}
226
for i := 3; i < len(b.lookupBlocks); i++ {
227
genLookupBlock(t, b.lookupBlocks[i])
232
func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) {
233
p := func(f string, a ...interface{}) {
234
nn, e := fmt.Fprintf(w, f, a...)
241
p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4)
242
p("// Block 2 is the null block.\n")
243
p("var %sValues = [%d]uint32 {", name, nv)
244
var printnewline bool
245
for i, v := range t.values {
246
if i%blockSize == 0 {
247
p("\n\t// Block %#x, offset %#x", i/blockSize, i)
257
p("%#04x:%#08x, ", i, v)
262
p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2)
263
p("// Block 0 is the null block.\n")
264
p("var %sLookup = [%d]uint16 {", name, ni)
266
for i, v := range t.index {
267
if i%blockSize == 0 {
268
p("\n\t// Block %#x, offset %#x", i/blockSize, i)
278
p("%#03x:%#02x, ", i, v)
282
return n, nv*4 + ni*2, err
285
func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) {
286
const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}"
287
n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name)
288
sz += int(reflect.TypeOf(trie{}).Size())