// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package bzip2 import "sort" // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a // symbol. type huffmanTree struct { // nodes contains all the non-leaf nodes in the tree. nodes[0] is the // root of the tree and nextNode contains the index of the next element // of nodes to use when the tree is being constructed. nodes []huffmanNode nextNode int } // A huffmanNode is a node in the tree. left and right contain indexes into the // nodes slice of the tree. If left or right is invalidNodeValue then the child // is a left node and its value is in leftValue/rightValue. // // The symbols are uint16s because bzip2 encodes not only MTF indexes in the // tree, but also two magic values for run-length encoding and an EOF symbol. // Thus there are more than 256 possible symbols. type huffmanNode struct { left, right uint16 leftValue, rightValue uint16 } // invalidNodeValue is an invalid index which marks a leaf node in the tree. const invalidNodeValue = 0xffff // Decode reads bits from the given bitReader and navigates the tree until a // symbol is found. func (t *huffmanTree) Decode(br *bitReader) (v uint16) { nodeIndex := uint16(0) // node 0 is the root of the tree. for { node := &t.nodes[nodeIndex] bit, ok := br.TryReadBit() if !ok && br.ReadBit() { bit = 1 } // bzip2 encodes left as a true bit. if bit != 0 { // left if node.left == invalidNodeValue { return node.leftValue } nodeIndex = node.left } else { // right if node.right == invalidNodeValue { return node.rightValue } nodeIndex = node.right } } } // newHuffmanTree builds a Huffman tree from a slice containing the code // lengths of each symbol. The maximum code length is 32 bits. func newHuffmanTree(lengths []uint8) (huffmanTree, error) { // There are many possible trees that assign the same code length to // each symbol (consider reflecting a tree down the middle, for // example). Since the code length assignments determine the // efficiency of the tree, each of these trees is equally good. In // order to minimize the amount of information needed to build a tree // bzip2 uses a canonical tree so that it can be reconstructed given // only the code length assignments. if len(lengths) < 2 { panic("newHuffmanTree: too few symbols") } var t huffmanTree // First we sort the code length assignments by ascending code length, // using the symbol value to break ties. pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths))) for i, length := range lengths { pairs[i].value = uint16(i) pairs[i].length = length } sort.Sort(pairs) // Now we assign codes to the symbols, starting with the longest code. // We keep the codes packed into a uint32, at the most-significant end. // So branches are taken from the MSB downwards. This makes it easy to // sort them later. code := uint32(0) length := uint8(32) codes := huffmanCodes(make([]huffmanCode, len(lengths))) for i := len(pairs) - 1; i >= 0; i-- { if length > pairs[i].length { // If the code length decreases we shift in order to // zero any bits beyond the end of the code. length >>= 32 - pairs[i].length length <<= 32 - pairs[i].length length = pairs[i].length } codes[i].code = code codes[i].codeLen = length codes[i].value = pairs[i].value // We need to 'increment' the code, which means treating |code| // like a |length| bit number. code += 1 << (32 - length) } // Now we can sort by the code so that the left half of each branch are // grouped together, recursively. sort.Sort(codes) t.nodes = make([]huffmanNode, len(codes)) _, err := buildHuffmanNode(&t, codes, 0) return t, err } // huffmanSymbolLengthPair contains a symbol and its code length. type huffmanSymbolLengthPair struct { value uint16 length uint8 } // huffmanSymbolLengthPair is used to provide an interface for sorting. type huffmanSymbolLengthPairs []huffmanSymbolLengthPair func (h huffmanSymbolLengthPairs) Len() int { return len(h) } func (h huffmanSymbolLengthPairs) Less(i, j int) bool { if h[i].length < h[j].length { return true } if h[i].length > h[j].length { return false } if h[i].value < h[j].value { return true } return false } func (h huffmanSymbolLengthPairs) Swap(i, j int) { h[i], h[j] = h[j], h[i] } // huffmanCode contains a symbol, its code and code length. type huffmanCode struct { code uint32 codeLen uint8 value uint16 } // huffmanCodes is used to provide an interface for sorting. type huffmanCodes []huffmanCode func (n huffmanCodes) Len() int { return len(n) } func (n huffmanCodes) Less(i, j int) bool { return n[i].code < n[j].code } func (n huffmanCodes) Swap(i, j int) { n[i], n[j] = n[j], n[i] } // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in // the Huffman tree at the given level. It returns the index of the newly // constructed node. func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) { test := uint32(1) << (31 - level) // We have to search the list of codes to find the divide between the left and right sides. firstRightIndex := len(codes) for i, code := range codes { if code.code&test != 0 { firstRightIndex = i break } } left := codes[:firstRightIndex] right := codes[firstRightIndex:] if len(left) == 0 || len(right) == 0 { // There is a superfluous level in the Huffman tree indicating // a bug in the encoder. However, this bug has been observed in // the wild so we handle it. // If this function was called recursively then we know that // len(codes) >= 2 because, otherwise, we would have hit the // "leaf node" case, below, and not recursed. // // However, for the initial call it's possible that len(codes) // is zero or one. Both cases are invalid because a zero length // tree cannot encode anything and a length-1 tree can only // encode EOF and so is superfluous. We reject both. if len(codes) < 2 { return 0, StructuralError("empty Huffman tree") } // In this case the recursion doesn't always reduce the length // of codes so we need to ensure termination via another // mechanism. if level == 31 { // Since len(codes) >= 2 the only way that the values // can match at all 32 bits is if they are equal, which // is invalid. This ensures that we never enter // infinite recursion. return 0, StructuralError("equal symbols in Huffman tree") } if len(left) == 0 { return buildHuffmanNode(t, right, level+1) } return buildHuffmanNode(t, left, level+1) } nodeIndex = uint16(t.nextNode) node := &t.nodes[t.nextNode] t.nextNode++ if len(left) == 1 { // leaf node node.left = invalidNodeValue node.leftValue = left[0].value } else { node.left, err = buildHuffmanNode(t, left, level+1) } if err != nil { return } if len(right) == 1 { // leaf node node.right = invalidNodeValue node.rightValue = right[0].value } else { node.right, err = buildHuffmanNode(t, right, level+1) } return }