1 files changed, 290 insertions, 116 deletions
diff --git a/libgo/go/compress/flate/deflate.go b/libgo/go/compress/flate/deflate.go
index 169a0c7b2e..4d6a5357d8 100644
--- a/libgo/go/compress/flate/deflate.go
+++ b/libgo/go/compress/flate/deflate.go
@@ -13,59 +13,81 @@ import (
 const (
 	NoCompression      = 0
 	BestSpeed          = 1
-	fastCompression    = 3
 	BestCompression    = 9
 	DefaultCompression = -1
-	logWindowSize      = 15
-	windowSize         = 1 << logWindowSize
-	windowMask         = windowSize - 1
-	logMaxOffsetSize   = 15  // Standard DEFLATE
-	minMatchLength     = 3   // The smallest match that the compressor looks for
-	maxMatchLength     = 258 // The longest match for the compressor
-	minOffsetSize      = 1   // The shortest offset that makes any sense
-
-	// The maximum number of tokens we put into a single flat block, just to
+
+	// HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
+	// entropy encoding. This mode is useful in compressing data that has
+	// already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
+	// that lacks an entropy encoder. Compression gains are achieved when
+	// certain bytes in the input stream occur more frequently than others.
+	//
+	// Note that HuffmanOnly produces a compressed output that is
+	// RFC 1951 compliant. That is, any valid DEFLATE decompressor will
+	// continue to be able to decompress this output.
+	HuffmanOnly = -2
+)
+
+const (
+	logWindowSize = 15
+	windowSize    = 1 << logWindowSize
+	windowMask    = windowSize - 1
+
+	// The LZ77 step produces a sequence of literal tokens and <length, offset>
+	// pair tokens. The offset is also known as distance. The underlying wire
+	// format limits the range of lengths and offsets. For example, there are
+	// 256 legitimate lengths: those in the range [3, 258]. This package's
+	// compressor uses a higher minimum match length, enabling optimizations
+	// such as finding matches via 32-bit loads and compares.
+	baseMatchLength = 3       // The smallest match length per the RFC section 3.2.5
+	minMatchLength  = 4       // The smallest match length that the compressor actually emits
+	maxMatchLength  = 258     // The largest match length
+	baseMatchOffset = 1       // The smallest match offset
+	maxMatchOffset  = 1 << 15 // The largest match offset
+
+	// The maximum number of tokens we put into a single flate block, just to
 	// stop things from getting too large.
 	maxFlateBlockTokens = 1 << 14
 	maxStoreBlockSize   = 65535
-	hashBits            = 17
+	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
 	hashMask            = (1 << hashBits) - 1
-	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
 	maxHashOffset       = 1 << 24
 
 	skipNever = math.MaxInt32
 )
 
 type compressionLevel struct {
-	good, lazy, nice, chain, fastSkipHashing int
+	level, good, lazy, nice, chain, fastSkipHashing int
 }
 
 var levels = []compressionLevel{
-	{}, // 0
-	// For levels 1-3 we don't bother trying with lazy matches
-	{3, 0, 8, 4, 4},
-	{3, 0, 16, 8, 5},
-	{3, 0, 32, 32, 6},
+	{0, 0, 0, 0, 0, 0}, // NoCompression.
+	{1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go.
+	// For levels 2-3 we don't bother trying with lazy matches.
+	{2, 4, 0, 16, 8, 5},
+	{3, 4, 0, 32, 32, 6},
 	// Levels 4-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{4, 4, 16, 16, skipNever},
-	{8, 16, 32, 32, skipNever},
-	{8, 16, 128, 128, skipNever},
-	{8, 32, 128, 256, skipNever},
-	{32, 128, 258, 1024, skipNever},
-	{32, 258, 258, 4096, skipNever},
+	{4, 4, 4, 16, 16, skipNever},
+	{5, 8, 16, 32, 32, skipNever},
+	{6, 8, 16, 128, 128, skipNever},
+	{7, 8, 32, 128, 256, skipNever},
+	{8, 32, 128, 258, 1024, skipNever},
+	{9, 32, 258, 258, 4096, skipNever},
 }
 
 type compressor struct {
 	compressionLevel
 
-	w *huffmanBitWriter
+	w          *huffmanBitWriter
+	bulkHasher func([]byte, []uint32)
 
 	// compression algorithm
-	fill func(*compressor, []byte) int // copy data to window
-	step func(*compressor)             // process window
-	sync bool                          // requesting flush
+	fill      func(*compressor, []byte) int // copy data to window
+	step      func(*compressor)             // process window
+	sync      bool                          // requesting flush
+	bestSpeed *deflateFast                  // Encoder for BestSpeed
 
 	// Input hash chains
 	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
@@ -73,8 +95,8 @@ type compressor struct {
 	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
 	// with the same hash value.
 	chainHead  int
-	hashHead   []int
-	hashPrev   []int
+	hashHead   [hashSize]uint32
+	hashPrev   [windowSize]uint32
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
@@ -90,9 +112,12 @@ type compressor struct {
 	// deflate state
 	length         int
 	offset         int
-	hash           int
+	hash           uint32
 	maxInsertIndex int
 	err            error
+
+	// hashMatch must be able to contain hashes for the maximum match length.
+	hashMatch [maxMatchLength - 1]uint32
 }
 
 func (d *compressor) fillDeflate(b []byte) int {
@@ -111,16 +136,19 @@ func (d *compressor) fillDeflate(b []byte) int {
 			delta := d.hashOffset - 1
 			d.hashOffset -= delta
 			d.chainHead -= delta
-			for i, v := range d.hashPrev {
-				if v > delta {
-					d.hashPrev[i] -= delta
+
+			// Iterate over slices instead of arrays to avoid copying
+			// the entire table onto the stack (Issue #18625).
+			for i, v := range d.hashPrev[:] {
+				if int(v) > delta {
+					d.hashPrev[i] = uint32(int(v) - delta)
 				} else {
 					d.hashPrev[i] = 0
 				}
 			}
-			for i, v := range d.hashHead {
-				if v > delta {
-					d.hashHead[i] -= delta
+			for i, v := range d.hashHead[:] {
+				if int(v) > delta {
+					d.hashHead[i] = uint32(int(v) - delta)
 				} else {
 					d.hashHead[i] = 0
 				}
@@ -132,19 +160,74 @@ func (d *compressor) fillDeflate(b []byte) int {
 	return n
 }
 
-func (d *compressor) writeBlock(tokens []token, index int, eof bool) error {
-	if index > 0 || eof {
+func (d *compressor) writeBlock(tokens []token, index int) error {
+	if index > 0 {
 		var window []byte
 		if d.blockStart <= index {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tokens, eof, window)
+		d.w.writeBlock(tokens, false, window)
 		return d.w.err
 	}
 	return nil
 }
 
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill window if we are in store-only mode.
+	if d.compressionLevel.level < 2 {
+		return
+	}
+	if d.index != 0 || d.windowEnd != 0 {
+		panic("internal error: fillWindow called with stale data")
+	}
+
+	// If we are given too much, cut it.
+	if len(b) > windowSize {
+		b = b[len(b)-windowSize:]
+	}
+	// Add all to window.
+	n := copy(d.window, b)
+
+	// Calculate 256 hashes at the time (more L1 cache hits)
+	loops := (n + 256 - minMatchLength) / 256
+	for j := 0; j < loops; j++ {
+		index := j * 256
+		end := index + 256 + minMatchLength - 1
+		if end > n {
+			end = n
+		}
+		toCheck := d.window[index:end]
+		dstSize := len(toCheck) - minMatchLength + 1
+
+		if dstSize <= 0 {
+			continue
+		}
+
+		dst := d.hashMatch[:dstSize]
+		d.bulkHasher(toCheck, dst)
+		var newH uint32
+		for i, val := range dst {
+			di := i + index
+			newH = val
+			hh := &d.hashHead[newH&hashMask]
+			// Get previous value with the same hash.
+			// Our chain should point to the previous value.
+			d.hashPrev[di&windowMask] = *hh
+			// Set the head of the hash chain to us.
+			*hh = uint32(di + d.hashOffset)
+		}
+		d.hash = newH
+	}
+	// Update window information.
+	d.windowEnd = n
+	d.index = n
+}
+
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
@@ -168,20 +251,15 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 		tries >>= 2
 	}
 
-	w0 := win[pos]
-	w1 := win[pos+1]
 	wEnd := win[pos+length]
+	wPos := win[pos:]
 	minIndex := pos - windowSize
 
 	for i := prevHead; tries > 0; tries-- {
-		if w0 == win[i] && w1 == win[i+1] && wEnd == win[i+length] {
-			// The hash function ensures that if win[i] and win[i+1] match, win[i+2] matches
+		if wEnd == win[i+length] {
+			n := matchLen(win[i:], wPos, minMatchLook)
 
-			n := 3
-			for pos+n < len(win) && win[i+n] == win[pos+n] {
-				n++
-			}
-			if n > length && (n > 3 || pos-i <= 4096) {
+			if n > length && (n > minMatchLength || pos-i <= 4096) {
 				length = n
 				offset = pos - i
 				ok = true
@@ -196,7 +274,8 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 			// hashPrev[i & windowMask] has already been overwritten, so stop now.
 			break
 		}
-		if i = d.hashPrev[i&windowMask] - d.hashOffset; i < minIndex || i < 0 {
+		i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+		if i < minIndex || i < 0 {
 			break
 		}
 	}
@@ -211,9 +290,85 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 	return d.w.err
 }
 
+const hashmul = 0x1e35a7bd
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+	return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+	if len(b) < minMatchLength {
+		return
+	}
+	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	dst[0] = (hb * hashmul) >> (32 - hashBits)
+	end := len(b) - minMatchLength + 1
+	for i := 1; i < end; i++ {
+		hb = (hb << 8) | uint32(b[i+3])
+		dst[i] = (hb * hashmul) >> (32 - hashBits)
+	}
+}
+
+// matchLen returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+func matchLen(a, b []byte, max int) int {
+	a = a[:max]
+	b = b[:len(a)]
+	for i, av := range a {
+		if b[i] != av {
+			return i
+		}
+	}
+	return max
+}
+
+// encSpeed will compress and store the currently added data,
+// if enough has been accumulated or we at the end of the stream.
+// Any error that occurred will be in d.err
+func (d *compressor) encSpeed() {
+	// We only compress if we have maxStoreBlockSize.
+	if d.windowEnd < maxStoreBlockSize {
+		if !d.sync {
+			return
+		}
+
+		// Handle small sizes.
+		if d.windowEnd < 128 {
+			switch {
+			case d.windowEnd == 0:
+				return
+			case d.windowEnd <= 16:
+				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+			default:
+				d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+				d.err = d.w.err
+			}
+			d.windowEnd = 0
+			d.bestSpeed.reset()
+			return
+		}
+
+	}
+	// Encode the block.
+	d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd])
+
+	// If we removed less than 1/16th, Huffman compress the block.
+	if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
+		d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+	} else {
+		d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd])
+	}
+	d.err = d.w.err
+	d.windowEnd = 0
+}
+
 func (d *compressor) initDeflate() {
-	d.hashHead = make([]int, hashSize)
-	d.hashPrev = make([]int, windowSize)
 	d.window = make([]byte, 2*windowSize)
 	d.hashOffset = 1
 	d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -223,6 +378,7 @@ func (d *compressor) initDeflate() {
 	d.index = 0
 	d.hash = 0
 	d.chainHead = -1
+	d.bulkHasher = bulkHash4
 }
 
 func (d *compressor) deflate() {
@@ -232,7 +388,7 @@ func (d *compressor) deflate() {
 
 	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
 	if d.index < d.maxInsertIndex {
-		d.hash = int(d.window[d.index])<<hashShift + int(d.window[d.index+1])
+		d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 	}
 
 Loop:
@@ -256,7 +412,7 @@ Loop:
 					d.byteAvailable = false
 				}
 				if len(d.tokens) > 0 {
-					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+					if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
 						return
 					}
 					d.tokens = d.tokens[:0]
@@ -266,10 +422,11 @@ Loop:
 		}
 		if d.index < d.maxInsertIndex {
 			// Update the hash
-			d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
-			d.chainHead = d.hashHead[d.hash]
-			d.hashPrev[d.index&windowMask] = d.chainHead
-			d.hashHead[d.hash] = d.index + d.hashOffset
+			d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+			hh := &d.hashHead[d.hash&hashMask]
+			d.chainHead = int(*hh)
+			d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
+			*hh = uint32(d.index + d.hashOffset)
 		}
 		prevLength := d.length
 		prevOffset := d.offset
@@ -293,9 +450,9 @@ Loop:
 			// There was a match at the previous step, and the current match is
 			// not better. Output the previous match.
 			if d.fastSkipHashing != skipNever {
-				d.tokens = append(d.tokens, matchToken(uint32(d.length-minMatchLength), uint32(d.offset-minOffsetSize)))
+				d.tokens = append(d.tokens, matchToken(uint32(d.length-baseMatchLength), uint32(d.offset-baseMatchOffset)))
 			} else {
-				d.tokens = append(d.tokens, matchToken(uint32(prevLength-minMatchLength), uint32(prevOffset-minOffsetSize)))
+				d.tokens = append(d.tokens, matchToken(uint32(prevLength-baseMatchLength), uint32(prevOffset-baseMatchOffset)))
 			}
 			// Insert in the hash table all strings up to the end of the match.
 			// index and index-1 are already inserted. If there is not enough
@@ -310,12 +467,13 @@ Loop:
 				}
 				for d.index++; d.index < newIndex; d.index++ {
 					if d.index < d.maxInsertIndex {
-						d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
+						d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 						// Get previous value with the same hash.
 						// Our chain should point to the previous value.
-						d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
+						hh := &d.hashHead[d.hash&hashMask]
+						d.hashPrev[d.index&windowMask] = *hh
 						// Set the head of the hash chain to us.
-						d.hashHead[d.hash] = d.index + d.hashOffset
+						*hh = uint32(d.index + d.hashOffset)
 					}
 				}
 				if d.fastSkipHashing == skipNever {
@@ -327,12 +485,12 @@ Loop:
 				// item into the table.
 				d.index += d.length
 				if d.index < d.maxInsertIndex {
-					d.hash = (int(d.window[d.index])<<hashShift + int(d.window[d.index+1]))
+					d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 				}
 			}
 			if len(d.tokens) == maxFlateBlockTokens {
 				// The block includes the current character
-				if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+				if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
 					return
 				}
 				d.tokens = d.tokens[:0]
@@ -345,7 +503,7 @@ Loop:
 				}
 				d.tokens = append(d.tokens, literalToken(uint32(d.window[i])))
 				if len(d.tokens) == maxFlateBlockTokens {
-					if d.err = d.writeBlock(d.tokens, i+1, false); d.err != nil {
+					if d.err = d.writeBlock(d.tokens, i+1); d.err != nil {
 						return
 					}
 					d.tokens = d.tokens[:0]
@@ -366,23 +524,43 @@ func (d *compressor) fillStore(b []byte) int {
 }
 
 func (d *compressor) store() {
-	if d.windowEnd > 0 {
+	if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
 		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		d.windowEnd = 0
 	}
+}
+
+// storeHuff compresses and stores the currently added data
+// when the d.window is full or we are at the end of the stream.
+// Any error that occurred will be in d.err
+func (d *compressor) storeHuff() {
+	if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
+		return
+	}
+	d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+	d.err = d.w.err
 	d.windowEnd = 0
 }
 
 func (d *compressor) write(b []byte) (n int, err error) {
+	if d.err != nil {
+		return 0, d.err
+	}
 	n = len(b)
-	b = b[d.fill(d, b):]
 	for len(b) > 0 {
 		d.step(d)
 		b = b[d.fill(d, b):]
+		if d.err != nil {
+			return 0, d.err
+		}
 	}
-	return n, d.err
+	return n, nil
 }
 
 func (d *compressor) syncFlush() error {
+	if d.err != nil {
+		return d.err
+	}
 	d.sync = true
 	d.step(d)
 	if d.err == nil {
@@ -402,56 +580,53 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillStore
 		d.step = (*compressor).store
+	case level == HuffmanOnly:
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillStore
+		d.step = (*compressor).storeHuff
+	case level == BestSpeed:
+		d.compressionLevel = levels[level]
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillStore
+		d.step = (*compressor).encSpeed
+		d.bestSpeed = newDeflateFast()
+		d.tokens = make([]token, maxStoreBlockSize)
 	case level == DefaultCompression:
 		level = 6
 		fallthrough
-	case 1 <= level && level <= 9:
+	case 2 <= level && level <= 9:
 		d.compressionLevel = levels[level]
 		d.initDeflate()
 		d.fill = (*compressor).fillDeflate
 		d.step = (*compressor).deflate
 	default:
-		return fmt.Errorf("flate: invalid compression level %d: want value in range [-1, 9]", level)
+		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
 	}
 	return nil
 }
 
-var zeroes [32]int
-var bzeroes [256]byte
-
 func (d *compressor) reset(w io.Writer) {
 	d.w.reset(w)
 	d.sync = false
 	d.err = nil
-	switch d.compressionLevel.chain {
-	case 0:
-		// level was NoCompression.
-		for i := range d.window {
-			d.window[i] = 0
-		}
+	switch d.compressionLevel.level {
+	case NoCompression:
+		d.windowEnd = 0
+	case BestSpeed:
 		d.windowEnd = 0
+		d.tokens = d.tokens[:0]
+		d.bestSpeed.reset()
 	default:
 		d.chainHead = -1
-		for s := d.hashHead; len(s) > 0; {
-			n := copy(s, zeroes[:])
-			s = s[n:]
+		for i := range d.hashHead {
+			d.hashHead[i] = 0
 		}
-		for s := d.hashPrev; len(s) > 0; s = s[len(zeroes):] {
-			copy(s, zeroes[:])
+		for i := range d.hashPrev {
+			d.hashPrev[i] = 0
 		}
 		d.hashOffset = 1
-
 		d.index, d.windowEnd = 0, 0
-		for s := d.window; len(s) > 0; {
-			n := copy(s, bzeroes[:])
-			s = s[n:]
-		}
 		d.blockStart, d.byteAvailable = 0, false
-
-		d.tokens = d.tokens[:maxFlateBlockTokens+1]
-		for i := 0; i <= maxFlateBlockTokens; i++ {
-			d.tokens[i] = 0
-		}
 		d.tokens = d.tokens[:0]
 		d.length = minMatchLength - 1
 		d.offset = 0
@@ -461,6 +636,9 @@ func (d *compressor) reset(w io.Writer) {
 }
 
 func (d *compressor) close() error {
+	if d.err != nil {
+		return d.err
+	}
 	d.sync = true
 	d.step(d)
 	if d.err != nil {
@@ -477,10 +655,13 @@ func (d *compressor) close() error {
 // Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression);
 // higher levels typically run slower but compress more. Level 0
 // (NoCompression) does not attempt any compression; it only adds the
-// necessary DEFLATE framing. Level -1 (DefaultCompression) uses the default
-// compression level.
+// necessary DEFLATE framing.
+// Level -1 (DefaultCompression) uses the default compression level.
+// Level -2 (HuffmanOnly) will use Huffman compression only, giving
+// a very fast compression for all types of input, but sacrificing considerable
+// compression efficiency.
 //
-// If level is in the range [-1, 9] then the error returned will be nil.
+// If level is in the range [-2, 9] then the error returned will be nil.
 // Otherwise the error returned will be non-nil.
 func NewWriter(w io.Writer, level int) (*Writer, error) {
 	var dw Writer
@@ -491,34 +672,28 @@ func NewWriter(w io.Writer, level int) (*Writer, error) {
 }
 
 // NewWriterDict is like NewWriter but initializes the new
-// Writer with a preset dictionary.  The returned Writer behaves
+// Writer with a preset dictionary. The returned Writer behaves
 // as if the dictionary had been written to it without producing
-// any compressed output.  The compressed data written to w
+// any compressed output. The compressed data written to w
 // can only be decompressed by a Reader initialized with the
 // same dictionary.
 func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
-	dw := &dictWriter{w, false}
+	dw := &dictWriter{w}
 	zw, err := NewWriter(dw, level)
 	if err != nil {
 		return nil, err
 	}
-	zw.Write(dict)
-	zw.Flush()
-	dw.enabled = true
+	zw.d.fillWindow(dict)
 	zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
 	return zw, err
 }
 
 type dictWriter struct {
-	w       io.Writer
-	enabled bool
+	w io.Writer
 }
 
 func (w *dictWriter) Write(b []byte) (n int, err error) {
-	if w.enabled {
-		return w.w.Write(b)
-	}
-	return len(b), nil
+	return w.w.Write(b)
 }
 
 // A Writer takes data written to it and writes the compressed
@@ -534,10 +709,12 @@ func (w *Writer) Write(data []byte) (n int, err error) {
 	return w.d.write(data)
 }
 
-// Flush flushes any pending compressed data to the underlying writer.
+// Flush flushes any pending data to the underlying writer.
 // It is useful mainly in compressed network protocols, to ensure that
 // a remote reader has enough data to reconstruct a packet.
 // Flush does not return until the data has been written.
+// Calling Flush when there is no pending data still causes the Writer
+// to emit a sync marker of at least 4 bytes.
 // If the underlying writer returns an error, Flush returns that error.
 //
 // In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
@@ -556,14 +733,11 @@ func (w *Writer) Close() error {
 // the result of NewWriter or NewWriterDict called with dst
 // and w's level and dictionary.
 func (w *Writer) Reset(dst io.Writer) {
-	if dw, ok := w.d.w.w.(*dictWriter); ok {
+	if dw, ok := w.d.w.writer.(*dictWriter); ok {
 		// w was created with NewWriterDict
 		dw.w = dst
 		w.d.reset(dw)
-		dw.enabled = false
-		w.Write(w.dict)
-		w.Flush()
-		dw.enabled = true
+		w.d.fillWindow(w.dict)
 	} else {
 		// w was created with NewWriter
 		w.d.reset(dst)