summaryrefslogtreecommitdiff
path: root/bzrlib/chunk_writer.py
blob: a9f3b851d4c46be2b7dd2b939f164cedd26901c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#

"""ChunkWriter: write compressed data out with a fixed upper bound."""

from __future__ import absolute_import

import zlib
from zlib import Z_FINISH, Z_SYNC_FLUSH


class ChunkWriter(object):
    """ChunkWriter allows writing of compressed data with a fixed size.

    If less data is supplied than fills a chunk, the chunk is padded with
    NULL bytes. If more data is supplied, then the writer packs as much
    in as it can, but never splits any item it was given.

    The algorithm for packing is open to improvement! Current it is:
     - write the bytes given
     - if the total seen bytes so far exceeds the chunk size, flush.

    :cvar _max_repack: To fit the maximum number of entries into a node, we
        will sometimes start over and compress the whole list to get tighter
        packing. We get diminishing returns after a while, so this limits the
        number of times we will try.
        The default is to try to avoid recompressing entirely, but setting this
        to something like 20 will give maximum compression.

    :cvar _max_zsync: Another tunable nob. If _max_repack is set to 0, then you
        can limit the number of times we will try to pack more data into a
        node. This allows us to do a single compression pass, rather than
        trying until we overflow, and then recompressing again.
    """
    #    In testing, some values for bzr.dev::
    #        repack  time  MB   max   full
    #         1       7.5  4.6  1140  0
    #         2       8.4  4.2  1036  1
    #         3       9.8  4.1  1012  278
    #         4      10.8  4.1  728   945
    #        20      11.1  4.1  0     1012
    #        repack = 0
    #        zsync   time  MB    repack  stop_for_z
    #         0       5.0  24.7  0       6270
    #         1       4.3  13.2  0       3342
    #         2       4.9   9.6  0       2414
    #         5       4.8   6.2  0       1549
    #         6       4.8   5.8  1       1435
    #         7       4.8   5.5  19      1337
    #         8       4.4   5.3  81      1220
    #        10       5.3   5.0  260     967
    #        11       5.3   4.9  366     839
    #        12       5.1   4.8  454     731
    #        15       5.8   4.7  704     450
    #        20       5.8   4.6  1133    7

    #    In testing, some values for mysql-unpacked::
    #                next_bytes estim
    #        repack  time  MB    full    stop_for_repack
    #         1            15.4  0       3913
    #         2      35.4  13.7  0       346
    #        20      46.7  13.4  3380    0
    #        repack=0
    #        zsync                       stop_for_z
    #         0      29.5 116.5  0       29782
    #         1      27.8  60.2  0       15356
    #         2      27.8  42.4  0       10822
    #         5      26.8  25.5  0       6491
    #         6      27.3  23.2  13      5896
    #         7      27.5  21.6  29      5451
    #         8      27.1  20.3  52      5108
    #        10      29.4  18.6  195     4526
    #        11      29.2  18.0  421     4143
    #        12      28.0  17.5  702     3738
    #        15      28.9  16.5  1223    2969
    #        20      29.6  15.7  2182    1810
    #        30      31.4  15.4  3891    23

    # Tuple of (num_repack_attempts, num_zsync_attempts)
    # num_zsync_attempts only has meaning if num_repack_attempts is 0.
    _repack_opts_for_speed = (0, 8)
    _repack_opts_for_size = (20, 0)

    def __init__(self, chunk_size, reserved=0, optimize_for_size=False):
        """Create a ChunkWriter to write chunk_size chunks.

        :param chunk_size: The total byte count to emit at the end of the
            chunk.
        :param reserved: How many bytes to allow for reserved data. reserved
            data space can only be written to via the write(..., reserved=True).
        """
        self.chunk_size = chunk_size
        self.compressor = zlib.compressobj()
        self.bytes_in = []
        self.bytes_list = []
        self.bytes_out_len = 0
        # bytes that have been seen, but not included in a flush to out yet
        self.unflushed_in_bytes = 0
        self.num_repack = 0
        self.num_zsync = 0
        self.unused_bytes = None
        self.reserved_size = reserved
        # Default is to make building fast rather than compact
        self.set_optimize(for_size=optimize_for_size)

    def finish(self):
        """Finish the chunk.

        This returns the final compressed chunk, and either None, or the
        bytes that did not fit in the chunk.

        :return: (compressed_bytes, unused_bytes, num_nulls_needed)

            * compressed_bytes: a list of bytes that were output from the
              compressor. If the compressed length was not exactly chunk_size,
              the final string will be a string of all null bytes to pad this
              to chunk_size
            * unused_bytes: None, or the last bytes that were added, which we
              could not fit.
            * num_nulls_needed: How many nulls are padded at the end
        """
        self.bytes_in = None # Free the data cached so far, we don't need it
        out = self.compressor.flush(Z_FINISH)
        self.bytes_list.append(out)
        self.bytes_out_len += len(out)

        if self.bytes_out_len > self.chunk_size:
            raise AssertionError('Somehow we ended up with too much'
                                 ' compressed data, %d > %d'
                                 % (self.bytes_out_len, self.chunk_size))
        nulls_needed = self.chunk_size - self.bytes_out_len
        if nulls_needed:
            self.bytes_list.append("\x00" * nulls_needed)
        return self.bytes_list, self.unused_bytes, nulls_needed

    def set_optimize(self, for_size=True):
        """Change how we optimize our writes.

        :param for_size: If True, optimize for minimum space usage, otherwise
            optimize for fastest writing speed.
        :return: None
        """
        if for_size:
            opts = ChunkWriter._repack_opts_for_size
        else:
            opts = ChunkWriter._repack_opts_for_speed
        self._max_repack, self._max_zsync = opts

    def _recompress_all_bytes_in(self, extra_bytes=None):
        """Recompress the current bytes_in, and optionally more.

        :param extra_bytes: Optional, if supplied we will add it with
            Z_SYNC_FLUSH
        :return: (bytes_out, bytes_out_len, alt_compressed)

            * bytes_out: is the compressed bytes returned from the compressor
            * bytes_out_len: the length of the compressed output
            * compressor: An object with everything packed in so far, and
              Z_SYNC_FLUSH called.
        """
        compressor = zlib.compressobj()
        bytes_out = []
        append = bytes_out.append
        compress = compressor.compress
        for accepted_bytes in self.bytes_in:
            out = compress(accepted_bytes)
            if out:
                append(out)
        if extra_bytes:
            out = compress(extra_bytes)
            out += compressor.flush(Z_SYNC_FLUSH)
            append(out)
        bytes_out_len = sum(map(len, bytes_out))
        return bytes_out, bytes_out_len, compressor

    def write(self, bytes, reserved=False):
        """Write some bytes to the chunk.

        If the bytes fit, False is returned. Otherwise True is returned
        and the bytes have not been added to the chunk.

        :param bytes: The bytes to include
        :param reserved: If True, we can use the space reserved in the
            constructor.
        """
        if self.num_repack > self._max_repack and not reserved:
            self.unused_bytes = bytes
            return True
        if reserved:
            capacity = self.chunk_size
        else:
            capacity = self.chunk_size - self.reserved_size
        comp = self.compressor

        # Check to see if the currently unflushed bytes would fit with a bit of
        # room to spare, assuming no compression.
        next_unflushed = self.unflushed_in_bytes + len(bytes)
        remaining_capacity = capacity - self.bytes_out_len - 10
        if (next_unflushed < remaining_capacity):
            # looks like it will fit
            out = comp.compress(bytes)
            if out:
                self.bytes_list.append(out)
                self.bytes_out_len += len(out)
            self.bytes_in.append(bytes)
            self.unflushed_in_bytes += len(bytes)
        else:
            # This may or may not fit, try to add it with Z_SYNC_FLUSH
            # Note: It is tempting to do this as a look-ahead pass, and to
            #       'copy()' the compressor before flushing. However, it seems
            #       that Which means that it is the same thing as increasing
            #       repack, similar cost, same benefit. And this way we still
            #       have the 'repack' knob that can be adjusted, and not depend
            #       on a platform-specific 'copy()' function.
            self.num_zsync += 1
            if self._max_repack == 0 and self.num_zsync > self._max_zsync:
                self.num_repack += 1
                self.unused_bytes = bytes
                return True
            out = comp.compress(bytes)
            out += comp.flush(Z_SYNC_FLUSH)
            self.unflushed_in_bytes = 0
            if out:
                self.bytes_list.append(out)
                self.bytes_out_len += len(out)

            # We are a bit extra conservative, because it seems that you *can*
            # get better compression with Z_SYNC_FLUSH than a full compress. It
            # is probably very rare, but we were able to trigger it.
            if self.num_repack == 0:
                safety_margin = 100
            else:
                safety_margin = 10
            if self.bytes_out_len + safety_margin <= capacity:
                # It fit, so mark it added
                self.bytes_in.append(bytes)
            else:
                # We are over budget, try to squeeze this in without any
                # Z_SYNC_FLUSH calls
                self.num_repack += 1
                (bytes_out, this_len,
                 compressor) = self._recompress_all_bytes_in(bytes)
                if self.num_repack >= self._max_repack:
                    # When we get *to* _max_repack, bump over so that the
                    # earlier > _max_repack will be triggered.
                    self.num_repack += 1
                if this_len + 10 > capacity:
                    (bytes_out, this_len,
                     compressor) = self._recompress_all_bytes_in()
                    self.compressor = compressor
                    # Force us to not allow more data
                    self.num_repack = self._max_repack + 1
                    self.bytes_list = bytes_out
                    self.bytes_out_len = this_len
                    self.unused_bytes = bytes
                    return True
                else:
                    # This fits when we pack it tighter, so use the new packing
                    self.compressor = compressor
                    self.bytes_in.append(bytes)
                    self.bytes_list = bytes_out
                    self.bytes_out_len = this_len
        return False