bzrlib/_groupcompress_py.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468

# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Python version of compiled extensions for doing compression.

We separate the implementation from the groupcompress.py to avoid importing
useless stuff.
"""

from __future__ import absolute_import

from bzrlib import osutils


class _OutputHandler(object):
    """A simple class which just tracks how to split up an insert request."""

    def __init__(self, out_lines, index_lines, min_len_to_index):
        self.out_lines = out_lines
        self.index_lines = index_lines
        self.min_len_to_index = min_len_to_index
        self.cur_insert_lines = []
        self.cur_insert_len = 0

    def add_copy(self, start_byte, end_byte):
        # The data stream allows >64kB in a copy, but to match the compiled
        # code, we will also limit it to a 64kB copy
        for start_byte in xrange(start_byte, end_byte, 64*1024):
            num_bytes = min(64*1024, end_byte - start_byte)
            copy_bytes = encode_copy_instruction(start_byte, num_bytes)
            self.out_lines.append(copy_bytes)
            self.index_lines.append(False)

    def _flush_insert(self):
        if not self.cur_insert_lines:
            return
        if self.cur_insert_len > 127:
            raise AssertionError('We cannot insert more than 127 bytes'
                                 ' at a time.')
        self.out_lines.append(chr(self.cur_insert_len))
        self.index_lines.append(False)
        self.out_lines.extend(self.cur_insert_lines)
        if self.cur_insert_len < self.min_len_to_index:
            self.index_lines.extend([False]*len(self.cur_insert_lines))
        else:
            self.index_lines.extend([True]*len(self.cur_insert_lines))
        self.cur_insert_lines = []
        self.cur_insert_len = 0

    def _insert_long_line(self, line):
        # Flush out anything pending
        self._flush_insert()
        line_len = len(line)
        for start_index in xrange(0, line_len, 127):
            next_len = min(127, line_len - start_index)
            self.out_lines.append(chr(next_len))
            self.index_lines.append(False)
            self.out_lines.append(line[start_index:start_index+next_len])
            # We don't index long lines, because we won't be able to match
            # a line split across multiple inserts anway
            self.index_lines.append(False)

    def add_insert(self, lines):
        if self.cur_insert_lines != []:
            raise AssertionError('self.cur_insert_lines must be empty when'
                                 ' adding a new insert')
        for line in lines:
            if len(line) > 127:
                self._insert_long_line(line)
            else:
                next_len = len(line) + self.cur_insert_len
                if next_len > 127:
                    # Adding this line would overflow, so flush, and start over
                    self._flush_insert()
                    self.cur_insert_lines = [line]
                    self.cur_insert_len = len(line)
                else:
                    self.cur_insert_lines.append(line)
                    self.cur_insert_len = next_len
        self._flush_insert()


class LinesDeltaIndex(object):
    """This class indexes matches between strings.

    :ivar lines: The 'static' lines that will be preserved between runs.
    :ivar _matching_lines: A dict of {line:[matching offsets]}
    :ivar line_offsets: The byte offset for the end of each line, used to
        quickly map between a matching line number and the byte location
    :ivar endpoint: The total number of bytes in self.line_offsets
    """

    _MIN_MATCH_BYTES = 10
    _SOFT_MIN_MATCH_BYTES = 200

    def __init__(self, lines):
        self.lines = []
        self.line_offsets = []
        self.endpoint = 0
        self._matching_lines = {}
        self.extend_lines(lines, [True]*len(lines))

    def _update_matching_lines(self, new_lines, index):
        matches = self._matching_lines
        start_idx = len(self.lines)
        if len(new_lines) != len(index):
            raise AssertionError('The number of lines to be indexed does'
                ' not match the index/don\'t index flags: %d != %d'
                % (len(new_lines), len(index)))
        for idx, do_index in enumerate(index):
            if not do_index:
                continue
            line = new_lines[idx]
            try:
                matches[line].add(start_idx + idx)
            except KeyError:
                matches[line] = set([start_idx + idx])

    def get_matches(self, line):
        """Return the lines which match the line in right."""
        try:
            return self._matching_lines[line]
        except KeyError:
            return None

    def _get_longest_match(self, lines, pos):
        """Look at all matches for the current line, return the longest.

        :param lines: The lines we are matching against
        :param pos: The current location we care about
        :param locations: A list of lines that matched the current location.
            This may be None, but often we'll have already found matches for
            this line.
        :return: (start_in_self, start_in_lines, num_lines)
            All values are the offset in the list (aka the line number)
            If start_in_self is None, then we have no matches, and this line
            should be inserted in the target.
        """
        range_start = pos
        range_len = 0
        prev_locations = None
        max_pos = len(lines)
        matching = self._matching_lines
        while pos < max_pos:
            try:
                locations = matching[lines[pos]]
            except KeyError:
                # No more matches, just return whatever we have, but we know
                # that this last position is not going to match anything
                pos += 1
                break
            # We have a match
            if prev_locations is None:
                # This is the first match in a range
                prev_locations = locations
                range_len = 1
                locations = None # Consumed
            else:
                # We have a match started, compare to see if any of the
                # current matches can be continued
                next_locations = locations.intersection([loc + 1 for loc
                                                         in prev_locations])
                if next_locations:
                    # At least one of the regions continues to match
                    prev_locations = set(next_locations)
                    range_len += 1
                    locations = None # Consumed
                else:
                    # All current regions no longer match.
                    # This line does still match something, just not at the
                    # end of the previous matches. We will return locations
                    # so that we can avoid another _matching_lines lookup.
                    break
            pos += 1
        if prev_locations is None:
            # We have no matches, this is a pure insert
            return None, pos
        smallest = min(prev_locations)
        return (smallest - range_len + 1, range_start, range_len), pos

    def get_matching_blocks(self, lines, soft=False):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide a end point
            for generating instructions from the matching blocks list.
        """
        # In this code, we iterate over multiple _get_longest_match calls, to
        # find the next longest copy, and possible insert regions. We then
        # convert that to the simple matching_blocks representation, since
        # otherwise inserting 10 lines in a row would show up as 10
        # instructions.
        result = []
        pos = 0
        max_pos = len(lines)
        result_append = result.append
        min_match_bytes = self._MIN_MATCH_BYTES
        if soft:
            min_match_bytes = self._SOFT_MIN_MATCH_BYTES
        while pos < max_pos:
            block, pos = self._get_longest_match(lines, pos)
            if block is not None:
                # Check to see if we match fewer than min_match_bytes. As we
                # will turn this into a pure 'insert', rather than a copy.
                # block[-1] is the number of lines. A quick check says if we
                # have more lines than min_match_bytes, then we know we have
                # enough bytes.
                if block[-1] < min_match_bytes:
                    # This block may be a 'short' block, check
                    old_start, new_start, range_len = block
                    matched_bytes = sum(map(len,
                        lines[new_start:new_start + range_len]))
                    if matched_bytes < min_match_bytes:
                        block = None
            if block is not None:
                result_append(block)
        result_append((len(self.lines), len(lines), 0))
        return result

    def extend_lines(self, lines, index):
        """Add more lines to the left-lines list.

        :param lines: A list of lines to add
        :param index: A True/False for each node to define if it should be
            indexed.
        """
        self._update_matching_lines(lines, index)
        self.lines.extend(lines)
        endpoint = self.endpoint
        for line in lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        if len(self.line_offsets) != len(self.lines):
            raise AssertionError('Somehow the line offset indicator'
                ' got out of sync with the line counter.')
        self.endpoint = endpoint

    def _flush_insert(self, start_linenum, end_linenum,
                      new_lines, out_lines, index_lines):
        """Add an 'insert' request to the data stream."""
        bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
        insert_length = len(bytes_to_insert)
        # Each insert instruction is at most 127 bytes long
        for start_byte in xrange(0, insert_length, 127):
            insert_count = min(insert_length - start_byte, 127)
            out_lines.append(chr(insert_count))
            # Don't index the 'insert' instruction
            index_lines.append(False)
            insert = bytes_to_insert[start_byte:start_byte+insert_count]
            as_lines = osutils.split_lines(insert)
            out_lines.extend(as_lines)
            index_lines.extend([True]*len(as_lines))

    def _flush_copy(self, old_start_linenum, num_lines,
                    out_lines, index_lines):
        if old_start_linenum == 0:
            first_byte = 0
        else:
            first_byte = self.line_offsets[old_start_linenum - 1]
        stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
        num_bytes = stop_byte - first_byte
        # The data stream allows >64kB in a copy, but to match the compiled
        # code, we will also limit it to a 64kB copy
        for start_byte in xrange(first_byte, stop_byte, 64*1024):
            num_bytes = min(64*1024, stop_byte - start_byte)
            copy_bytes = encode_copy_instruction(start_byte, num_bytes)
            out_lines.append(copy_bytes)
            index_lines.append(False)

    def make_delta(self, new_lines, bytes_length=None, soft=False):
        """Compute the delta for this content versus the original content."""
        if bytes_length is None:
            bytes_length = sum(map(len, new_lines))
        # reserved for content type, content length
        out_lines = ['', '', encode_base128_int(bytes_length)]
        index_lines = [False, False, False]
        output_handler = _OutputHandler(out_lines, index_lines,
                                        self._MIN_MATCH_BYTES)
        blocks = self.get_matching_blocks(new_lines, soft=soft)
        current_line_num = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse
        for old_start, new_start, range_len in blocks:
            if new_start != current_line_num:
                # non-matching region, insert the content
                output_handler.add_insert(new_lines[current_line_num:new_start])
            current_line_num = new_start + range_len
            if range_len:
                # Convert the line based offsets into byte based offsets
                if old_start == 0:
                    first_byte = 0
                else:
                    first_byte = self.line_offsets[old_start - 1]
                last_byte = self.line_offsets[old_start + range_len - 1]
                output_handler.add_copy(first_byte, last_byte)
        return out_lines, index_lines


def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding."""
    bytes = []
    count = 0
    while val >= 0x80:
        bytes.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    bytes.append(chr(val))
    return ''.join(bytes)


def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding."""
    offset = 0
    val = 0
    shift = 0
    bval = ord(bytes[offset])
    while bval >= 0x80:
        val |= (bval & 0x7F) << shift
        shift += 7
        offset += 1
        bval = ord(bytes[offset])
    val |= bval << shift
    offset += 1
    return val, offset


def encode_copy_instruction(offset, length):
    """Convert this offset into a control code and bytes."""
    copy_command = 0x80
    copy_bytes = [None]

    for copy_bit in (0x01, 0x02, 0x04, 0x08):
        base_byte = offset & 0xff
        if base_byte:
            copy_command |= copy_bit
            copy_bytes.append(chr(base_byte))
        offset >>= 8
    if length is None:
        raise ValueError("cannot supply a length of None")
    if length > 0x10000:
        raise ValueError("we don't emit copy records for lengths > 64KiB")
    if length == 0:
        raise ValueError("We cannot emit a copy of length 0")
    if length != 0x10000:
        # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
        # since that saves bytes for large chained copies
        for copy_bit in (0x10, 0x20):
            base_byte = length & 0xff
            if base_byte:
                copy_command |= copy_bit
                copy_bytes.append(chr(base_byte))
            length >>= 8
    copy_bytes[0] = chr(copy_command)
    return ''.join(copy_bytes)


def decode_copy_instruction(bytes, cmd, pos):
    """Decode a copy instruction from the next few bytes.

    A copy instruction is a variable number of bytes, so we will parse the
    bytes we care about, and return the new position, as well as the offset and
    length referred to in the bytes.

    :param bytes: A string of bytes
    :param cmd: The command code
    :param pos: The position in bytes right after the copy command
    :return: (offset, length, newpos)
        The offset of the copy start, the number of bytes to copy, and the
        position after the last byte of the copy
    """
    if cmd & 0x80 != 0x80:
        raise ValueError('copy instructions must have bit 0x80 set')
    offset = 0
    length = 0
    if (cmd & 0x01):
        offset = ord(bytes[pos])
        pos += 1
    if (cmd & 0x02):
        offset = offset | (ord(bytes[pos]) << 8)
        pos += 1
    if (cmd & 0x04):
        offset = offset | (ord(bytes[pos]) << 16)
        pos += 1
    if (cmd & 0x08):
        offset = offset | (ord(bytes[pos]) << 24)
        pos += 1
    if (cmd & 0x10):
        length = ord(bytes[pos])
        pos += 1
    if (cmd & 0x20):
        length = length | (ord(bytes[pos]) << 8)
        pos += 1
    if (cmd & 0x40):
        length = length | (ord(bytes[pos]) << 16)
        pos += 1
    if length == 0:
        length = 65536
    return (offset, length, pos)


def make_delta(source_bytes, target_bytes):
    """Create a delta from source to target."""
    if type(source_bytes) is not str:
        raise TypeError('source is not a str')
    if type(target_bytes) is not str:
        raise TypeError('target is not a str')
    line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
    delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
                                         bytes_length=len(target_bytes))
    return ''.join(delta)


def apply_delta(basis, delta):
    """Apply delta to this object to become new_version_id."""
    if type(basis) is not str:
        raise TypeError('basis is not a str')
    if type(delta) is not str:
        raise TypeError('delta is not a str')
    target_length, pos = decode_base128_int(delta)
    lines = []
    len_delta = len(delta)
    while pos < len_delta:
        cmd = ord(delta[pos])
        pos += 1
        if cmd & 0x80:
            offset, length, pos = decode_copy_instruction(delta, cmd, pos)
            last = offset + length
            if last > len(basis):
                raise ValueError('data would copy bytes past the'
                                 'end of source')
            lines.append(basis[offset:last])
        else: # Insert of 'cmd' bytes
            if cmd == 0:
                raise ValueError('Command == 0 not supported yet')
            lines.append(delta[pos:pos+cmd])
            pos += cmd
    bytes = ''.join(lines)
    if len(bytes) != target_length:
        raise ValueError('Delta claimed to be %d long, but ended up'
                         ' %d long' % (target_length, len(bytes)))
    return bytes


def apply_delta_to_source(source, delta_start, delta_end):
    """Extract a delta from source bytes, and apply it."""
    source_size = len(source)
    if delta_start >= source_size:
        raise ValueError('delta starts after source')
    if delta_end > source_size:
        raise ValueError('delta ends after source')
    if delta_start >= delta_end:
        raise ValueError('delta starts after it ends')
    delta_bytes = source[delta_start:delta_end]
    return apply_delta(source, delta_bytes)