summaryrefslogtreecommitdiff
path: root/bzrlib/_groupcompress_pyx.pyx
blob: a283554da44e21bce8538eb2dce617dd7c4f84ee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Compiled extensions for doing compression."""

#python2.4 support
cdef extern from "python-compat.h":
    pass


cdef extern from "Python.h":
    ctypedef struct PyObject:
        pass
    ctypedef int Py_ssize_t # Required for older pyrex versions
    int PyString_CheckExact(object)
    char * PyString_AS_STRING(object)
    Py_ssize_t PyString_GET_SIZE(object)
    object PyString_FromStringAndSize(char *, Py_ssize_t)


cdef extern from *:
    ctypedef unsigned long size_t
    void * malloc(size_t) nogil
    void * realloc(void *, size_t) nogil
    void free(void *) nogil
    void memcpy(void *, void *, size_t) nogil


cdef extern from "delta.h":
    struct source_info:
        void *buf
        unsigned long size
        unsigned long agg_offset
    struct delta_index:
        pass
    ctypedef enum delta_result:
        DELTA_OK
        DELTA_OUT_OF_MEMORY
        DELTA_INDEX_NEEDED
        DELTA_SOURCE_EMPTY
        DELTA_SOURCE_BAD
        DELTA_BUFFER_EMPTY
        DELTA_SIZE_TOO_BIG
    delta_result create_delta_index(source_info *src,
                                    delta_index *old,
                                    delta_index **fresh,
                                    int max_entries) nogil
    delta_result create_delta_index_from_delta(source_info *delta,
                                               delta_index *old,
                                               delta_index **fresh) nogil
    void free_delta_index(delta_index *index) nogil
    delta_result create_delta(delta_index *indexes,
                              void *buf, unsigned long bufsize,
                              unsigned long *delta_size,
                              unsigned long max_delta_size,
                              void **delta_data) nogil
    unsigned long get_delta_hdr_size(unsigned char **datap,
                                     unsigned char *top) nogil
    unsigned long sizeof_delta_index(delta_index *index)
    Py_ssize_t DELTA_SIZE_MIN
    int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
    int get_entry_summary(delta_index *index, int pos,
                          unsigned int *global_offset, unsigned int *hash_val)
    unsigned int rabin_hash (unsigned char *data)


cdef void *safe_malloc(size_t count) except NULL:
    """Allocate ``count`` bytes with malloc(), raising if that fails.

    :param count: Number of bytes to allocate.
    :return: The pointer handed back by malloc(); never NULL.
    :raises MemoryError: if malloc() returned NULL.
    """
    cdef void *mem
    mem = malloc(count)
    if mem != NULL:
        return mem
    raise MemoryError('Failed to allocate %d bytes of memory' % (count,))


cdef void *safe_realloc(void * old, size_t count) except NULL:
    """Resize the allocation ``old`` to ``count`` bytes via realloc().

    :param old: The pointer passed through to realloc().
    :param count: The new size in bytes.
    :return: The pointer handed back by realloc(); never NULL.
    :raises MemoryError: if realloc() returned NULL.
    """
    cdef void *resized
    resized = realloc(old, count)
    if resized != NULL:
        return resized
    raise MemoryError('Failed to reallocate to %d bytes of memory'
                      % (count,))


cdef int safe_free(void **val) except -1:
    """Free the allocation that ``val`` points at and reset it to NULL.

    Resetting the caller's pointer makes a repeated call a harmless no-op.

    :param val: Address of the pointer to release; must not be NULL itself.
    :return: 0 (the -1 value is reserved for signalling an exception).
    """
    assert val != NULL
    if val[0] == NULL:
        # Nothing is allocated (or it was already released)
        return 0
    free(val[0])
    val[0] = NULL
    return 0

def make_delta_index(source):
    """Return a new DeltaIndex constructed from ``source``."""
    delta_index = DeltaIndex(source)
    return delta_index


cdef object _translate_delta_failure(delta_result result):
    """Turn a failing delta_result code into a Python exception instance.

    The exception is returned, not raised, so the caller writes
    ``raise _translate_delta_failure(res)``.

    :param result: The status code reported by one of the delta.h functions.
    :return: An exception object describing the failure; codes without a
        dedicated message are reported as an AssertionError.
    """
    if result == DELTA_OUT_OF_MEMORY:
        return MemoryError("Delta function failed to allocate memory")
    if result == DELTA_INDEX_NEEDED:
        return ValueError("Delta function requires delta_index param")
    if result == DELTA_SOURCE_EMPTY:
        return ValueError("Delta function given empty source_info param")
    if result == DELTA_SOURCE_BAD:
        return RuntimeError("Delta function given invalid source_info param")
    if result == DELTA_BUFFER_EMPTY:
        return ValueError("Delta function given empty buffer params")
    # Anything else is a code this module has no message for
    return AssertionError("Unrecognised delta result code: %d" % result)


def _rabin_hash(content):
    """Return the value rabin_hash() computes for the start of ``content``.

    :param content: An exact str holding at least 16 bytes.
    :return: The hash value, converted with int().
    :raises ValueError: if content is not an exact str, or is shorter than
        16 bytes.
    """
    cdef unsigned char *c_content

    if not PyString_CheckExact(content):
        raise ValueError('content must be a string')
    if len(content) < 16:
        raise ValueError('content must be at least 16 bytes long')
    c_content = <unsigned char*>(PyString_AS_STRING(content))
    # int() hands back a plain int whenever the value fits in one
    return int(rabin_hash(c_content))


cdef class DeltaIndex:
    """Maintain a delta index over a growing list of source texts.

    Texts are registered with add_source() (plain content) or
    add_delta_source() (content that is itself a delta). make_delta() then
    encodes a target against everything registered so far.
    """

    # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
    # isn't performance critical
    # cdef readonly list _sources
    # The registered source strings. Keeping them here keeps alive the
    # buffers that the matching _source_infos records point into.
    cdef readonly object _sources
    # C array holding one source_info record per entry in _sources
    cdef source_info *_source_infos
    # The C-level index; NULL until it is built for the first time
    cdef delta_index *_index
    # Aggregate offset just past the most recently added source
    cdef public unsigned long _source_offset
    # Number of records allocated in _source_infos
    cdef readonly unsigned int _max_num_sources
    # Handed to create_delta_index() as its max_entries argument
    cdef public int _max_bytes_to_index

    def __init__(self, source=None, max_bytes_to_index=None):
        """Create the index, optionally seeding it with a first source.

        :param source: If not None, a byte string handed to add_source().
        :param max_bytes_to_index: If not None, the value stored in
            _max_bytes_to_index (which is 0 otherwise).
        """
        self._sources = []
        self._index = NULL
        self._max_num_sources = 65000
        self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
                                                        * self._max_num_sources)
        self._source_offset = 0
        self._max_bytes_to_index = 0
        if max_bytes_to_index is not None:
            self._max_bytes_to_index = max_bytes_to_index

        if source is not None:
            self.add_source(source, 0)

    def __sizeof__(self):
        """Estimate the memory used by this object, in bytes."""
        # We want to track the _source_infos allocations, but the referenced
        # void* are actually tracked in _sources itself.
        # XXX: Cython is capable of doing sizeof(class) and returning the size
        #      of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
        #      to do it manually. *sigh* Note that we might get it wrong
        #      because of alignment issues.
        cdef Py_ssize_t size
        # PyObject start, vtable *, 3 pointers (_sources, _source_infos,
        # _index), then _source_offset, _max_num_sources and
        # _max_bytes_to_index
        size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
                 + sizeof(unsigned long)
                 + sizeof(unsigned int)
                 + sizeof(int))
                + (sizeof(source_info) * self._max_num_sources)
                + sizeof_delta_index(self._index))
        return size

    def __repr__(self):
        return '%s(%d, %d)' % (self.__class__.__name__,
            len(self._sources), self._source_offset)

    def __dealloc__(self):
        if self._index != NULL:
            free_delta_index(self._index)
            self._index = NULL
        safe_free(<void **>&self._source_infos)

    def _has_index(self):
        """Return True once the C-level index has been built."""
        return (self._index != NULL)

    def _dump_index(self):
        """Dump the pointers in the index.

        This is an arbitrary layout, used for testing. It is not meant to be
        used in production code.

        :return: (hash_list, entry_list), or None when there is no index yet.
            hash_list   A list of offsets, so hash[i] points to the 'hash
                        bucket' starting at the given offset and going until
                        hash[i+1]
            entry_list  A list of (text_offset, hash_val). text_offset is the
                        offset in the "source" texts, and hash_val is the RABIN
                        hash for that offset.
                        Note that the entry should be in the hash bucket
                        defined by
                        hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
        """
        cdef int pos
        cdef unsigned int text_offset
        cdef unsigned int hash_val
        cdef unsigned int hash_offset
        if self._index == NULL:
            return None
        hash_list = []
        pos = 0
        while get_hash_offset(self._index, pos, &hash_offset):
            hash_list.append(int(hash_offset))
            pos += 1
        entry_list = []
        pos = 0
        while get_entry_summary(self._index, pos, &text_offset, &hash_val):
            # Map back using 'int' so that we don't get Long everywhere, when
            # almost everything is <2**31.
            val = tuple(map(int, [text_offset, hash_val]))
            entry_list.append(val)
            pos += 1
        return hash_list, entry_list

    def add_delta_source(self, delta, unadded_bytes):
        """Add a new delta to the source texts.

        :param delta: The text of the delta, this must be a byte string.
        :param unadded_bytes: Number of bytes that were added to the source
            that were not indexed.
        :raises TypeError: if delta is not an exact str.
        :raises RuntimeError: if the fixed table of sources is already full.
        """
        cdef char *c_delta
        cdef Py_ssize_t c_delta_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(delta):
            raise TypeError('delta is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # add_source() defers indexing of the very first text, so make
            # sure that index exists before extending it with a delta.
            self._populate_first_index()
        self._sources.append(delta)
        c_delta = PyString_AS_STRING(delta)
        c_delta_size = PyString_GET_SIZE(delta)
        src = self._source_infos + source_location
        src.buf = c_delta
        src.size = c_delta_size
        src.agg_offset = self._source_offset + unadded_bytes
        with nogil:
            res = create_delta_index_from_delta(src, self._index, &index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._source_offset = src.agg_offset + src.size
        if index != self._index:
            # A replacement index was handed back; drop the one it supersedes
            free_delta_index(self._index)
            self._index = index

    def add_source(self, source, unadded_bytes):
        """Add a new bit of source text to the delta indexes.

        The first source is only recorded; its index is built lazily, either
        by the next add or by make_delta(). The number of index entries is
        limited by the _max_bytes_to_index attribute, which is passed to
        create_delta_index() as its max_entries argument.

        :param source: The text in question, this must be a byte string
        :param unadded_bytes: Assume there are this many bytes that didn't get
            added between this source and the end of the previous source.
        :raises TypeError: if source is not an exact str.
        :raises RuntimeError: if the fixed table of sources is already full.
        """
        cdef char *c_source
        cdef Py_ssize_t c_source_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(source):
            raise TypeError('source is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # We were lazy about populating the index, create it now
            self._populate_first_index()
        self._sources.append(source)
        c_source = PyString_AS_STRING(source)
        c_source_size = PyString_GET_SIZE(source)
        src = self._source_infos + source_location
        src.buf = c_source
        src.size = c_source_size

        src.agg_offset = self._source_offset + unadded_bytes
        self._source_offset = src.agg_offset + src.size
        # We delay creating the index on the first insert
        if source_location != 0:
            with nogil:
                res = create_delta_index(src, self._index, &index,
                                         self._max_bytes_to_index)
            if res != DELTA_OK:
                raise _translate_delta_failure(res)
            if index != self._index:
                # A replacement index was handed back; drop the old one
                free_delta_index(self._index)
                self._index = index

    cdef _populate_first_index(self):
        """Build the index that add_source() deferred for the first source.

        :raises AssertionError: unless there is exactly one source and no
            index yet.
        """
        cdef delta_index *index
        cdef delta_result res
        if len(self._sources) != 1 or self._index != NULL:
            raise AssertionError('_populate_first_index should only be'
                ' called when we have a single source and no index yet')

        # We know that self._index is already NULL, so create_delta_index
        # will always create a new index unless there's a malloc failure
        with nogil:
            res = create_delta_index(&self._source_infos[0], NULL, &index,
                                     self._max_bytes_to_index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._index = index

    cdef _expand_sources(self):
        """Refuse to grow _source_infos; always raises RuntimeError.

        The statements after the raise are unreachable. They show the resize
        that would be wanted, which is not safe while the index keeps
        pointers into the current array.
        """
        raise RuntimeError('if we move self._source_infos, then we need to'
                           ' change all of the index pointers as well.')
        self._max_num_sources = self._max_num_sources * 2
        self._source_infos = <source_info *>safe_realloc(self._source_infos,
                                                sizeof(source_info)
                                                * self._max_num_sources)

    def make_delta(self, target_bytes, max_delta_size=0):
        """Create a delta from the current source to the target bytes.

        :param target_bytes: The text to encode, this must be a byte string.
        :param max_delta_size: Passed to create_delta() as its
            max_delta_size argument.
        :return: The delta as a byte string. None if no source has been
            added yet, or if create_delta() reported DELTA_SIZE_TOO_BIG.
        :raises TypeError: if target_bytes is not an exact str.
        """
        cdef char *target
        cdef Py_ssize_t target_size
        cdef void * delta
        cdef unsigned long delta_size
        cdef unsigned long c_max_delta_size
        cdef delta_result res

        if self._index == NULL:
            if len(self._sources) == 0:
                return None
            # We were just lazy about generating the index
            self._populate_first_index()

        if not PyString_CheckExact(target_bytes):
            raise TypeError('target is not a str')

        target = PyString_AS_STRING(target_bytes)
        target_size = PyString_GET_SIZE(target_bytes)

        # TODO: inline some of create_delta so we at least don't have to double
        #       malloc, and can instead use PyString_FromStringAndSize, to
        #       allocate the bytes into the final string
        c_max_delta_size = max_delta_size
        with nogil:
            res = create_delta(self._index, target, target_size,
                               &delta_size, c_max_delta_size, &delta)
        result = None
        if res == DELTA_OK:
            try:
                result = PyString_FromStringAndSize(<char *>delta, delta_size)
            finally:
                # The buffer from create_delta is ours to release, also when
                # building the Python string failed.
                free(delta)
        elif res != DELTA_SIZE_TOO_BIG:
            raise _translate_delta_failure(res)
        return result


def make_delta(source_bytes, target_bytes):
    """Create a delta, this is a wrapper around DeltaIndex.make_delta.

    :param source_bytes: The text the delta is computed against.
    :param target_bytes: The text the delta should reproduce.
    :return: Whatever DeltaIndex.make_delta() returns for target_bytes.
    """
    return DeltaIndex(source_bytes).make_delta(target_bytes)


def apply_delta(source_bytes, delta_bytes):
    """Apply a delta generated by make_delta to source_bytes.

    :param source_bytes: The text the delta was computed against (a str).
    :param delta_bytes: The delta itself (a str).
    :return: The text reconstructed by _apply_delta().
    :raises TypeError: if either argument is not an exact str.
    :raises RuntimeError: if the delta is shorter than DELTA_SIZE_MIN.
    """
    cdef char *c_source
    cdef char *c_delta
    cdef Py_ssize_t c_source_len
    cdef Py_ssize_t c_delta_len

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(delta_bytes):
        raise TypeError('delta is not a str')

    # Code taken from patch-delta.c, only brought here to give better error
    # handling, and to avoid double allocating memory
    c_delta_len = PyString_GET_SIZE(delta_bytes)
    if c_delta_len < DELTA_SIZE_MIN:
        # XXX: Invalid delta block
        raise RuntimeError('delta_size %d smaller than min delta size %d'
                           % (c_delta_len, DELTA_SIZE_MIN))

    c_delta = PyString_AS_STRING(delta_bytes)
    c_source = PyString_AS_STRING(source_bytes)
    c_source_len = PyString_GET_SIZE(source_bytes)
    return _apply_delta(c_source, c_source_len, c_delta, c_delta_len)


cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
    unsigned char cmd, unsigned int *offset,
    unsigned int *length) nogil: # cannot_raise
    """Decode a copy instruction from the next few bytes.

    A copy instruction is a variable number of bytes, so we will parse the
    bytes we care about, and return the new position, as well as the offset and
    length referred to in the bytes.

    Bits 0x01-0x08 of cmd each announce one byte of the offset (least
    significant first) and bits 0x10-0x40 one byte of the length. A length
    that decodes to 0 stands for 0x10000.

    No bounds checking is done here: the caller must make sure that as many
    bytes as cmd announces are readable at ``bytes``.

    :param bytes: Pointer to the start of bytes after cmd
    :param cmd: The command code
    :param offset: Receives the decoded copy offset
    :param length: Receives the decoded copy length
    :return: Pointer to the bytes just after the last decode byte
    """
    cdef unsigned int off, size, count
    off = 0
    size = 0
    count = 0
    # Each byte is widened to unsigned int *before* it is shifted: shifting
    # the int that an unsigned char promotes to would overflow a signed type
    # (undefined in C) once a byte >= 0x80 is moved up by 24 bits.
    if (cmd & 0x01):
        off = bytes[count]
        count = count + 1
    if (cmd & 0x02):
        off = off | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x04):
        off = off | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (cmd & 0x08):
        off = off | ((<unsigned int>bytes[count]) << 24)
        count = count + 1
    if (cmd & 0x10):
        size = bytes[count]
        count = count + 1
    if (cmd & 0x20):
        size = size | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x40):
        size = size | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (size == 0):
        size = 0x10000
    offset[0] = off
    length[0] = size
    return bytes + count


cdef object _apply_delta(char *source, Py_ssize_t source_size,
                         char *delta, Py_ssize_t delta_size):
    """common functionality between apply_delta and apply_delta_to_source.

    Reads the expected result size from the delta header, then replays the
    copy instructions (bytes taken from ``source``) and insert instructions
    (literal bytes taken from the delta) into a freshly allocated string.

    :param source: Start of the bytes that copy instructions refer to.
    :param source_size: Number of bytes copy instructions may address.
    :param delta: Start of the delta.
    :param delta_size: Number of bytes in the delta.
    :return: The reconstructed text as a str.
    :raises ValueError: if a copy instruction falls outside the source or
        the output, if opcode 0 is found, or if an insert is longer than
        the output that is left.
    :raises RuntimeError: if an instruction needs more bytes than the delta
        has left, or the produced size disagrees with the header.
    """
    cdef unsigned char *data, *top
    cdef unsigned char *dst_buf, *out, cmd
    cdef Py_ssize_t size
    cdef unsigned int cp_off, cp_size
    cdef unsigned int operand_bits, operand_len
    cdef int failed

    data = <unsigned char *>delta
    top = data + delta_size

    # now the result size
    size = get_delta_hdr_size(&data, top)
    result = PyString_FromStringAndSize(NULL, size)
    dst_buf = <unsigned char*>PyString_AS_STRING(result)

    failed = 0
    with nogil:
        out = dst_buf
        while (data < top):
            cmd = data[0]
            data = data + 1
            if (cmd & 0x80):
                # Copy instruction
                if top - data < 7:
                    # A copy has at most 7 operand bytes, so only this close
                    # to the end can decoding run off the delta. Count the
                    # operand bytes cmd announces before reading them.
                    operand_bits = cmd & 0x7F
                    operand_len = 0
                    while operand_bits != 0:
                        operand_len = operand_len + (operand_bits & 0x01)
                        operand_bits = operand_bits >> 1
                    if operand_len > <unsigned int>(top - data):
                        failed = 4
                        break
                data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
                if (cp_off + cp_size < cp_size or
                    cp_off + cp_size > <unsigned int>source_size or
                    cp_size > <unsigned int>size):
                    failed = 1
                    break
                memcpy(out, source + cp_off, cp_size)
                out = out + cp_size
                size = size - cp_size
            else:
                # Insert instruction
                if cmd == 0:
                    # cmd == 0 is reserved for future encoding
                    # extensions. In the mean time we must fail when
                    # encountering them (might be data corruption).
                    failed = 2
                    break
                if cmd > size:
                    failed = 3
                    break
                if cmd > top - data:
                    # The literal bytes would have to come from beyond the
                    # end of the delta
                    failed = 4
                    break
                memcpy(out, data, cmd)
                out = out + cmd
                data = data + cmd
                size = size - cmd
    if failed:
        if failed == 1:
            raise ValueError('Something wrong with:'
                ' cp_off = %s, cp_size = %s'
                ' source_size = %s, size = %s'
                % (cp_off, cp_size, source_size, size))
        elif failed == 2:
            raise ValueError('Got delta opcode: 0, not supported')
        elif failed == 3:
            raise ValueError('Insert instruction longer than remaining'
                ' bytes: %d > %d' % (cmd, size))
        elif failed == 4:
            raise RuntimeError('Delta instruction %d needs more bytes than'
                ' the %d left in the delta' % (cmd, <int>(top - data)))

    # sanity check
    if (data != top or size != 0):
        raise RuntimeError('Did not extract the number of bytes we expected'
            ' we were left with %d bytes in "size", and top - data = %d'
            % (size, <int>(top - data)))

    # *dst_size = out - dst_buf;
    if (out - dst_buf) != PyString_GET_SIZE(result):
        raise RuntimeError('Number of bytes extracted did not match the'
            ' size encoded in the delta header.')
    return result


def apply_delta_to_source(source, delta_start, delta_end):
    """Extract a delta from source bytes, and apply it.

    The delta is the slice source[delta_start:delta_end]; it is applied
    against the bytes of source that come before delta_start.

    :param source: A str holding both the base content and the delta.
    :param delta_start: Offset in source where the delta begins.
    :param delta_end: Offset in source just past the end of the delta.
    :return: The text reconstructed by _apply_delta().
    :raises TypeError: if source is not an exact str.
    :raises ValueError: if the offsets do not describe a non-empty range
        inside source.
    """
    cdef char *c_source
    cdef Py_ssize_t c_source_size
    cdef char *c_delta
    cdef Py_ssize_t c_delta_size
    cdef Py_ssize_t c_delta_start, c_delta_end

    if not PyString_CheckExact(source):
        raise TypeError('source is not a str')
    c_source_size = PyString_GET_SIZE(source)
    c_delta_start = delta_start
    c_delta_end = delta_end
    if c_delta_start < 0:
        # A negative start would make the delta pointer land in front of the
        # buffer, and would be used as a (negative) source size below.
        raise ValueError('delta starts before source')
    if c_delta_start >= c_source_size:
        raise ValueError('delta starts after source')
    if c_delta_end > c_source_size:
        raise ValueError('delta ends after source')
    if c_delta_start >= c_delta_end:
        raise ValueError('delta starts after it ends')

    c_delta_size = c_delta_end - c_delta_start
    c_source = PyString_AS_STRING(source)
    c_delta = c_source + c_delta_start
    # We don't use source_size, because we know the delta should not refer to
    # any bytes after it starts
    return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)


def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    The value is emitted 7 bits at a time, least significant group first;
    every byte except the last has its 0x80 bit set.

    :param val: The integer to encode; it is converted to a C unsigned int.
    :return: The encoded bytes as a str.
    :raises ValueError: if the encoding would not fit the internal buffer.
    """
    cdef unsigned int remaining
    cdef Py_ssize_t used
    cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes

    remaining = val
    used = 0
    while remaining >= 0x80 and used < 8:
        # Low 7 bits of what is left, flagged as "more bytes follow"
        c_bytes[used] = <unsigned char>((remaining | 0x80) & 0xFF)
        remaining = remaining >> 7
        used = used + 1
    if not (used < 8 and remaining < 0x80):
        raise ValueError('encode_base128_int overflowed the buffer')
    # The final byte carries the top bits, with 0x80 clear
    c_bytes[used] = <unsigned char>(remaining & 0xFF)
    return PyString_FromStringAndSize(<char *>c_bytes, used + 1)


def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    Bytes are consumed from the start of the string for as long as their
    0x80 bit is set, plus the first byte that has it clear. The value is
    accumulated in a C unsigned int, so groups that would land above bit 31
    do not contribute to the result.

    :param bytes: A str that starts with the encoded integer.
    :return: (value, offset) where offset is the number of bytes consumed.
    :raises TypeError: if bytes is not an exact str.
    :raises ValueError: if bytes is empty, or ends while the 0x80 bit is
        still set.
    """
    cdef int offset
    cdef unsigned int uval
    cdef unsigned int shift
    cdef Py_ssize_t num_low_bytes
    cdef unsigned char *c_bytes

    offset = 0
    uval = 0
    shift = 0
    if not PyString_CheckExact(bytes):
        raise TypeError('bytes is not a string')
    if PyString_GET_SIZE(bytes) == 0:
        # Without this check the byte after the string's content would be
        # decoded, and an offset past the end of the input reported.
        raise ValueError('Data not properly formatted, there are no bytes'
                         ' to decode.')
    c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
    # We take off 1, because we have to be able to decode the non-expanded byte
    num_low_bytes = PyString_GET_SIZE(bytes) - 1
    while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
        # Shifting a 32-bit value by 32 or more is undefined in C, so
        # groups that far up are skipped instead of shifted.
        if shift < 32:
            uval = uval | ((<unsigned int>(c_bytes[offset] & 0x7F)) << shift)
        shift = shift + 7
        offset = offset + 1
    if c_bytes[offset] & 0x80:
        raise ValueError('Data not properly formatted, we ran out of'
                         ' bytes before 0x80 stopped being set.')
    if shift < 32:
        uval = uval | ((<unsigned int>c_bytes[offset]) << shift)
    offset = offset + 1
    if uval > 0x7FFFFFFF:
        # Too big for a C int, hand back the unsigned value
        return uval, offset
    return <int>uval, offset