diff options
author | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-08-22 15:47:16 +0100 |
---|---|---|
committer | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-08-22 15:47:16 +0100 |
commit | 25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch) | |
tree | d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/_groupcompress_pyx.pyx | |
download | bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz |
Tarball conversion
Diffstat (limited to 'bzrlib/_groupcompress_pyx.pyx')
-rw-r--r-- | bzrlib/_groupcompress_pyx.pyx | 600 |
1 files changed, 600 insertions, 0 deletions
diff --git a/bzrlib/_groupcompress_pyx.pyx b/bzrlib/_groupcompress_pyx.pyx new file mode 100644 index 0000000..a283554 --- /dev/null +++ b/bzrlib/_groupcompress_pyx.pyx @@ -0,0 +1,600 @@ +# Copyright (C) 2008, 2009, 2010 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +"""Compiled extensions for doing compression.""" + +#python2.4 support +cdef extern from "python-compat.h": + pass + + +cdef extern from "Python.h": + ctypedef struct PyObject: + pass + ctypedef int Py_ssize_t # Required for older pyrex versions + int PyString_CheckExact(object) + char * PyString_AS_STRING(object) + Py_ssize_t PyString_GET_SIZE(object) + object PyString_FromStringAndSize(char *, Py_ssize_t) + + +cdef extern from *: + ctypedef unsigned long size_t + void * malloc(size_t) nogil + void * realloc(void *, size_t) nogil + void free(void *) nogil + void memcpy(void *, void *, size_t) nogil + + +cdef extern from "delta.h": + struct source_info: + void *buf + unsigned long size + unsigned long agg_offset + struct delta_index: + pass + ctypedef enum delta_result: + DELTA_OK + DELTA_OUT_OF_MEMORY + DELTA_INDEX_NEEDED + DELTA_SOURCE_EMPTY + DELTA_SOURCE_BAD + DELTA_BUFFER_EMPTY + DELTA_SIZE_TOO_BIG + delta_result create_delta_index(source_info *src, + delta_index *old, + delta_index **fresh, + int max_entries) nogil + delta_result create_delta_index_from_delta(source_info *delta, + delta_index *old, + delta_index **fresh) nogil + void free_delta_index(delta_index *index) nogil + delta_result create_delta(delta_index *indexes, + void *buf, unsigned long bufsize, + unsigned long *delta_size, + unsigned long max_delta_size, + void **delta_data) nogil + unsigned long get_delta_hdr_size(unsigned char **datap, + unsigned char *top) nogil + unsigned long sizeof_delta_index(delta_index *index) + Py_ssize_t DELTA_SIZE_MIN + int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset) + int get_entry_summary(delta_index *index, int pos, + unsigned int *global_offset, unsigned int *hash_val) + unsigned int rabin_hash (unsigned char *data) + + +cdef void *safe_malloc(size_t count) except NULL: + cdef void *result + result = malloc(count) + if result == NULL: + raise MemoryError('Failed to allocate %d bytes of memory' % (count,)) + return result + + +cdef void *safe_realloc(void * old, size_t count) except NULL: + cdef void *result + result = realloc(old, count) + if result == NULL: + raise MemoryError('Failed to reallocate to %d bytes of memory' + % (count,)) + return result + + +cdef int safe_free(void **val) except -1: + assert val != NULL + if val[0] != NULL: + free(val[0]) + val[0] = NULL + +def make_delta_index(source): + return DeltaIndex(source) + + +cdef object _translate_delta_failure(delta_result result): + if result == DELTA_OUT_OF_MEMORY: + return MemoryError("Delta function failed to allocate memory") + elif result == DELTA_INDEX_NEEDED: + return ValueError("Delta function requires delta_index param") + elif result == DELTA_SOURCE_EMPTY: + return ValueError("Delta function given empty source_info param") + elif result == DELTA_SOURCE_BAD: + return RuntimeError("Delta function given invalid source_info param") + elif result == DELTA_BUFFER_EMPTY: + return ValueError("Delta function given empty buffer params") + return AssertionError("Unrecognised delta result code: %d" % result) + + +def _rabin_hash(content): + if not PyString_CheckExact(content): + raise ValueError('content must be a string') + if len(content) < 16: + raise ValueError('content must be at least 16 bytes long') + # Try to cast it to an int, if it can fit + return int(rabin_hash(<unsigned char*>(PyString_AS_STRING(content)))) + + +cdef class DeltaIndex: + + # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object + # isn't performance critical + # cdef readonly list _sources + cdef readonly object _sources + cdef source_info *_source_infos + cdef delta_index *_index + cdef public unsigned long _source_offset + cdef readonly unsigned int _max_num_sources + cdef public int _max_bytes_to_index + + def __init__(self, source=None, max_bytes_to_index=None): + self._sources = [] + self._index = NULL + self._max_num_sources = 65000 + self._source_infos = <source_info *>safe_malloc(sizeof(source_info) + * self._max_num_sources) + self._source_offset = 0 + self._max_bytes_to_index = 0 + if max_bytes_to_index is not None: + self._max_bytes_to_index = max_bytes_to_index + + if source is not None: + self.add_source(source, 0) + + def __sizeof__(self): + # We want to track the _source_infos allocations, but the referenced + # void* are actually tracked in _sources itself. + # XXX: Cython is capable of doing sizeof(class) and returning the size + # of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need + # to do it manually. *sigh* Note that we might get it wrong + # because of alignment issues. + cdef Py_ssize_t size + # PyObject start, vtable *, 3 object pointers, 2 C ints + size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*) + + sizeof(unsigned long) + + sizeof(unsigned int)) + + (sizeof(source_info) * self._max_num_sources) + + sizeof_delta_index(self._index)) + return size + + def __repr__(self): + return '%s(%d, %d)' % (self.__class__.__name__, + len(self._sources), self._source_offset) + + def __dealloc__(self): + if self._index != NULL: + free_delta_index(self._index) + self._index = NULL + safe_free(<void **>&self._source_infos) + + def _has_index(self): + return (self._index != NULL) + + def _dump_index(self): + """Dump the pointers in the index. + + This is an arbitrary layout, used for testing. It is not meant to be + used in production code. + + :return: (hash_list, entry_list) + hash_list A list of offsets, so hash[i] points to the 'hash + bucket' starting at the given offset and going until + hash[i+1] + entry_list A list of (text_offset, hash_val). text_offset is the + offset in the "source" texts, and hash_val is the RABIN + hash for that offset. + Note that the entry should be in the hash bucket + defined by + hash[(hash_val & mask)] && hash[(hash_val & mask) + 1] + """ + cdef int pos + cdef unsigned int text_offset + cdef unsigned int hash_val + cdef unsigned int hash_offset + if self._index == NULL: + return None + hash_list = [] + pos = 0 + while get_hash_offset(self._index, pos, &hash_offset): + hash_list.append(int(hash_offset)) + pos += 1 + entry_list = [] + pos = 0 + while get_entry_summary(self._index, pos, &text_offset, &hash_val): + # Map back using 'int' so that we don't get Long everywhere, when + # almost everything is <2**31. + val = tuple(map(int, [text_offset, hash_val])) + entry_list.append(val) + pos += 1 + return hash_list, entry_list + + def add_delta_source(self, delta, unadded_bytes): + """Add a new delta to the source texts. + + :param delta: The text of the delta, this must be a byte string. + :param unadded_bytes: Number of bytes that were added to the source + that were not indexed. + """ + cdef char *c_delta + cdef Py_ssize_t c_delta_size + cdef delta_index *index + cdef delta_result res + cdef unsigned int source_location + cdef source_info *src + cdef unsigned int num_indexes + + if not PyString_CheckExact(delta): + raise TypeError('delta is not a str') + + source_location = len(self._sources) + if source_location >= self._max_num_sources: + self._expand_sources() + self._sources.append(delta) + c_delta = PyString_AS_STRING(delta) + c_delta_size = PyString_GET_SIZE(delta) + src = self._source_infos + source_location + src.buf = c_delta + src.size = c_delta_size + src.agg_offset = self._source_offset + unadded_bytes + with nogil: + res = create_delta_index_from_delta(src, self._index, &index) + if res != DELTA_OK: + raise _translate_delta_failure(res) + self._source_offset = src.agg_offset + src.size + if index != self._index: + free_delta_index(self._index) + self._index = index + + def add_source(self, source, unadded_bytes): + """Add a new bit of source text to the delta indexes. + + :param source: The text in question, this must be a byte string + :param unadded_bytes: Assume there are this many bytes that didn't get + added between this source and the end of the previous source. + :param max_pointers: Add no more than this many entries to the index. + By default, we sample every 16 bytes, if that would require more + than max_entries, we will reduce the sampling rate. + A value of 0 means unlimited, None means use the default limit. + """ + cdef char *c_source + cdef Py_ssize_t c_source_size + cdef delta_index *index + cdef delta_result res + cdef unsigned int source_location + cdef source_info *src + cdef unsigned int num_indexes + cdef int max_num_entries + + if not PyString_CheckExact(source): + raise TypeError('source is not a str') + + source_location = len(self._sources) + if source_location >= self._max_num_sources: + self._expand_sources() + if source_location != 0 and self._index == NULL: + # We were lazy about populating the index, create it now + self._populate_first_index() + self._sources.append(source) + c_source = PyString_AS_STRING(source) + c_source_size = PyString_GET_SIZE(source) + src = self._source_infos + source_location + src.buf = c_source + src.size = c_source_size + + src.agg_offset = self._source_offset + unadded_bytes + self._source_offset = src.agg_offset + src.size + # We delay creating the index on the first insert + if source_location != 0: + with nogil: + res = create_delta_index(src, self._index, &index, + self._max_bytes_to_index) + if res != DELTA_OK: + raise _translate_delta_failure(res) + if index != self._index: + free_delta_index(self._index) + self._index = index + + cdef _populate_first_index(self): + cdef delta_index *index + cdef delta_result res + if len(self._sources) != 1 or self._index != NULL: + raise AssertionError('_populate_first_index should only be' + ' called when we have a single source and no index yet') + + # We know that self._index is already NULL, so create_delta_index + # will always create a new index unless there's a malloc failure + with nogil: + res = create_delta_index(&self._source_infos[0], NULL, &index, + self._max_bytes_to_index) + if res != DELTA_OK: + raise _translate_delta_failure(res) + self._index = index + + cdef _expand_sources(self): + raise RuntimeError('if we move self._source_infos, then we need to' + ' change all of the index pointers as well.') + self._max_num_sources = self._max_num_sources * 2 + self._source_infos = <source_info *>safe_realloc(self._source_infos, + sizeof(source_info) + * self._max_num_sources) + + def make_delta(self, target_bytes, max_delta_size=0): + """Create a delta from the current source to the target bytes.""" + cdef char *target + cdef Py_ssize_t target_size + cdef void * delta + cdef unsigned long delta_size + cdef unsigned long c_max_delta_size + cdef delta_result res + + if self._index == NULL: + if len(self._sources) == 0: + return None + # We were just lazy about generating the index + self._populate_first_index() + + if not PyString_CheckExact(target_bytes): + raise TypeError('target is not a str') + + target = PyString_AS_STRING(target_bytes) + target_size = PyString_GET_SIZE(target_bytes) + + # TODO: inline some of create_delta so we at least don't have to double + # malloc, and can instead use PyString_FromStringAndSize, to + # allocate the bytes into the final string + c_max_delta_size = max_delta_size + with nogil: + res = create_delta(self._index, target, target_size, + &delta_size, c_max_delta_size, &delta) + result = None + if res == DELTA_OK: + result = PyString_FromStringAndSize(<char *>delta, delta_size) + free(delta) + elif res != DELTA_SIZE_TOO_BIG: + raise _translate_delta_failure(res) + return result + + +def make_delta(source_bytes, target_bytes): + """Create a delta, this is a wrapper around DeltaIndex.make_delta.""" + di = DeltaIndex(source_bytes) + return di.make_delta(target_bytes) + + +def apply_delta(source_bytes, delta_bytes): + """Apply a delta generated by make_delta to source_bytes.""" + cdef char *source + cdef Py_ssize_t source_size + cdef char *delta + cdef Py_ssize_t delta_size + + if not PyString_CheckExact(source_bytes): + raise TypeError('source is not a str') + if not PyString_CheckExact(delta_bytes): + raise TypeError('delta is not a str') + source = PyString_AS_STRING(source_bytes) + source_size = PyString_GET_SIZE(source_bytes) + delta = PyString_AS_STRING(delta_bytes) + delta_size = PyString_GET_SIZE(delta_bytes) + # Code taken from patch-delta.c, only brought here to give better error + # handling, and to avoid double allocating memory + if (delta_size < DELTA_SIZE_MIN): + # XXX: Invalid delta block + raise RuntimeError('delta_size %d smaller than min delta size %d' + % (delta_size, DELTA_SIZE_MIN)) + + return _apply_delta(source, source_size, delta, delta_size) + + +cdef unsigned char *_decode_copy_instruction(unsigned char *bytes, + unsigned char cmd, unsigned int *offset, + unsigned int *length) nogil: # cannot_raise + """Decode a copy instruction from the next few bytes. + + A copy instruction is a variable number of bytes, so we will parse the + bytes we care about, and return the new position, as well as the offset and + length referred to in the bytes. + + :param bytes: Pointer to the start of bytes after cmd + :param cmd: The command code + :return: Pointer to the bytes just after the last decode byte + """ + cdef unsigned int off, size, count + off = 0 + size = 0 + count = 0 + if (cmd & 0x01): + off = bytes[count] + count = count + 1 + if (cmd & 0x02): + off = off | (bytes[count] << 8) + count = count + 1 + if (cmd & 0x04): + off = off | (bytes[count] << 16) + count = count + 1 + if (cmd & 0x08): + off = off | (bytes[count] << 24) + count = count + 1 + if (cmd & 0x10): + size = bytes[count] + count = count + 1 + if (cmd & 0x20): + size = size | (bytes[count] << 8) + count = count + 1 + if (cmd & 0x40): + size = size | (bytes[count] << 16) + count = count + 1 + if (size == 0): + size = 0x10000 + offset[0] = off + length[0] = size + return bytes + count + + +cdef object _apply_delta(char *source, Py_ssize_t source_size, + char *delta, Py_ssize_t delta_size): + """common functionality between apply_delta and apply_delta_to_source.""" + cdef unsigned char *data, *top + cdef unsigned char *dst_buf, *out, cmd + cdef Py_ssize_t size + cdef unsigned int cp_off, cp_size + cdef int failed + + data = <unsigned char *>delta + top = data + delta_size + + # now the result size + size = get_delta_hdr_size(&data, top) + result = PyString_FromStringAndSize(NULL, size) + dst_buf = <unsigned char*>PyString_AS_STRING(result) + + failed = 0 + with nogil: + out = dst_buf + while (data < top): + cmd = data[0] + data = data + 1 + if (cmd & 0x80): + # Copy instruction + data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size) + if (cp_off + cp_size < cp_size or + cp_off + cp_size > <unsigned int>source_size or + cp_size > <unsigned int>size): + failed = 1 + break + memcpy(out, source + cp_off, cp_size) + out = out + cp_size + size = size - cp_size + else: + # Insert instruction + if cmd == 0: + # cmd == 0 is reserved for future encoding + # extensions. In the mean time we must fail when + # encountering them (might be data corruption). + failed = 2 + break + if cmd > size: + failed = 3 + break + memcpy(out, data, cmd) + out = out + cmd + data = data + cmd + size = size - cmd + if failed: + if failed == 1: + raise ValueError('Something wrong with:' + ' cp_off = %s, cp_size = %s' + ' source_size = %s, size = %s' + % (cp_off, cp_size, source_size, size)) + elif failed == 2: + raise ValueError('Got delta opcode: 0, not supported') + elif failed == 3: + raise ValueError('Insert instruction longer than remaining' + ' bytes: %d > %d' % (cmd, size)) + + # sanity check + if (data != top or size != 0): + raise RuntimeError('Did not extract the number of bytes we expected' + ' we were left with %d bytes in "size", and top - data = %d' + % (size, <int>(top - data))) + return None + + # *dst_size = out - dst_buf; + if (out - dst_buf) != PyString_GET_SIZE(result): + raise RuntimeError('Number of bytes extracted did not match the' + ' size encoded in the delta header.') + return result + + +def apply_delta_to_source(source, delta_start, delta_end): + """Extract a delta from source bytes, and apply it.""" + cdef char *c_source + cdef Py_ssize_t c_source_size + cdef char *c_delta + cdef Py_ssize_t c_delta_size + cdef Py_ssize_t c_delta_start, c_delta_end + + if not PyString_CheckExact(source): + raise TypeError('source is not a str') + c_source_size = PyString_GET_SIZE(source) + c_delta_start = delta_start + c_delta_end = delta_end + if c_delta_start >= c_source_size: + raise ValueError('delta starts after source') + if c_delta_end > c_source_size: + raise ValueError('delta ends after source') + if c_delta_start >= c_delta_end: + raise ValueError('delta starts after it ends') + + c_delta_size = c_delta_end - c_delta_start + c_source = PyString_AS_STRING(source) + c_delta = c_source + c_delta_start + # We don't use source_size, because we know the delta should not refer to + # any bytes after it starts + return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size) + + +def encode_base128_int(val): + """Convert an integer into a 7-bit lsb encoding.""" + cdef unsigned int c_val + cdef Py_ssize_t count + cdef unsigned int num_bytes + cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes + + c_val = val + count = 0 + while c_val >= 0x80 and count < 8: + c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF) + c_val = c_val >> 7 + count = count + 1 + if count >= 8 or c_val >= 0x80: + raise ValueError('encode_base128_int overflowed the buffer') + c_bytes[count] = <unsigned char>(c_val & 0xFF) + count = count + 1 + return PyString_FromStringAndSize(<char *>c_bytes, count) + + +def decode_base128_int(bytes): + """Decode an integer from a 7-bit lsb encoding.""" + cdef int offset + cdef int val + cdef unsigned int uval + cdef int shift + cdef Py_ssize_t num_low_bytes + cdef unsigned char *c_bytes + + offset = 0 + val = 0 + shift = 0 + if not PyString_CheckExact(bytes): + raise TypeError('bytes is not a string') + c_bytes = <unsigned char*>PyString_AS_STRING(bytes) + # We take off 1, because we have to be able to decode the non-expanded byte + num_low_bytes = PyString_GET_SIZE(bytes) - 1 + while (c_bytes[offset] & 0x80) and offset < num_low_bytes: + val = val | ((c_bytes[offset] & 0x7F) << shift) + shift = shift + 7 + offset = offset + 1 + if c_bytes[offset] & 0x80: + raise ValueError('Data not properly formatted, we ran out of' + ' bytes before 0x80 stopped being set.') + val = val | (c_bytes[offset] << shift) + offset = offset + 1 + if val < 0: + uval = <unsigned int> val + return uval, offset + return val, offset + + |