diff options
Diffstat (limited to 'git/fun.py')
-rw-r--r-- | git/fun.py | 674 |
1 files changed, 674 insertions, 0 deletions
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains basic c-functions which usually contain performance critical code.
Keeping this code separate from the beginning makes it easier to out-source
it into c later, if required"""

from exc import (
    BadObjectType
)

from util import zlib
decompressobj = zlib.decompressobj

import mmap
from itertools import islice, izip

from cStringIO import StringIO

# INVARIANTS
OFS_DELTA = 6
REF_DELTA = 7
delta_types = (OFS_DELTA, REF_DELTA)

type_id_to_type_map = {
    0: "",                      # EXT 1
    1: "commit",
    2: "tree",
    3: "blob",
    4: "tag",
    5: "",                      # EXT 2
    OFS_DELTA: "OFS_DELTA",     # OFFSET DELTA
    REF_DELTA: "REF_DELTA"      # REFERENCE DELTA
}

type_to_type_id_map = dict(
    commit=1,
    tree=2,
    blob=3,
    tag=4,
    OFS_DELTA=OFS_DELTA,
    REF_DELTA=REF_DELTA
)

# used when dealing with larger streams
chunk_size = 1000*mmap.PAGESIZE

__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
           'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
           'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')


#{ Structures

def _set_delta_rbound(d, size):
    """Truncate the given delta to the given size

    :param d: DeltaChunk to truncate in place
    :param size: size relative to our target offset, may not be 0, must be smaller or equal
        to our size
    :return: d"""
    d.ts = size

    # NOTE: data is truncated automatically when applying the delta
    # MUST NOT DO THIS HERE
    return d


def _move_delta_lbound(d, bytes):
    """Move the delta by the given amount of bytes, reducing its size so that its
    right bound stays static

    :param d: DeltaChunk to move in place
    :param bytes: amount of bytes to move, must be smaller than delta size
    :return: d"""
    if bytes == 0:
        # nothing to move - still hand back the chunk, as documented
        return d

    d.to += bytes
    d.so += bytes
    d.ts -= bytes
    if d.data is not None:
        d.data = d.data[bytes:]
    # END handle data

    return d


def delta_duplicate(src):
    """:return: new DeltaChunk carrying the same four field values as src"""
    return DeltaChunk(src.to, src.ts, src.so, src.data)


def delta_chunk_apply(dc, bbuf, write):
    """Apply the chunk's data to the target buffer

    :param dc: DeltaChunk to apply
    :param bbuf: buffer providing source bytes for copy operations
    :param write: write method to call with data to write"""
    if dc.data is None:
        # COPY DATA FROM SOURCE
        write(buffer(bbuf, dc.so, dc.ts))
    else:
        # APPEND DATA
        # The chunk may have been truncated through its size only (see
        # _set_delta_rbound), so data can be longer than ts and must be cut here.
        if dc.ts < len(dc.data):
            write(dc.data[:dc.ts])
        else:
            write(dc.data)
        # END handle truncation
    # END handle chunk mode


class DeltaChunk(object):
    """Represents a piece of a delta, it can either add new data, or copy existing
    one from a source buffer"""
    __slots__ = (
        'to',       # start offset in the target buffer in bytes
        'ts',       # size of this chunk in the target buffer in bytes
        'so',       # start offset in the source buffer in bytes or None
        'data',     # chunk of bytes to be added to the target buffer,
                    # DeltaChunkList to use as base, or None
    )

    def __init__(self, to, ts, so, data):
        self.to = to
        self.ts = ts
        self.so = so
        self.data = data

    def __repr__(self):
        return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")

    #{ Interface

    def rbound(self):
        """:return: offset one past the last target byte covered by this chunk"""
        return self.to + self.ts

    def has_data(self):
        """:return: True if the instance has data to add to the target stream"""
        return self.data is not None

    #} END interface


def _closest_index(dcl, absofs):
    """:return: index at which the given absofs should be inserted. The index points
        to the DeltaChunk with a target buffer absofs that equals or is greater than
        absofs. If no chunk matches, the index of the last chunk is returned.
    :note: global method for performance only, it belongs to DeltaChunkList"""
    lo = 0
    hi = len(dcl)
    while lo < hi:
        # floor division keeps mid an integer index on every python version
        mid = (lo + hi) // 2
        dc = dcl[mid]
        if dc.to > absofs:
            hi = mid
        elif dc.rbound() > absofs or dc.to == absofs:
            return mid
        else:
            lo = mid + 1
        # END handle bound
    # END for each delta absofs
    return len(dcl)-1


def delta_list_apply(dcl, bbuf, write):
    """Apply the chain's changes and write the final result using the passed
    write function.

    :param dcl: iterable of DeltaChunks to apply in order
    :param bbuf: base buffer containing the base of all deltas contained in this
        list. It is only read by chunks which do not carry data of their own.
    :param write: function taking a string of bytes to write to the output"""
    for dc in dcl:
        delta_chunk_apply(dc, bbuf, write)
    # END for each dc


def delta_list_slice(dcl, absofs, size, ndcl):
    """Append copies of the chunks covering the given range of dcl to ndcl.
    Chunks overlapping the range's bounds are truncated to fit.

    :param dcl: list of DeltaChunks to read from, it is not altered
    :param absofs: absolute target offset at which the subsection starts
    :param size: size of the subsection in bytes
    :param ndcl: list receiving the new DeltaChunk instances
    :return: None"""
    cdi = _closest_index(dcl, absofs)   # delta start index
    cd = dcl[cdi]
    slen = len(dcl)
    lappend = ndcl.append

    if cd.to != absofs:
        # the range starts within the chunk: cut off its head, and its tail
        # as well if the range ends within the very same chunk
        tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
        _move_delta_lbound(tcd, absofs - cd.to)
        tcd.ts = min(tcd.ts, size)
        lappend(tcd)
        size -= tcd.ts
        cdi += 1
    # END lbound overlap handling

    while cdi < slen and size:
        # are we larger than the current block
        cd = dcl[cdi]
        if cd.ts <= size:
            lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data))
            size -= cd.ts
        else:
            # last chunk reaches beyond the range, truncate it
            tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
            tcd.ts = size
            lappend(tcd)
            size -= tcd.ts
            break
        # END handle size
        cdi += 1
    # END for each chunk


class DeltaChunkList(list):
    """List with special functionality to deal with DeltaChunks.
    There are two types of lists we represent. The one was created bottom-up, working
    towards the latest delta, the other kind was created top-down, working from the
    latest delta down to the earliest ancestor."""

    __slots__ = tuple()

    def rbound(self):
        """:return: rightmost extend in bytes, absolute. 0 if the list is empty"""
        if len(self) == 0:
            return 0
        return self[-1].rbound()

    def lbound(self):
        """:return: leftmost byte at which this chunklist starts. 0 if the list is empty"""
        if len(self) == 0:
            return 0
        return self[0].to

    def size(self):
        """:return: size of bytes as measured by our delta chunks"""
        return self.rbound() - self.lbound()

    def apply(self, bbuf, write):
        """Only used by public clients, internally we only use the global routines
        for performance"""
        return delta_list_apply(self, bbuf, write)

    def compress(self):
        """Alter the list to reduce the amount of nodes. Currently we concatenate
        add-chunks. Only runs of at least three consecutive add-chunks which are
        followed by a copy-chunk are merged.

        :return: self"""
        slen = len(self)
        if slen < 2:
            return self
        i = 0

        first_data_index = None
        while i < slen:
            dc = self[i]
            i += 1
            if dc.data is None:
                # a copy chunk ends the run of add-chunks [first_data_index, i-2]
                if first_data_index is not None and i-2-first_data_index > 1:
                    nd = StringIO()                     # new data
                    so = self[first_data_index].to      # start offset in target buffer
                    for x in xrange(first_data_index, i-1):
                        xdc = self[x]
                        nd.write(xdc.data[:xdc.ts])
                    # END collect data

                    del self[first_data_index:i-1]
                    buf = nd.getvalue()
                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                    # the list shrunk - continue right after the merged chunk
                    slen = len(self)
                    i = first_data_index + 1
                # END concatenate data
                first_data_index = None
                continue
            # END skip non-data chunks

            if first_data_index is None:
                first_data_index = i-1
        # END iterate list

        return self

    def check_integrity(self, target_size=-1):
        """Verify the list has non-overlapping chunks only, and the total size matches
        target_size

        :param target_size: if not -1, the total size of the chain must be target_size
        :raise AssertionError: if the size doesn't match, or if the chunks are not
            contiguous"""
        if target_size > -1:
            # rbound() handles the empty list, which has no last chunk to index
            assert self.rbound() == target_size
            assert sum(d.ts for d in self) == target_size
        # END target size verification

        if len(self) < 2:
            return

        # check data
        for dc in self:
            assert dc.ts > 0
            if dc.has_data():
                assert len(dc.data) >= dc.ts
        # END for each dc

        # each chunk must start exactly where its predecessor ends
        for lft, rgt in izip(self, islice(self, 1, None)):
            assert lft.rbound() == rgt.to
        # END for each pair


class TopdownDeltaChunkList(DeltaChunkList):
    """Represents a list which is generated by feeding its ancestor streams one by
    one"""
    __slots__ = tuple()

    def connect_with_next_base(self, bdcl):
        """Connect this chain with the next level of our base delta chunklist.
        The goal in this game is to mark as many of our chunks rigid, hence they
        cannot be changed by any of the upcoming bases anymore. Once all our
        chunks are marked like that, we can stop all processing

        :param bdcl: data chunk list being one of our bases. They must be fed in
            consecutively and in order, towards the earliest ancestor delta
        :return: True if processing was done. Use it to abort processing of
            remaining streams if False is returned"""
        nfc = 0                     # number of frozen chunks
        dci = 0                     # delta chunk index
        slen = len(self)            # len of self
        ccl = list()                # temporary list
        while dci < slen:
            dc = self[dci]
            dci += 1

            # all add-chunks which are already topmost don't need additional processing
            if dc.data is not None:
                nfc += 1
                continue
            # END skip add chunks

            # copy chunks
            # integrate the portion of the base list into ourselves. Lists
            # dont support efficient insertion ( just one at a time ), but for now
            # we live with it. Internally, its all just a 32/64bit pointer, and
            # the portions of moved memory should be smallish. Maybe we just rebuild
            # ourselves in order to reduce the amount of insertions ...
            del ccl[:]
            delta_list_slice(bdcl, dc.so, dc.ts, ccl)

            # move the target bounds into place to match with our chunk
            ofs = dc.to - dc.so
            for cdc in ccl:
                cdc.to += ofs
            # END update target bounds

            if len(ccl) == 1:
                self[dci-1] = ccl[0]
            else:
                # maybe try to compute the expenses here, and pick the right algorithm
                # It would normally be faster than copying everything physically though
                # TODO: Use a deque here, and decide by the index whether to extend
                # or extend left !
                post_dci = self[dci:]
                del self[dci-1:]            # include deletion of dc
                self.extend(ccl)
                self.extend(post_dci)

                slen = len(self)
                dci += len(ccl)-1           # deleted dc, added rest
            # END handle chunk replacement
        # END for each chunk

        if nfc == slen:
            return False
        # END handle completeness
        return True


#} END structures

#{ Routines

def is_loose_object(m):
    """
    :return: True the file contained in memory map m appears to be a loose object.
        Only the first two bytes are needed"""
    b0, b1 = map(ord, m[:2])
    word = (b0 << 8) + b1
    return b0 == 0x78 and (word % 31) == 0


def loose_object_header_info(m):
    """
    :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
        object as well as its uncompressed size in bytes.
    :param m: memory map from which to read the compressed object data"""
    decompress_size = 8192      # is used in cgit as well
    hdr = decompressobj().decompress(m, decompress_size)
    type_name, size = hdr[:hdr.find("\0")].split(" ")
    return type_name, int(size)


def pack_object_header_info(data):
    """
    :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
        The type_id should be interpreted according to the ``type_id_to_type_map`` map
        The byte-offset specifies the start of the actual zlib compressed datastream
    :param data: random-access memory, like a string or memory map"""
    c = ord(data[0])            # first byte
    i = 1                       # next char to read
    type_id = (c >> 4) & 7      # numeric type
    size = c & 15               # starting size
    s = 4                       # starting bit-shift size
    while c & 0x80:
        c = ord(data[i])
        i += 1
        size += (c & 0x7f) << s
        s += 7
    # END character loop
    return (type_id, size, i)


def create_pack_object_header(obj_type, obj_size):
    """:return: string defining the pack header comprised of the object type
        and its uncompressed size in bytes
    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream"""
    hdr = str()                 # output string

    # first byte: type in bits 4-6, lowest four size bits in bits 0-3
    c = (obj_type << 4) | (obj_size & 0xf)
    obj_size >>= 4
    while obj_size:
        # more size bits follow: flag it with the msb, and emit 7 bits per byte
        hdr += chr(c | 0x80)
        c = obj_size & 0x7f
        obj_size >>= 7
    # END until size is consumed
    hdr += chr(c)
    return hdr


def msb_size(data, offset=0):
    """
    :return: tuple(read_bytes, size) read the msb size from the given random
        access data starting at the given byte offset. read_bytes is the absolute
        offset of the first byte following the size.
    :raise AssertionError: if the data ends before a byte without the msb set
        was found"""
    size = 0
    i = 0
    l = len(data)
    hit_msb = False
    # take the offset into account to never read past the end of data
    while i + offset < l:
        c = ord(data[i+offset])
        size |= (c & 0x7f) << i*7
        i += 1
        if not c & 0x80:
            hit_msb = True
            break
        # END check msb bit
    # END while in range
    if not hit_msb:
        raise AssertionError("Could not find terminating MSB byte in data stream")
    return i+offset, size


def loose_object_header(type, size):
    """
    :return: string representing the loose object header, which is immediately
        followed by the content stream of size 'size'"""
    return "%s %i\0" % (type, size)


def write_object(type, size, read, write, chunk_size=chunk_size):
    """
    Write the object as identified by type and size, reading its content through
    read and sending header and content to write

    :param type: type string of the object
    :param size: amount of bytes to copy from the source
    :param read: read method of a stream providing the content data
    :param write: write method of the output stream. Its return value is ignored
    :param chunk_size: maximum amount of bytes to read and write at once
    :return: The actual amount of bytes written to stream, which includes the header"""
    # WRITE HEADER: type SP size NULL
    # NOTE: not all write methods return the amount of written bytes, hence we
    # count the header bytes ourselves, just as stream_copy does for the data
    header = loose_object_header(type, size)
    write(header)

    # total num bytes written
    return len(header) + stream_copy(read, write, size, chunk_size)


def stream_copy(read, write, size, chunk_size):
    """
    Copy a stream up to size bytes using the provided read and write methods,
    in chunks of chunk_size

    :return: amount of data bytes handed to write
    :note: its much like stream_copy utility, but operates just using methods"""
    dbw = 0         # num data bytes written

    # WRITE ALL DATA UP TO SIZE
    while True:
        cs = min(chunk_size, size-dbw)
        # NOTE: not all write methods return the amount of written bytes, like
        # mmap.write. Its bad, but we just deal with it by measuring the data
        # ourselves.
        data = read(cs)
        data_len = len(data)
        dbw += data_len
        write(data)
        # a short read signals the end of the source stream
        if data_len < cs or dbw == size:
            break
        # END check for stream end
    # END duplicate data
    return dbw


def connect_deltas(dstreams):
    """
    Read the condensed delta chunk information from dstream and merge its information
    into a list of existing delta chunks

    :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order
    :return: DeltaChunkList, containing all operations to apply
    :raise ValueError: if a delta stream contains the reserved opcode 0"""
    # the first stream fills the topmost list directly, all others fill a
    # temporary list which is then merged into the topmost one
    dcl = tdcl = TopdownDeltaChunkList()
    for dsi, ds in enumerate(dstreams):
        db = ds.read()
        delta_buf_size = ds.size

        # read header
        i, base_size = msb_size(db)
        i, target_size = msb_size(db, i)

        # interpret opcodes
        tbw = 0     # amount of target bytes written
        while i < delta_buf_size:
            c = ord(db[i])
            i += 1
            if c & 0x80:
                # copy opcode: the low seven bits tell which of the offset
                # and size bytes are present in the stream
                cp_off, cp_size = 0, 0
                if (c & 0x01):
                    cp_off = ord(db[i])
                    i += 1
                if (c & 0x02):
                    cp_off |= (ord(db[i]) << 8)
                    i += 1
                if (c & 0x04):
                    cp_off |= (ord(db[i]) << 16)
                    i += 1
                if (c & 0x08):
                    cp_off |= (ord(db[i]) << 24)
                    i += 1
                if (c & 0x10):
                    cp_size = ord(db[i])
                    i += 1
                if (c & 0x20):
                    cp_size |= (ord(db[i]) << 8)
                    i += 1
                if (c & 0x40):
                    cp_size |= (ord(db[i]) << 16)
                    i += 1

                if not cp_size:
                    cp_size = 0x10000

                # stop interpreting this stream once a copy reaches beyond its base
                rbound = cp_off + cp_size
                if (rbound < cp_size or
                        rbound > base_size):
                    break

                dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                tbw += cp_size
            elif c:
                # add opcode: c bytes of new data follow
                # NOTE: in C, the data chunks should probably be concatenated here.
                # In python, we do it as a post-process
                dcl.append(DeltaChunk(tbw, c, 0, db[i:i+c]))
                i += c
                tbw += c
            else:
                raise ValueError("unexpected delta opcode 0")
            # END handle command byte
        # END while processing delta data

        dcl.compress()

        # merge the lists !
        if dsi > 0:
            if not tdcl.connect_with_next_base(dcl):
                break
        # END handle merge

        # prepare next base
        dcl = DeltaChunkList()
    # END for each delta stream

    return tdcl


def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
    """
    Apply data from a delta buffer using a source buffer to the target file

    :param src_buf: random access data from which the delta was created
    :param src_buf_size: size of the source buffer in bytes
    :param delta_buf_size: size of the delta buffer in bytes
    :param delta_buf: random access delta data
    :param write: write method taking a chunk of bytes
    :raise ValueError: if the delta contains the reserved opcode 0
    :raise AssertionError: if the delta was not consumed entirely
    :note: transcribed to python from the similar routine in patch-delta.c"""
    i = 0
    db = delta_buf
    while i < delta_buf_size:
        c = ord(db[i])
        i += 1
        if c & 0x80:
            # copy opcode: the low seven bits tell which of the offset
            # and size bytes are present in the stream
            cp_off, cp_size = 0, 0
            if (c & 0x01):
                cp_off = ord(db[i])
                i += 1
            if (c & 0x02):
                cp_off |= (ord(db[i]) << 8)
                i += 1
            if (c & 0x04):
                cp_off |= (ord(db[i]) << 16)
                i += 1
            if (c & 0x08):
                cp_off |= (ord(db[i]) << 24)
                i += 1
            if (c & 0x10):
                cp_size = ord(db[i])
                i += 1
            if (c & 0x20):
                cp_size |= (ord(db[i]) << 8)
                i += 1
            if (c & 0x40):
                cp_size |= (ord(db[i]) << 16)
                i += 1

            if not cp_size:
                cp_size = 0x10000

            # a copy reaching beyond the source ends processing early, which
            # is caught by the assertion below
            rbound = cp_off + cp_size
            if (rbound < cp_size or
                    rbound > src_buf_size):
                break
            write(buffer(src_buf, cp_off, cp_size))
        elif c:
            # add opcode: c bytes of new data follow
            write(db[i:i+c])
            i += c
        else:
            raise ValueError("unexpected delta opcode 0")
        # END handle command byte
    # END while processing delta data

    # yes, lets use the exact same error message that git uses :)
    assert i == delta_buf_size, "delta replay has gone wild"


def is_equal_canonical_sha(canonical_length, match, sha1):
    """
    :return: True if the given partial binary sha matches the full binary sha.
        The comparison will take the canonical_length of the match sha into account,
        hence the comparison will only use the upper 4 bits of the last byte for
        uneven canonical representations
    :param canonical_length: length of the partial sha in hexadecimal characters
    :param match: less than 20 byte sha
    :param sha1: 20 byte sha"""
    # amount of bytes which are fully covered by the canonical representation
    binary_length = canonical_length // 2
    if match[:binary_length] != sha1[:binary_length]:
        return False

    # NOTE: the first condition is true for every non-zero canonical_length, not
    # only for uneven ones. If match holds exactly binary_length bytes, its last
    # byte was already found equal above, so the nibble check cannot fail then.
    if canonical_length - binary_length and \
            (ord(match[-1]) ^ ord(sha1[len(match)-1])) & 0xf0:
        return False
    # END handle uneven canonical length
    return True

#} END routines


try:
    # use the accelerated implementation if it is available
    from _perf import connect_deltas
except ImportError:
    pass