1 files changed, 674 insertions, 0 deletions
diff --git a/git/fun.py b/git/fun.py
new file mode 100644
index 00000000..5bbe8efc
--- /dev/null
+++ b/git/fun.py
@@ -0,0 +1,674 @@
+# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
+#
+# This module is part of GitDB and is released under
+# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
+"""Contains basic c-functions which usually contain performance critical code
+Keeping this code separate from the beginning makes it easier to out-source
+it into c later, if required"""
+
+from exc import (
+	BadObjectType
+	)
+
+from util import zlib
+decompressobj = zlib.decompressobj
+
+import mmap
+from itertools import islice, izip
+
+from cStringIO import StringIO
+
+# INVARIANTS
+OFS_DELTA = 6
+REF_DELTA = 7
+delta_types = (OFS_DELTA, REF_DELTA)
+
+type_id_to_type_map = 	{
+							0 : "",				# EXT 1
+							1 : "commit",
+							2 : "tree",
+							3 : "blob",
+							4 : "tag",
+							5 : "",				# EXT 2
+							OFS_DELTA : "OFS_DELTA", 	# OFFSET DELTA
+							REF_DELTA : "REF_DELTA"		# REFERENCE DELTA
+						}
+
+type_to_type_id_map = dict(
+							commit=1, 
+							tree=2,
+							blob=3,
+							tag=4,
+							OFS_DELTA=OFS_DELTA,
+							REF_DELTA=REF_DELTA
+						)
+
+# used when dealing with larger streams
+chunk_size = 1000*mmap.PAGESIZE
+
+__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info', 
+			'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data', 
+			'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')
+
+
+#{ Structures
+
+def _set_delta_rbound(d, size):
+	"""Truncate the given delta to the given size
+	:param size: size relative to our target offset, may not be 0, must be smaller or equal
+		to our size
+	:return: d"""
+	d.ts = size
+	
+	# NOTE: data is truncated automatically when applying the delta
+	# MUST NOT DO THIS HERE
+	return d
+			
+def _move_delta_lbound(d, bytes):
+	"""Move the delta by the given amount of bytes, reducing its size so that its
+	right bound stays static
+	:param bytes: amount of bytes to move, must be smaller than delta size
+	:return: d"""
+	if bytes == 0:
+		return
+		
+	d.to += bytes
+	d.so += bytes
+	d.ts -= bytes
+	if d.data is not None:
+		d.data = d.data[bytes:]
+	# END handle data
+	
+	return d
+	
+def delta_duplicate(src):
+	return DeltaChunk(src.to, src.ts, src.so, src.data)
+	
+def delta_chunk_apply(dc, bbuf, write):
+	"""Apply own data to the target buffer
+	:param bbuf: buffer providing source bytes for copy operations
+	:param write: write method to call with data to write"""
+	if dc.data is None:
+		# COPY DATA FROM SOURCE
+		write(buffer(bbuf, dc.so, dc.ts))
+	else:
+		# APPEND DATA
+		# whats faster: if + 4 function calls or just a write with a slice ?
+		# Considering data can be larger than 127 bytes now, it should be worth it
+		if dc.ts < len(dc.data):
+			write(dc.data[:dc.ts])
+		else:
+			write(dc.data)
+		# END handle truncation
+	# END handle chunk mode
+
+
+class DeltaChunk(object):
+	"""Represents a piece of a delta, it can either add new data, or copy existing
+	one from a source buffer"""
+	__slots__ = (
+					'to',		# start offset in the target buffer in bytes 
+					'ts',		# size of this chunk in the target buffer in bytes
+					'so',		# start offset in the source buffer in bytes or None
+					'data',		# chunk of bytes to be added to the target buffer,
+								# DeltaChunkList to use as base, or None
+				)
+	
+	def __init__(self, to, ts, so, data):
+		self.to = to
+		self.ts = ts
+		self.so = so
+		self.data = data
+
+	def __repr__(self):
+		return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")
+	
+	#{ Interface
+		
+	def rbound(self):
+		return self.to + self.ts
+		
+	def has_data(self):
+		""":return: True if the instance has data to add to the target stream"""
+		return self.data is not None
+		
+	#} END interface
+
+def _closest_index(dcl, absofs):
+	""":return: index at which the given absofs should be inserted. The index points
+	to the DeltaChunk with a target buffer absofs that equals or is greater than
+	absofs. 
+	:note: global method for performance only, it belongs to DeltaChunkList"""
+	lo = 0
+	hi = len(dcl)
+	while lo < hi:
+		mid = (lo + hi) / 2
+		dc = dcl[mid]
+		if dc.to > absofs:
+			hi = mid
+		elif dc.rbound() > absofs or dc.to == absofs:
+			return mid
+		else:
+			lo = mid + 1
+		# END handle bound
+	# END for each delta absofs
+	return len(dcl)-1
+	
+def delta_list_apply(dcl, bbuf, write):
+	"""Apply the chain's changes and write the final result using the passed
+	write function.
+	:param bbuf: base buffer containing the base of all deltas contained in this
+		list. It will only be used if the chunk in question does not have a base
+		chain.
+	:param write: function taking a string of bytes to write to the output"""
+	for dc in dcl:
+		delta_chunk_apply(dc, bbuf, write)
+	# END for each dc
+
+def delta_list_slice(dcl, absofs, size, ndcl):
+	""":return: Subsection of this  list at the given absolute  offset, with the given 
+		size in bytes.
+	:return: None"""
+	cdi = _closest_index(dcl, absofs)	# delta start index
+	cd = dcl[cdi]
+	slen = len(dcl)
+	lappend = ndcl.append 
+	
+	if cd.to != absofs:
+		tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
+		_move_delta_lbound(tcd, absofs - cd.to)
+		tcd.ts = min(tcd.ts, size)
+		lappend(tcd)
+		size -= tcd.ts
+		cdi += 1
+	# END lbound overlap handling
+	
+	while cdi < slen and size:
+		# are we larger than the current block
+		cd = dcl[cdi]
+		if cd.ts <= size:
+			lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data))
+			size -= cd.ts
+		else:
+			tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
+			tcd.ts = size
+			lappend(tcd)
+			size -= tcd.ts
+			break
+		# END hadle size
+		cdi += 1
+	# END for each chunk
+	
+	
+class DeltaChunkList(list):
+	"""List with special functionality to deal with DeltaChunks.
+	There are two types of lists we represent. The one was created bottom-up, working
+	towards the latest delta, the other kind was created top-down, working from the 
+	latest delta down to the earliest ancestor. This attribute is queryable 
+	after all processing with is_reversed."""
+	
+	__slots__ = tuple()
+	
+	def rbound(self):
+		""":return: rightmost extend in bytes, absolute"""
+		if len(self) == 0:
+			return 0
+		return self[-1].rbound()
+		
+	def lbound(self):
+		""":return: leftmost byte at which this chunklist starts"""
+		if len(self) == 0:
+			return 0
+		return self[0].to
+		
+	def size(self):
+		""":return: size of bytes as measured by our delta chunks"""
+		return self.rbound() - self.lbound()
+		
+	def apply(self, bbuf, write):
+		"""Only used by public clients, internally we only use the global routines
+		for performance"""
+		return delta_list_apply(self, bbuf, write)
+		
+	def compress(self):
+		"""Alter the list to reduce the amount of nodes. Currently we concatenate
+		add-chunks
+		:return: self"""
+		slen = len(self)
+		if slen < 2:
+			return self
+		i = 0
+		slen_orig = slen
+		
+		first_data_index = None
+		while i < slen:
+			dc = self[i]
+			i += 1
+			if dc.data is None:
+				if first_data_index is not None and i-2-first_data_index > 1:
+				#if first_data_index is not None:
+					nd = StringIO()						# new data
+					so = self[first_data_index].to		# start offset in target buffer
+					for x in xrange(first_data_index, i-1):
+						xdc = self[x]
+						nd.write(xdc.data[:xdc.ts])
+					# END collect data
+					
+					del(self[first_data_index:i-1])
+					buf = nd.getvalue()
+					self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf)) 
+					
+					slen = len(self)
+					i = first_data_index + 1
+					
+				# END concatenate data
+				first_data_index = None
+				continue
+			# END skip non-data chunks
+			
+			if first_data_index is None:
+				first_data_index = i-1
+		# END iterate list
+		
+		#if slen_orig != len(self):
+		#	print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100)
+		return self
+		
+	def check_integrity(self, target_size=-1):
+		"""Verify the list has non-overlapping chunks only, and the total size matches
+		target_size
+		:param target_size: if not -1, the total size of the chain must be target_size
+		:raise AssertionError: if the size doen't match"""
+		if target_size > -1:
+			assert self[-1].rbound() == target_size
+			assert reduce(lambda x,y: x+y, (d.ts for d in self), 0) == target_size
+		# END target size verification
+		
+		if len(self) < 2:
+			return
+			
+		# check data
+		for dc in self:
+			assert dc.ts > 0
+			if dc.has_data():
+				assert len(dc.data) >= dc.ts
+		# END for each dc
+			
+		left = islice(self, 0, len(self)-1)
+		right = iter(self)
+		right.next()
+		# this is very pythonic - we might have just use index based access here, 
+		# but this could actually be faster
+		for lft,rgt in izip(left, right):
+			assert lft.rbound() == rgt.to
+			assert lft.to + lft.ts == rgt.to
+		# END for each pair
+		
+
+class TopdownDeltaChunkList(DeltaChunkList):
+	"""Represents a list which is generated by feeding its ancestor streams one by 
+	one"""
+	__slots__ = tuple() 
+	
+	def connect_with_next_base(self, bdcl):
+		"""Connect this chain with the next level of our base delta chunklist.
+		The goal in this game is to mark as many of our chunks rigid, hence they
+		cannot be changed by any of the upcoming bases anymore. Once all our 
+		chunks are marked like that, we can stop all processing
+		:param bdcl: data chunk list being one of our bases. They must be fed in 
+			consequtively and in order, towards the earliest ancestor delta
+		:return: True if processing was done. Use it to abort processing of
+			remaining streams if False is returned"""
+		nfc = 0								# number of frozen chunks
+		dci = 0								# delta chunk index
+		slen = len(self)					# len of self
+		ccl = list()						# temporary list
+		while dci < slen:
+			dc = self[dci]
+			dci += 1
+			
+			# all add-chunks which are already topmost don't need additional processing
+			if dc.data is not None:
+				nfc += 1
+				continue
+			# END skip add chunks
+			
+			# copy chunks
+			# integrate the portion of the base list into ourselves. Lists
+			# dont support efficient insertion ( just one at a time ), but for now
+			# we live with it. Internally, its all just a 32/64bit pointer, and
+			# the portions of moved memory should be smallish. Maybe we just rebuild
+			# ourselves in order to reduce the amount of insertions ...
+			del(ccl[:])
+			delta_list_slice(bdcl, dc.so, dc.ts, ccl)
+			
+			# move the target bounds into place to match with our chunk
+			ofs = dc.to - dc.so
+			for cdc in ccl:
+				cdc.to += ofs
+			# END update target bounds
+			
+			if len(ccl) == 1:
+				self[dci-1] = ccl[0]
+			else:
+				# maybe try to compute the expenses here, and pick the right algorithm
+				# It would normally be faster than copying everything physically though
+				# TODO: Use a deque here, and decide by the index whether to extend
+				# or extend left !
+				post_dci = self[dci:]
+				del(self[dci-1:])			# include deletion of dc
+				self.extend(ccl)
+				self.extend(post_dci)
+				
+				slen = len(self)
+				dci += len(ccl)-1			# deleted dc, added rest
+				
+			# END handle chunk replacement
+		# END for each chunk
+		
+		if nfc == slen:
+			return False
+		# END handle completeness
+		return True
+		
+		
+#} END structures
+
+#{ Routines
+
+def is_loose_object(m):
+	"""
+	:return: True the file contained in memory map m appears to be a loose object.
+		Only the first two bytes are needed"""
+	b0, b1 = map(ord, m[:2])
+	word = (b0 << 8) + b1
+	return b0 == 0x78 and (word % 31) == 0
+
+def loose_object_header_info(m):
+	"""
+	:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the 
+		object as well as its uncompressed size in bytes.
+	:param m: memory map from which to read the compressed object data"""
+	decompress_size = 8192		# is used in cgit as well
+	hdr = decompressobj().decompress(m, decompress_size)
+	type_name, size = hdr[:hdr.find("\0")].split(" ")
+	return type_name, int(size)
+	
+def pack_object_header_info(data):
+	"""
+	:return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
+		The type_id should be interpreted according to the ``type_id_to_type_map`` map
+		The byte-offset specifies the start of the actual zlib compressed datastream
+	:param m: random-access memory, like a string or memory map"""
+	c = ord(data[0])				# first byte
+	i = 1							# next char to read
+	type_id = (c >> 4) & 7			# numeric type
+	size = c & 15					# starting size
+	s = 4							# starting bit-shift size
+	while c & 0x80:
+		c = ord(data[i])
+		i += 1
+		size += (c & 0x7f) << s
+		s += 7
+	# END character loop
+	return (type_id, size, i)
+
+def create_pack_object_header(obj_type, obj_size):
+	""":return: string defining the pack header comprised of the object type
+	and its incompressed size in bytes
+	:parmam obj_type: pack type_id of the object
+	:param obj_size: uncompressed size in bytes of the following object stream"""
+	c = 0		# 1 byte
+	hdr = str()	# output string
+
+	c = (obj_type << 4) | (obj_size & 0xf)
+	obj_size >>= 4
+	while obj_size:
+		hdr += chr(c | 0x80)
+		c = obj_size & 0x7f
+		obj_size >>= 7
+	#END until size is consumed
+	hdr += chr(c)
+	return hdr
+	
+def msb_size(data, offset=0):
+	"""
+	:return: tuple(read_bytes, size) read the msb size from the given random 
+		access data starting at the given byte offset"""
+	size = 0
+	i = 0
+	l = len(data)
+	hit_msb = False
+	while i < l:
+		c = ord(data[i+offset])
+		size |= (c & 0x7f) << i*7
+		i += 1
+		if not c & 0x80:
+			hit_msb = True
+			break
+		# END check msb bit
+	# END while in range
+	if not hit_msb:
+		raise AssertionError("Could not find terminating MSB byte in data stream")
+	return i+offset, size 
+	
+def loose_object_header(type, size):
+	"""
+	:return: string representing the loose object header, which is immediately
+		followed by the content stream of size 'size'"""
+	return "%s %i\0" % (type, size)
+		
+def write_object(type, size, read, write, chunk_size=chunk_size):
+	"""
+	Write the object as identified by type, size and source_stream into the 
+	target_stream
+	
+	:param type: type string of the object
+	:param size: amount of bytes to write from source_stream
+	:param read: read method of a stream providing the content data
+	:param write: write method of the output stream
+	:param close_target_stream: if True, the target stream will be closed when
+		the routine exits, even if an error is thrown
+	:return: The actual amount of bytes written to stream, which includes the header and a trailing newline"""
+	tbw = 0												# total num bytes written
+	
+	# WRITE HEADER: type SP size NULL
+	tbw += write(loose_object_header(type, size))
+	tbw += stream_copy(read, write, size, chunk_size)
+	
+	return tbw
+
+def stream_copy(read, write, size, chunk_size):
+	"""
+	Copy a stream up to size bytes using the provided read and write methods, 
+	in chunks of chunk_size
+	
+	:note: its much like stream_copy utility, but operates just using methods"""
+	dbw = 0												# num data bytes written
+	
+	# WRITE ALL DATA UP TO SIZE
+	while True:
+		cs = min(chunk_size, size-dbw)
+		# NOTE: not all write methods return the amount of written bytes, like
+		# mmap.write. Its bad, but we just deal with it ... perhaps its not 
+		# even less efficient
+		# data_len = write(read(cs))
+		# dbw += data_len
+		data = read(cs)
+		data_len = len(data)
+		dbw += data_len
+		write(data)
+		if data_len < cs or dbw == size:
+			break
+		# END check for stream end
+	# END duplicate data
+	return dbw
+	
+def connect_deltas(dstreams):
+	"""
+	Read the condensed delta chunk information from dstream and merge its information
+		into a list of existing delta chunks
+	
+	:param dstreams: iterable of delta stream objects, the delta to be applied last
+		comes first, then all its ancestors in order
+	:return: DeltaChunkList, containing all operations to apply"""
+	tdcl = None							# topmost dcl
+	
+	dcl = tdcl = TopdownDeltaChunkList()
+	for dsi, ds in enumerate(dstreams):
+		# print "Stream", dsi
+		db = ds.read()
+		delta_buf_size = ds.size
+		
+		# read header
+		i, base_size = msb_size(db)
+		i, target_size = msb_size(db, i)
+		
+		# interpret opcodes
+		tbw = 0						# amount of target bytes written
+		while i < delta_buf_size:
+			c = ord(db[i])
+			i += 1
+			if c & 0x80:
+				cp_off, cp_size = 0, 0
+				if (c & 0x01):
+					cp_off = ord(db[i])
+					i += 1
+				if (c & 0x02):
+					cp_off |= (ord(db[i]) << 8)
+					i += 1
+				if (c & 0x04):
+					cp_off |= (ord(db[i]) << 16)
+					i += 1
+				if (c & 0x08):
+					cp_off |= (ord(db[i]) << 24)
+					i += 1
+				if (c & 0x10):
+					cp_size = ord(db[i])
+					i += 1
+				if (c & 0x20):
+					cp_size |= (ord(db[i]) << 8)
+					i += 1
+				if (c & 0x40):
+					cp_size |= (ord(db[i]) << 16)
+					i += 1
+					
+				if not cp_size: 
+					cp_size = 0x10000
+				
+				rbound = cp_off + cp_size
+				if (rbound < cp_size or
+					rbound > base_size):
+					break
+				
+				dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
+				tbw += cp_size
+			elif c:
+				# NOTE: in C, the data chunks should probably be concatenated here.
+				# In python, we do it as a post-process
+				dcl.append(DeltaChunk(tbw, c, 0, db[i:i+c]))
+				i += c
+				tbw += c
+			else:
+				raise ValueError("unexpected delta opcode 0")
+			# END handle command byte
+		# END while processing delta data
+		
+		dcl.compress()
+		
+		# merge the lists !
+		if dsi > 0:
+			if not tdcl.connect_with_next_base(dcl):
+				break
+		# END handle merge
+		
+		# prepare next base
+		dcl = DeltaChunkList()
+	# END for each delta stream
+	
+	return tdcl
+	
+def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
+	"""
+	Apply data from a delta buffer using a source buffer to the target file
+	
+	:param src_buf: random access data from which the delta was created
+	:param src_buf_size: size of the source buffer in bytes
+	:param delta_buf_size: size fo the delta buffer in bytes
+	:param delta_buf: random access delta data
+	:param write: write method taking a chunk of bytes
+	:note: transcribed to python from the similar routine in patch-delta.c"""
+	i = 0
+	db = delta_buf
+	while i < delta_buf_size:
+		c = ord(db[i])
+		i += 1
+		if c & 0x80:
+			cp_off, cp_size = 0, 0
+			if (c & 0x01):
+				cp_off = ord(db[i])
+				i += 1
+			if (c & 0x02):
+				cp_off |= (ord(db[i]) << 8)
+				i += 1
+			if (c & 0x04):
+				cp_off |= (ord(db[i]) << 16)
+				i += 1
+			if (c & 0x08):
+				cp_off |= (ord(db[i]) << 24)
+				i += 1
+			if (c & 0x10):
+				cp_size = ord(db[i])
+				i += 1
+			if (c & 0x20):
+				cp_size |= (ord(db[i]) << 8)
+				i += 1
+			if (c & 0x40):
+				cp_size |= (ord(db[i]) << 16)
+				i += 1
+				
+			if not cp_size: 
+				cp_size = 0x10000
+			
+			rbound = cp_off + cp_size
+			if (rbound < cp_size or
+			    rbound > src_buf_size):
+				break
+			write(buffer(src_buf, cp_off, cp_size))
+		elif c:
+			write(db[i:i+c])
+			i += c
+		else:
+			raise ValueError("unexpected delta opcode 0")
+		# END handle command byte
+	# END while processing delta data
+	
+	# yes, lets use the exact same error message that git uses :)
+	assert i == delta_buf_size, "delta replay has gone wild"
+	
+	
+def is_equal_canonical_sha(canonical_length, match, sha1):
+	"""
+	:return: True if the given lhs and rhs 20 byte binary shas
+		The comparison will take the canonical_length of the match sha into account, 
+		hence the comparison will only use the last 4 bytes for uneven canonical representations
+	:param match: less than 20 byte sha
+	:param sha1: 20 byte sha"""
+	binary_length = canonical_length/2
+	if match[:binary_length] != sha1[:binary_length]:
+		return False
+		
+	if canonical_length - binary_length and \
+		(ord(match[-1]) ^ ord(sha1[len(match)-1])) & 0xf0:
+		return False
+	# END handle uneven canonnical length
+	return True
+	
+#} END routines
+
+
+try:
+	# raise ImportError; # DEBUG
+	from _perf import connect_deltas
+except ImportError:
+	pass