commit.create_from_tree now uses a pure Python implementation; fixed message parsing, which truncated newlines although doing so was illegitimate. It's up to the reader to truncate these; nowhere in the git code could I find anyone adding newlines to commits where they are written

Added performance tests for serialization; it does about 5k commits per second when writing to tmpfs
author: Sebastian Thiel <byronimo@gmail.com> 2010-06-03 23:20:34 +0200
committer: Sebastian Thiel <byronimo@gmail.com> 2010-06-03 23:20:34 +0200
commit: 1e2b46138ba58033738a24dadccc265748fce2ca (patch)
tree: 0f2a625a371c16cc95e53e024e007d8b89d87c92 /lib/git
parent: 4b4a514e51fbc7dc6ddcb27c188159d57b5d1fa9 (diff)
download: gitpython-1e2b46138ba58033738a24dadccc265748fce2ca.tar.gz
7 files changed, 104 insertions, 64 deletions
diff --git a/lib/git/cmd.py b/lib/git/cmd.py
index aaa27adc..18d1c505 100644
--- a/lib/git/cmd.py
+++ b/lib/git/cmd.py
@@ -323,12 +323,7 @@ class Git(object):
 				stdout_value = proc.stdout.read().rstrip()		# strip trailing "\n"
 			else:
 				max_chunk_size = 1024*64
-				while True:
-					chunk = proc.stdout.read(max_chunk_size)
-					output_stream.write(chunk)
-					if len(chunk) < max_chunk_size:
-						break
-				# END reading output stream
+				stream_copy(proc.stdout, output_stream, max_chunk_size)
 				stdout_value = output_stream
 			# END stdout handling
 			stderr_value = proc.stderr.read().rstrip()			# strip trailing "\n"
diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py
index 64a5678e..f7043199 100644
--- a/lib/git/objects/base.py
+++ b/lib/git/objects/base.py
@@ -125,8 +125,8 @@ class Object(LazyMixin):
 		Returns 
 			File Object compatible stream to the uncompressed raw data of the object
 		"""
-		sha, type, size, stream = self.repo.git.stream_object_data(self.sha)
-		return stream 
+		proc = self.repo.git.cat_file(self.type, self.sha, as_process=True)
+		return utils.ProcessStreamAdapter(proc, "stdout") 
 
 	def stream_data(self, ostream):
 		"""
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
index 98aca360..d56ce306 100644
--- a/lib/git/objects/commit.py
+++ b/lib/git/objects/commit.py
@@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 		"""
 		super(Commit,self).__init__(repo, sha)
 		self._set_self_from_args_(locals())
-
-		if parents is not None:
-			cls = type(self)
-			self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls))
-		# END for each parent to convert
-			
-		if self.sha and tree is not None:
-			self.tree = Tree(repo, tree, path='')
-		# END id to tree conversion
 		
 	@classmethod
 	def _get_intermediate_items(cls, commit):
@@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 						committer, committer_time, committer_offset,
 						message, parent_commits, conf_encoding)
 		
-		# serialize !
+		stream = StringIO()
+		new_commit._serialize(stream)
+		streamlen = stream.tell()
+		stream.seek(0)
+		
+		new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True)
 		
 		if head:
 			try:
@@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 	#{ Serializable Implementation
 	
 	def _serialize(self, stream):
-		# for now, this is very inefficient and in fact shouldn't be used like this
-		return super(Commit, self)._serialize(stream)
+		write = stream.write
+		write("tree %s\n" % self.tree)
+		for p in self.parents:
+			write("parent %s\n" % p)
+			
+		a = self.author
+		c = self.committer
+		fmt = "%s %s <%s> %s %s\n"
+		write(fmt % ("author", a.name, a.email, 
+						self.authored_date, 
+						utils.altz_to_utctz_str(self.author_tz_offset)))
+			
+		write(fmt % ("committer", c.name, c.email, 
+						self.committed_date,
+						utils.altz_to_utctz_str(self.committer_tz_offset)))
+		
+		if self.encoding != self.default_encoding:
+			write("encoding %s\n" % self.encoding)
+		
+		write("\n")
+		write(self.message)
+		return self
 	
 	def _deserialize(self, stream):
 		""":param from_rev_list: if true, the stream format is coming from the rev-list command
@@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 		
 		# a stream from our data simply gives us the plain message
 		# The end of our message stream is marked with a newline that we strip
-		self.message = stream.read()[:-1]
+		self.message = stream.read()
 		return self
 		
 	#} END serializable implementation
diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py
index 6d378a72..c93f2091 100644
--- a/lib/git/objects/utils.py
+++ b/lib/git/objects/utils.py
@@ -16,7 +16,8 @@ import time
 import os
 
 __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', 
-			'ProcessStreamAdapter', 'Traversable')
+			'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', 
+			'verify_utctz')
 
 def get_object_type_by_name(object_type_name):
 	"""
@@ -57,14 +58,24 @@ def get_user_id():
 	return "%s@%s" % (username, platform.node())
 		
 
-def _utc_tz_to_altz(utctz):
+def utctz_to_altz(utctz):
 	"""we convert utctz to the timezone in seconds, it is the format time.altzone
 	returns. Git stores it as UTC timezon which has the opposite sign as well, 
 	which explains the -1 * ( that was made explicit here )
 	:param utctz: git utc timezone string, i.e. +0200"""
 	return -1 * int(float(utctz)/100*3600)
+	
+def altz_to_utctz_str(altz):
+	"""As above, but inverses the operation, returning a string that can be used
+	in commit objects"""
+	utci = -1 * int((altz / 3600)*100)
+	utcs = str(abs(utci))
+	utcs = "0"*(4-len(utcs)) + utcs
+	prefix = (utci < 0 and '-') or '+'
+	return prefix + utcs
+	
 
-def _verify_utctz(offset):
+def verify_utctz(offset):
 	""":raise ValueError: if offset is incorrect
 	:return: offset"""
 	fmt_exc = ValueError("Invalid timezone offset format: %s" % offset)
@@ -97,11 +108,11 @@ def parse_date(string_date):
 		if string_date.count(' ') == 1 and string_date.rfind(':') == -1:
 			timestamp, offset = string_date.split()
 			timestamp = int(timestamp)
-			return timestamp, _utc_tz_to_altz(_verify_utctz(offset))
+			return timestamp, utctz_to_altz(verify_utctz(offset))
 		else:
 			offset = "+0000"					# local time by default
 			if string_date[-5] in '-+':
-				offset = _verify_utctz(string_date[-5:])
+				offset = verify_utctz(string_date[-5:])
 				string_date = string_date[:-6]	# skip space as well
 			# END split timezone info
 			
@@ -139,7 +150,7 @@ def parse_date(string_date):
 					fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, 
 												tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec,
 												dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst))
-					return int(time.mktime(fstruct)), _utc_tz_to_altz(offset)
+					return int(time.mktime(fstruct)), utctz_to_altz(offset)
 				except ValueError:
 					continue
 				# END exception handling
@@ -167,7 +178,7 @@ def parse_actor_and_date(line):
 	"""
 	m = _re_actor_epoch.search(line)
 	actor, epoch, offset = m.groups()
-	return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset))
+	return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset))
 	
 	
 	
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index 94d1cea8..fd340962 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -137,7 +137,7 @@ class DecompressMemMapReader(object):
 		# END handle size
 		
 		# read header
-		maxb = 8192
+		maxb = 512				# should really be enough, cgit uses 8192 I believe
 		self._s = maxb
 		hdr = self.read(maxb)
 		hdrend = hdr.find("\0")
@@ -172,20 +172,24 @@ class DecompressMemMapReader(object):
 		# Our performance now depends on StringIO. This way we don't need two large
 		# buffers in peak times, but only one large one in the end which is 
 		# the return buffer
-		if size > self.max_read_size:
-			sio = StringIO()
-			while size:
-				read_size = min(self.max_read_size, size)
-				data = self.read(read_size)
-				sio.write(data)
-				size -= len(data)
-				if len(data) < read_size:
-					break
-			# END data loop
-			sio.seek(0)
-			return sio.getvalue()
-		# END handle maxread
+		# NO: We don't do it - if the user thinks its best, he is right. If he 
+		# has trouble, he will start reading in chunks. According to our tests
+		# its still faster if we read 10 Mb at once instead of chunking it.
 		
+		# if size > self.max_read_size:
+			# sio = StringIO()
+			# while size:
+				# read_size = min(self.max_read_size, size)
+				# data = self.read(read_size)
+				# sio.write(data)
+				# size -= len(data)
+				# if len(data) < read_size:
+					# break
+			# # END data loop
+			# sio.seek(0)
+			# return sio.getvalue()
+		# # END handle maxread
+		# 
 		# deplete the buffer, then just continue using the decompress object 
 		# which has an own buffer. We just need this to transparently parse the 
 		# header from the zlib stream
diff --git a/lib/git/repo.py b/lib/git/repo.py
index f4caa3fb..0bd2249c 100644
--- a/lib/git/repo.py
+++ b/lib/git/repo.py
@@ -4,12 +4,6 @@
 # This module is part of GitPython and is released under
 # the BSD License: http://www.opensource.org/licenses/bsd-license.php
 
-import os
-import sys
-import re
-import gzip
-import StringIO
-
 from errors import InvalidGitRepositoryError, NoSuchPathError
 from cmd import Git
 from actor import Actor
@@ -19,6 +13,15 @@ from objects import *
 from config import GitConfigParser
 from remote import Remote
 
+from odb.db import LooseObjectDB
+
+import os
+import sys
+import re
+import gzip
+import StringIO
+
+
 def touch(filename):
     fp = open(filename, "a")
     fp.close()
@@ -53,7 +56,7 @@ class Repo(object):
     'git_dir' is the .git repository directoy, which is always set.
     """
     DAEMON_EXPORT_FILE = 'git-daemon-export-ok'
-    __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" )
+    __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" )
     
     # precompiled regex
     re_whitespace = re.compile(r'\s+')
@@ -65,27 +68,22 @@ class Repo(object):
     # represents the configuration level of a configuration file
     config_level = ("system", "global", "repository")
 
-    def __init__(self, path=None):
-        """
-        Create a new Repo instance
-
-        ``path``
-            is the path to either the root git directory or the bare git repo
+    def __init__(self, path=None, odbt = LooseObjectDB):
+        """ Create a new Repo instance
 
-        Examples::
+		:param path: is the path to either the root git directory or the bare git repo::
 
             repo = Repo("/Users/mtrier/Development/git-python")
             repo = Repo("/Users/mtrier/Development/git-python.git")
             repo = Repo("~/Development/git-python.git")
             repo = Repo("$REPOSITORIES/Development/git-python.git")
-            
-        Raises
-            InvalidGitRepositoryError or NoSuchPathError
-
-        Returns
-            ``git.Repo``
-        """
-
+        
+        :param odbt: Object DataBase type - a type which is constructed by providing 
+        	the directory containing the database objects, i.e. .git/objects. It will
+        	be used to access all object data
+        :raise InvalidGitRepositoryError:
+        :raise NoSuchPathError:
+		:return: git.Repo """
         epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd())))
 
         if not os.path.exists(epath):
@@ -130,6 +128,7 @@ class Repo(object):
         
         self.working_dir = self._working_tree_dir or self.git_dir
         self.git = Git(self.working_dir)
+        self.odb = odbt(os.path.join(self.git_dir, 'objects'))
 
     def __eq__(self, rhs):
     	if isinstance(rhs, Repo):
diff --git a/lib/git/utils.py b/lib/git/utils.py
index 360c77c9..60a7de48 100644
--- a/lib/git/utils.py
+++ b/lib/git/utils.py
@@ -27,6 +27,21 @@ def make_sha(source=''):
         sha1 = sha.sha(source)
         return sha1
 
+def stream_copy(source, destination, chunk_size=512*1024):
+	"""Copy all data from the source stream into the destination stream in chunks
+	of size chunk_size
+	:return: amount of bytes written"""
+	br = 0
+	while True:
+		chunk = source.read(chunk_size)
+		destination.write(chunk)
+		br += len(chunk)
+		if len(chunk) < chunk_size:
+			break
+	# END reading output stream
+	return br
+	
+
 def join_path(a, *p):
     """Join path tokens together similar to os.path.join, but always use 
     '/' instead of possibly '\' on windows."""
author	Sebastian Thiel <byronimo@gmail.com>	2010-06-03 23:20:34 +0200
committer	Sebastian Thiel <byronimo@gmail.com>	2010-06-03 23:20:34 +0200
commit	1e2b46138ba58033738a24dadccc265748fce2ca (patch)
tree	0f2a625a371c16cc95e53e024e007d8b89d87c92 /lib/git
parent	4b4a514e51fbc7dc6ddcb27c188159d57b5d1fa9 (diff)
download	gitpython-1e2b46138ba58033738a24dadccc265748fce2ca.tar.gz