diff options
author | Sebastian Thiel <byronimo@gmail.com> | 2010-06-03 23:20:34 +0200 |
---|---|---|
committer | Sebastian Thiel <byronimo@gmail.com> | 2010-06-03 23:20:34 +0200 |
commit | 1e2b46138ba58033738a24dadccc265748fce2ca (patch) | |
tree | 0f2a625a371c16cc95e53e024e007d8b89d87c92 /lib/git | |
parent | 4b4a514e51fbc7dc6ddcb27c188159d57b5d1fa9 (diff) | |
download | gitpython-1e2b46138ba58033738a24dadccc265748fce2ca.tar.gz |
commit.create_from_tree now uses pure python implementation, fixed message parsing which truncated newlines although it was ilegitimate. Its up to the reader to truncate therse, nowhere in the git code I could find anyone adding newlines to commits where it is written
Added performance tests for serialization, it does about 5k commits per second if writing to tmpfs
Diffstat (limited to 'lib/git')
-rw-r--r-- | lib/git/cmd.py | 7 | ||||
-rw-r--r-- | lib/git/objects/base.py | 4 | ||||
-rw-r--r-- | lib/git/objects/commit.py | 42 | ||||
-rw-r--r-- | lib/git/objects/utils.py | 25 | ||||
-rw-r--r-- | lib/git/odb/utils.py | 32 | ||||
-rw-r--r-- | lib/git/repo.py | 43 | ||||
-rw-r--r-- | lib/git/utils.py | 15 |
7 files changed, 104 insertions, 64 deletions
diff --git a/lib/git/cmd.py b/lib/git/cmd.py index aaa27adc..18d1c505 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -323,12 +323,7 @@ class Git(object): stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" else: max_chunk_size = 1024*64 - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream + stream_copy(proc.stdout, output_stream, max_chunk_size) stdout_value = output_stream # END stdout handling stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 64a5678e..f7043199 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -125,8 +125,8 @@ class Object(LazyMixin): Returns File Object compatible stream to the uncompressed raw data of the object """ - sha, type, size, stream = self.repo.git.stream_object_data(self.sha) - return stream + proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) + return utils.ProcessStreamAdapter(proc, "stdout") def stream_data(self, ostream): """ diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 98aca360..d56ce306 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri """ super(Commit,self).__init__(repo, sha) self._set_self_from_args_(locals()) - - if parents is not None: - cls = type(self) - self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls)) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion @classmethod def _get_intermediate_items(cls, commit): @@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri committer, committer_time, committer_offset, message, parent_commits, conf_encoding) - # serialize ! + stream = StringIO() + new_commit._serialize(stream) + streamlen = stream.tell() + stream.seek(0) + + new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True) if head: try: @@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri #{ Serializable Implementation def _serialize(self, stream): - # for now, this is very inefficient and in fact shouldn't be used like this - return super(Commit, self)._serialize(stream) + write = stream.write + write("tree %s\n" % self.tree) + for p in self.parents: + write("parent %s\n" % p) + + a = self.author + c = self.committer + fmt = "%s %s <%s> %s %s\n" + write(fmt % ("author", a.name, a.email, + self.authored_date, + utils.altz_to_utctz_str(self.author_tz_offset))) + + write(fmt % ("committer", c.name, c.email, + self.committed_date, + utils.altz_to_utctz_str(self.committer_tz_offset))) + + if self.encoding != self.default_encoding: + write("encoding %s\n" % self.encoding) + + write("\n") + write(self.message) + return self def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command @@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # a stream from our data simply gives us the plain message # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] + self.message = stream.read() return self #} END serializable implementation diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 6d378a72..c93f2091 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -16,7 +16,8 @@ import time import os __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', - 'ProcessStreamAdapter', 'Traversable') + 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', + 'verify_utctz') def get_object_type_by_name(object_type_name): """ @@ -57,14 +58,24 @@ def get_user_id(): return "%s@%s" % (username, platform.node()) -def _utc_tz_to_altz(utctz): +def utctz_to_altz(utctz): """we convert utctz to the timezone in seconds, it is the format time.altzone returns. Git stores it as UTC timezon which has the opposite sign as well, which explains the -1 * ( that was made explicit here ) :param utctz: git utc timezone string, i.e. +0200""" return -1 * int(float(utctz)/100*3600) + +def altz_to_utctz_str(altz): + """As above, but inverses the operation, returning a string that can be used + in commit objects""" + utci = -1 * int((altz / 3600)*100) + utcs = str(abs(utci)) + utcs = "0"*(4-len(utcs)) + utcs + prefix = (utci < 0 and '-') or '+' + return prefix + utcs + -def _verify_utctz(offset): +def verify_utctz(offset): """:raise ValueError: if offset is incorrect :return: offset""" fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) @@ -97,11 +108,11 @@ def parse_date(string_date): if string_date.count(' ') == 1 and string_date.rfind(':') == -1: timestamp, offset = string_date.split() timestamp = int(timestamp) - return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + return timestamp, utctz_to_altz(verify_utctz(offset)) else: offset = "+0000" # local time by default if string_date[-5] in '-+': - offset = _verify_utctz(string_date[-5:]) + offset = verify_utctz(string_date[-5:]) string_date = string_date[:-6] # skip space as well # END split timezone info @@ -139,7 +150,7 @@ def parse_date(string_date): fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) - return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + return int(time.mktime(fstruct)), utctz_to_altz(offset) except ValueError: continue # END exception handling @@ -167,7 +178,7 @@ def parse_actor_and_date(line): """ m = _re_actor_epoch.search(line) actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset)) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 94d1cea8..fd340962 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -137,7 +137,7 @@ class DecompressMemMapReader(object): # END handle size # read header - maxb = 8192 + maxb = 512 # should really be enough, cgit uses 8192 I believe self._s = maxb hdr = self.read(maxb) hdrend = hdr.find("\0") @@ -172,20 +172,24 @@ class DecompressMemMapReader(object): # Our performance now depends on StringIO. This way we don't need two large # buffers in peak times, but only one large one in the end which is # the return buffer - if size > self.max_read_size: - sio = StringIO() - while size: - read_size = min(self.max_read_size, size) - data = self.read(read_size) - sio.write(data) - size -= len(data) - if len(data) < read_size: - break - # END data loop - sio.seek(0) - return sio.getvalue() - # END handle maxread + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # # deplete the buffer, then just continue using the decompress object # which has an own buffer. We just need this to transparently parse the # header from the zlib stream diff --git a/lib/git/repo.py b/lib/git/repo.py index f4caa3fb..0bd2249c 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -4,12 +4,6 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -import os -import sys -import re -import gzip -import StringIO - from errors import InvalidGitRepositoryError, NoSuchPathError from cmd import Git from actor import Actor @@ -19,6 +13,15 @@ from objects import * from config import GitConfigParser from remote import Remote +from odb.db import LooseObjectDB + +import os +import sys +import re +import gzip +import StringIO + + def touch(filename): fp = open(filename, "a") fp.close() @@ -53,7 +56,7 @@ class Repo(object): 'git_dir' is the .git repository directoy, which is always set. """ DAEMON_EXPORT_FILE = 'git-daemon-export-ok' - __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" ) + __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" ) # precompiled regex re_whitespace = re.compile(r'\s+') @@ -65,27 +68,22 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None): - """ - Create a new Repo instance - - ``path`` - is the path to either the root git directory or the bare git repo + def __init__(self, path=None, odbt = LooseObjectDB): + """ Create a new Repo instance - Examples:: + :param path: is the path to either the root git directory or the bare git repo:: repo = Repo("/Users/mtrier/Development/git-python") repo = Repo("/Users/mtrier/Development/git-python.git") repo = Repo("~/Development/git-python.git") repo = Repo("$REPOSITORIES/Development/git-python.git") - - Raises - InvalidGitRepositoryError or NoSuchPathError - - Returns - ``git.Repo`` - """ - + + :param odbt: Object DataBase type - a type which is constructed by providing + the directory containing the database objects, i.e. .git/objects. It will + be used to access all object data + :raise InvalidGitRepositoryError: + :raise NoSuchPathError: + :return: git.Repo """ epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd()))) if not os.path.exists(epath): @@ -130,6 +128,7 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) + self.odb = odbt(os.path.join(self.git_dir, 'objects')) def __eq__(self, rhs): if isinstance(rhs, Repo): diff --git a/lib/git/utils.py b/lib/git/utils.py index 360c77c9..60a7de48 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -27,6 +27,21 @@ def make_sha(source=''): sha1 = sha.sha(source) return sha1 +def stream_copy(source, destination, chunk_size=512*1024): + """Copy all data from the source stream into the destination stream in chunks + of size chunk_size + :return: amount of bytes written""" + br = 0 + while True: + chunk = source.read(chunk_size) + destination.write(chunk) + br += len(chunk) + if len(chunk) < chunk_size: + break + # END reading output stream + return br + + def join_path(a, *p): """Join path tokens together similar to os.path.join, but always use '/' instead of possibly '\' on windows.""" |