author | Sebastian Thiel <byronimo@gmail.com> | 2011-05-05 19:43:22 +0200
---|---|---
committer | Sebastian Thiel <byronimo@gmail.com> | 2011-05-05 19:43:22 +0200
commit | 4177eefd7bdaea96a529b00ba9cf751924ede202 (patch) |
tree | 958614c21bd97267e0d06f71bb18d4215ddd87b5 | /git/test/performance
parent | f54546a9b857ae728033482f3c5c18c9ff3393c3 (diff) |
download | gitpython-4177eefd7bdaea96a529b00ba9cf751924ede202.tar.gz |
Added all code from gitdb to gitpython. Next is to make it generally work. Then the tests will need some work
Diffstat (limited to 'git/test/performance')
-rw-r--r-- | git/test/performance/test_pack.py | 90
-rw-r--r-- | git/test/performance/test_pack_streaming.py | 80
-rw-r--r-- | git/test/performance/test_streams.py | 165
3 files changed, 335 insertions, 0 deletions
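
All three new test modules use the same measurement pattern: take a wall-clock timestamp, run a batch of operations against the object database, then report throughput as count / elapsed on stderr. A minimal, hypothetical sketch of that pattern is below (Python 3 style for readability; the patched files themselves use Python 2 idioms, and names such as `do_lookup` are placeholders, not part of the patch):

```python
import sys
import time

def measure_throughput(operation, items, label):
    """Time `operation` over `items` and print a throughput summary,
    mirroring the reporting style of the performance tests in this patch."""
    start = time.time()
    for item in items:
        operation(item)
    elapsed = time.time() - start
    count = len(items)
    print("%s: processed %i items in %f s ( %f items/s )"
          % (label, count, elapsed, count / max(elapsed, 1e-9)),
          file=sys.stderr)

# Hypothetical usage against a packed object database:
# measure_throughput(pdb.stream, sha_list[:5000], "PDB")
```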
diff --git a/git/test/performance/test_pack.py b/git/test/performance/test_pack.py
new file mode 100644
index 00000000..da952b17
--- /dev/null
+++ b/git/test/performance/test_pack.py
@@ -0,0 +1,90 @@
+# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
+#
+# This module is part of GitDB and is released under
+# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
+"""Performance tests for object store"""
+from lib import (
+    TestBigRepoR
+    )
+
+from gitdb.exc import UnsupportedOperation
+from gitdb.db.pack import PackedDB
+
+import sys
+import os
+from time import time
+import random
+
+class TestPackedDBPerformance(TestBigRepoR):
+
+    def _test_pack_random_access(self):
+        pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
+
+        # sha lookup
+        st = time()
+        sha_list = list(pdb.sha_iter())
+        elapsed = time() - st
+        ns = len(sha_list)
+        print >> sys.stderr, "PDB: looked up %i shas by index in %f s ( %f shas/s )" % (ns, elapsed, ns / elapsed)
+
+        # sha lookup: best-case and worst case access
+        pdb_pack_info = pdb._pack_info
+        # END shuffle shas
+        st = time()
+        for sha in sha_list:
+            pdb_pack_info(sha)
+        # END for each sha to look up
+        elapsed = time() - st
+
+        # discard cache
+        del(pdb._entities)
+        pdb.entities()
+        print >> sys.stderr, "PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" % (ns, len(pdb.entities()), elapsed, ns / elapsed)
+        # END for each random mode
+
+        # query info and streams only
+        max_items = 10000           # can wait longer when testing memory
+        for pdb_fun in (pdb.info, pdb.stream):
+            st = time()
+            for sha in sha_list[:max_items]:
+                pdb_fun(sha)
+            elapsed = time() - st
+            print >> sys.stderr, "PDB: Obtained %i object %s by sha in %f s ( %f items/s )" % (max_items, pdb_fun.__name__.upper(), elapsed, max_items / elapsed)
+        # END for each function
+
+        # retrieve stream and read all
+        max_items = 5000
+        pdb_stream = pdb.stream
+        total_size = 0
+        st = time()
+        for sha in sha_list[:max_items]:
+            stream = pdb_stream(sha)
+            stream.read()
+            total_size += stream.size
+        elapsed = time() - st
+        total_kib = total_size / 1000
+        print >> sys.stderr, "PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (max_items, total_kib, total_kib/elapsed , elapsed, max_items / elapsed)
+
+    def test_correctness(self):
+        pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
+        # disabled for now as it used to work perfectly, checking big repositories takes a long time
+        print >> sys.stderr, "Endurance run: verify streaming of objects (crc and sha)"
+        for crc in range(2):
+            count = 0
+            st = time()
+            for entity in pdb.entities():
+                pack_verify = entity.is_valid_stream
+                sha_by_index = entity.index().sha
+                for index in xrange(entity.index().size()):
+                    try:
+                        assert pack_verify(sha_by_index(index), use_crc=crc)
+                        count += 1
+                    except UnsupportedOperation:
+                        pass
+                    # END ignore old indices
+                # END for each index
+            # END for each entity
+            elapsed = time() - st
+            print >> sys.stderr, "PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )" % (count, crc, elapsed, count / elapsed)
+        # END for each verify mode
+
diff --git a/git/test/performance/test_pack_streaming.py b/git/test/performance/test_pack_streaming.py
new file mode 100644
index 00000000..795ed1e2
--- /dev/null
+++ b/git/test/performance/test_pack_streaming.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
+#
+# This module is part of GitDB and is released under
+# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
+"""Specific test for pack streams only"""
+from lib import (
+    TestBigRepoR
+    )
+
+from gitdb.db.pack import PackedDB
+from gitdb.stream import NullStream
+from gitdb.pack import PackEntity
+
+import os
+import sys
+from time import time
+from nose import SkipTest
+
+class CountedNullStream(NullStream):
+    __slots__ = '_bw'
+    def __init__(self):
+        self._bw = 0
+
+    def bytes_written(self):
+        return self._bw
+
+    def write(self, d):
+        self._bw += NullStream.write(self, d)
+
+
+class TestPackStreamingPerformance(TestBigRepoR):
+
+    def test_pack_writing(self):
+        # see how fast we can write a pack from object streams.
+        # This will not be fast, as we take time for decompressing the streams as well
+        ostream = CountedNullStream()
+        pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
+
+        ni = 5000
+        count = 0
+        total_size = 0
+        st = time()
+        objs = list()
+        for sha in pdb.sha_iter():
+            count += 1
+            objs.append(pdb.stream(sha))
+            if count == ni:
+                break
+        #END gather objects for pack-writing
+        elapsed = time() - st
+        print >> sys.stderr, "PDB Streaming: Got %i streams by sha in in %f s ( %f streams/s )" % (ni, elapsed, ni / elapsed)
+
+        st = time()
+        PackEntity.write_pack(objs, ostream.write)
+        elapsed = time() - st
+        total_kb = ostream.bytes_written() / 1000
+        print >> sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % (total_kb, elapsed, total_kb/elapsed)
+
+
+    def test_stream_reading(self):
+        raise SkipTest()
+        pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
+
+        # streaming only, meant for --with-profile runs
+        ni = 5000
+        count = 0
+        pdb_stream = pdb.stream
+        total_size = 0
+        st = time()
+        for sha in pdb.sha_iter():
+            if count == ni:
+                break
+            stream = pdb_stream(sha)
+            stream.read()
+            total_size += stream.size
+            count += 1
+        elapsed = time() - st
+        total_kib = total_size / 1000
+        print >> sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (ni, total_kib, total_kib/elapsed , elapsed, ni / elapsed)
+
diff --git a/git/test/performance/test_streams.py b/git/test/performance/test_streams.py
index 7f17d722..196e9003 100644
--- a/git/test/performance/test_streams.py
+++ b/git/test/performance/test_streams.py
@@ -1,9 +1,17 @@
 """Performance data streaming performance"""
+from gitdb.db.py import *
+from gitdb.base import *
+from gitdb.stream import *
+from gitdb.util import (
+    pool,
+    bin_to_hex
+    )
 from git.test.lib import *
 from gitdb import *
 from gitdb.util import bin_to_hex
+from cStringIO import StringIO
 
 from time import time
 import os
 import sys
@@ -14,9 +22,35 @@ from gitdb.test.lib import make_memory_file
 from lib import (
     TestBigRepoR
+    make_memory_file,
+    with_rw_directory
     )
 
+
+#{ Utilities
+def read_chunked_stream(stream):
+    total = 0
+    while True:
+        chunk = stream.read(chunk_size)
+        total += len(chunk)
+        if len(chunk) < chunk_size:
+            break
+    # END read stream loop
+    assert total == stream.size
+    return stream
+
+
+class TestStreamReader(ChannelThreadTask):
+    """Expects input streams and reads them in chunks. It will read one at a time,
+    requireing a queue chunk of size 1"""
+    def __init__(self, *args):
+        super(TestStreamReader, self).__init__(*args)
+        self.fun = read_chunked_stream
+        self.max_chunksize = 1
+
+
+#} END utilities
+
 class TestObjDBPerformance(TestBigRepoR):
 
     large_data_size_bytes = 1000*1000*10    # some MiB should do it
@@ -129,3 +163,134 @@ class TestObjDBPerformance(TestBigRepoR):
             # compare
             print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files in chunks" % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc)
         # END for each randomization factor
+
+    @with_rw_directory
+    def test_large_data_streaming(self, path):
+        ldb = PureLooseObjectODB(path)
+        string_ios = list()         # list of streams we previously created
+
+        # serial mode
+        for randomize in range(2):
+            desc = (randomize and 'random ') or ''
+            print >> sys.stderr, "Creating %s data ..." % desc
+            st = time()
+            size, stream = make_memory_file(self.large_data_size_bytes, randomize)
+            elapsed = time() - st
+            print >> sys.stderr, "Done (in %f s)" % elapsed
+            string_ios.append(stream)
+
+            # writing - due to the compression it will seem faster than it is
+            st = time()
+            sha = ldb.store(IStream('blob', size, stream)).binsha
+            elapsed_add = time() - st
+            assert ldb.has_object(sha)
+            db_file = ldb.readable_db_object_path(bin_to_hex(sha))
+            fsize_kib = os.path.getsize(db_file) / 1000
+
+
+            size_kib = size / 1000
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add)
+
+            # reading all at once
+            st = time()
+            ostream = ldb.stream(sha)
+            shadata = ostream.read()
+            elapsed_readall = time() - st
+
+            stream.seek(0)
+            assert shadata == stream.getvalue()
+            print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed_readall, size_kib / elapsed_readall)
+
+
+            # reading in chunks of 1 MiB
+            cs = 512*1000
+            chunks = list()
+            st = time()
+            ostream = ldb.stream(sha)
+            while True:
+                data = ostream.read(cs)
+                chunks.append(data)
+                if len(data) < cs:
+                    break
+            # END read in chunks
+            elapsed_readchunks = time() - st
+
+            stream.seek(0)
+            assert ''.join(chunks) == stream.getvalue()
+
+            cs_kib = cs / 1000
+            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks)
+
+            # del db file so we keep something to do
+            os.remove(db_file)
+        # END for each randomization factor
+
+
+        # multi-threaded mode
+        # want two, should be supported by most of todays cpus
+        pool.set_size(2)
+        total_kib = 0
+        nsios = len(string_ios)
+        for stream in string_ios:
+            stream.seek(0)
+            total_kib += len(stream.getvalue()) / 1000
+        # END rewind
+
+        def istream_iter():
+            for stream in string_ios:
+                stream.seek(0)
+                yield IStream(str_blob_type, len(stream.getvalue()), stream)
+            # END for each stream
+        # END util
+
+        # write multiple objects at once, involving concurrent compression
+        reader = IteratorReader(istream_iter())
+        istream_reader = ldb.store_async(reader)
+        istream_reader.task().max_chunksize = 1
+
+        st = time()
+        istreams = istream_reader.read(nsios)
+        assert len(istreams) == nsios
+        elapsed = time() - st
+
+        print >> sys.stderr, "Threads(%i): Compressed %i KiB of data in loose odb in %f s ( %f Write KiB / s)" % (pool.size(), total_kib, elapsed, total_kib / elapsed)
+
+        # decompress multiple at once, by reading them
+        # chunk size is not important as the stream will not really be decompressed
+        # until its read
+        istream_reader = IteratorReader(iter([ i.binsha for i in istreams ]))
+        ostream_reader = ldb.stream_async(istream_reader)
+
+        chunk_task = TestStreamReader(ostream_reader, "chunker", None)
+        output_reader = pool.add_task(chunk_task)
+        output_reader.task().max_chunksize = 1
+
+        st = time()
+        assert len(output_reader.read(nsios)) == nsios
+        elapsed = time() - st
+
+        print >> sys.stderr, "Threads(%i): Decompressed %i KiB of data in loose odb in %f s ( %f Read KiB / s)" % (pool.size(), total_kib, elapsed, total_kib / elapsed)
+
+        # store the files, and read them back. For the reading, we use a task
+        # as well which is chunked into one item per task. Reading all will
+        # very quickly result in two threads handling two bytestreams of
+        # chained compression/decompression streams
+        reader = IteratorReader(istream_iter())
+        istream_reader = ldb.store_async(reader)
+        istream_reader.task().max_chunksize = 1
+
+        istream_to_sha = lambda items: [ i.binsha for i in items ]
+        istream_reader.set_post_cb(istream_to_sha)
+
+        ostream_reader = ldb.stream_async(istream_reader)
+
+        chunk_task = TestStreamReader(ostream_reader, "chunker", None)
+        output_reader = pool.add_task(chunk_task)
+        output_reader.max_chunksize = 1
+
+        st = time()
+        assert len(output_reader.read(nsios)) == nsios
+        elapsed = time() - st
+
+        print >> sys.stderr, "Threads(%i): Compressed and decompressed and read %i KiB of data in loose odb in %f s ( %f Combined KiB / s)" % (pool.size(), total_kib, elapsed, total_kib / elapsed)
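
The `read_chunked_stream` helper added to test_streams.py reads a stream in fixed-size chunks until a short read signals exhaustion, then checks the total against the stream's declared size. A stdlib-only sketch of the same idea follows; the `CHUNK_SIZE` constant and the in-memory stream are assumptions for illustration, whereas the original helper takes its chunk size from the test library and operates on object-database streams:

```python
import io

CHUNK_SIZE = 512 * 1000  # assumed chunk size; the patched test uses a value from its test lib

def read_in_chunks(stream, chunk_size=CHUNK_SIZE):
    """Read `stream` to exhaustion in fixed-size chunks and return the total byte count."""
    total = 0
    while True:
        chunk = stream.read(chunk_size)
        total += len(chunk)
        if len(chunk) < chunk_size:  # a short read means the stream is exhausted
            break
    return total

# Usage example with an in-memory stream standing in for an odb object stream:
data = io.BytesIO(b"x" * (3 * CHUNK_SIZE + 17))
assert read_in_chunks(data) == 3 * CHUNK_SIZE + 17
```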