diff options
author | Sebastian Thiel <sebastian.thiel@icloud.com> | 2020-07-13 10:08:37 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-13 10:08:37 +0800 |
commit | 3edd16ca6e217ee35353564cad3aa2920bc0c2e2 (patch) | |
tree | 0f5cd65c1db04255862b8c19f4bf73cab435c4f0 /test/performance | |
parent | 9cb7ae8d9721e1269f5bacd6dbc33ecdec4659c0 (diff) | |
parent | e0b10d965d6377c409ceb40eb47379d79c3fef9f (diff) | |
download | gitpython-3edd16ca6e217ee35353564cad3aa2920bc0c2e2.tar.gz |
Merge pull request #1031 from priv-kweihmann/move-test-2nd
[RFC/WIP] move tests and avoid packaging them
Diffstat (limited to 'test/performance')
-rw-r--r-- | test/performance/__init__.py | 0 | ||||
-rw-r--r-- | test/performance/lib.py | 94 | ||||
-rw-r--r-- | test/performance/test_commit.py | 108 | ||||
-rw-r--r-- | test/performance/test_odb.py | 74 | ||||
-rw-r--r-- | test/performance/test_streams.py | 149 |
5 files changed, 425 insertions, 0 deletions
diff --git a/test/performance/__init__.py b/test/performance/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/test/performance/__init__.py diff --git a/test/performance/lib.py b/test/performance/lib.py new file mode 100644 index 00000000..86f87757 --- /dev/null +++ b/test/performance/lib.py @@ -0,0 +1,94 @@ +"""Contains library functions""" +import logging +import os +import tempfile + +from git import ( + Repo +) +from git.db import ( + GitCmdObjectDB, + GitDB +) +from test.lib import ( + TestBase +) +from git.util import rmtree +import os.path as osp + +#{ Invariants + +k_env_git_repo = "GIT_PYTHON_TEST_GIT_REPO_BASE" + +#} END invariants + + +#{ Base Classes + +class TestBigRepoR(TestBase): + + """TestCase providing access to readonly 'big' repositories using the following + member variables: + + * gitrorepo + + * Read-Only git repository - actually the repo of git itself + + * puregitrorepo + + * As gitrepo, but uses pure python implementation + """ + + #{ Invariants + #} END invariants + + def setUp(self): + try: + super(TestBigRepoR, self).setUp() + except AttributeError: + pass + + repo_path = os.environ.get(k_env_git_repo) + if repo_path is None: + logging.info( + ("You can set the %s environment variable to a .git repository of" % k_env_git_repo) + + "your choice - defaulting to the gitpython repository") + repo_path = osp.dirname(__file__) + # end set some repo path + self.gitrorepo = Repo(repo_path, odbt=GitCmdObjectDB, search_parent_directories=True) + self.puregitrorepo = Repo(repo_path, odbt=GitDB, search_parent_directories=True) + + def tearDown(self): + self.gitrorepo.git.clear_cache() + self.gitrorepo = None + self.puregitrorepo.git.clear_cache() + self.puregitrorepo = None + + +class TestBigRepoRW(TestBigRepoR): + + """As above, but provides a big repository that we can write to. + + Provides ``self.gitrwrepo`` and ``self.puregitrwrepo``""" + + def setUp(self): + self.gitrwrepo = None + try: + super(TestBigRepoRW, self).setUp() + except AttributeError: + pass + dirname = tempfile.mktemp() + os.mkdir(dirname) + self.gitrwrepo = self.gitrorepo.clone(dirname, shared=True, bare=True, odbt=GitCmdObjectDB) + self.puregitrwrepo = Repo(dirname, odbt=GitDB) + + def tearDown(self): + super(TestBigRepoRW, self).tearDown() + if self.gitrwrepo is not None: + rmtree(self.gitrwrepo.working_dir) + self.gitrwrepo.git.clear_cache() + self.gitrwrepo = None + self.puregitrwrepo.git.clear_cache() + self.puregitrwrepo = None + +#} END base classes diff --git a/test/performance/test_commit.py b/test/performance/test_commit.py new file mode 100644 index 00000000..4617b052 --- /dev/null +++ b/test/performance/test_commit.py @@ -0,0 +1,108 @@ +# test_performance.py +# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors +# +# This module is part of GitPython and is released under +# the BSD License: http://www.opensource.org/licenses/bsd-license.php +from __future__ import print_function +from io import BytesIO +from time import time +import sys + +from .lib import TestBigRepoRW +from git import Commit +from gitdb import IStream +from test.test_commit import TestCommitSerialization + + +class TestPerformance(TestBigRepoRW, TestCommitSerialization): + + def tearDown(self): + import gc + gc.collect() + + # ref with about 100 commits in its history + ref_100 = '0.1.6' + + def _query_commit_info(self, c): + c.author + c.authored_date + c.author_tz_offset + c.committer + c.committed_date + c.committer_tz_offset + c.message + c.parents + + def test_iteration(self): + no = 0 + nc = 0 + + # find the first commit containing the given path - always do a full + # iteration ( restricted to the path in question ), but in fact it should + # return quite a lot of commits, we just take one and hence abort the operation + + st = time() + for c in self.rorepo.iter_commits(self.ref_100): + nc += 1 + self._query_commit_info(c) + for obj in c.tree.traverse(): + obj.size + no += 1 + # END for each object + # END for each commit + elapsed_time = time() - st + print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )" + % (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr) + + def test_commit_traversal(self): + # bound to cat-file parsing performance + nc = 0 + st = time() + for c in self.gitrorepo.commit().traverse(branch_first=False): + nc += 1 + self._query_commit_info(c) + # END for each traversed commit + elapsed_time = time() - st + print("Traversed %i Commits in %s [s] ( %f commits/s )" + % (nc, elapsed_time, nc / elapsed_time), file=sys.stderr) + + def test_commit_iteration(self): + # bound to stream parsing performance + nc = 0 + st = time() + for c in Commit.iter_items(self.gitrorepo, self.gitrorepo.head): + nc += 1 + self._query_commit_info(c) + # END for each traversed commit + elapsed_time = time() - st + print("Iterated %i Commits in %s [s] ( %f commits/s )" + % (nc, elapsed_time, nc / elapsed_time), file=sys.stderr) + + def test_commit_serialization(self): + self.assert_commit_serialization(self.gitrwrepo, '58c78e6', True) + + rwrepo = self.gitrwrepo + make_object = rwrepo.odb.store + # direct serialization - deserialization can be tested afterwards + # serialization is probably limited on IO + hc = rwrepo.commit(rwrepo.head) + + nc = 5000 + st = time() + for i in range(nc): + cm = Commit(rwrepo, Commit.NULL_BIN_SHA, hc.tree, + hc.author, hc.authored_date, hc.author_tz_offset, + hc.committer, hc.committed_date, hc.committer_tz_offset, + str(i), parents=hc.parents, encoding=hc.encoding) + + stream = BytesIO() + cm._serialize(stream) + slen = stream.tell() + stream.seek(0) + + cm.binsha = make_object(IStream(Commit.type, slen, stream)).binsha + # END commit creation + elapsed = time() - st + + print("Serialized %i commits to loose objects in %f s ( %f commits / s )" + % (nc, elapsed, nc / elapsed), file=sys.stderr) diff --git a/test/performance/test_odb.py b/test/performance/test_odb.py new file mode 100644 index 00000000..8bd614f2 --- /dev/null +++ b/test/performance/test_odb.py @@ -0,0 +1,74 @@ +"""Performance tests for object store""" +from __future__ import print_function + +import sys +from time import time + +from .lib import ( + TestBigRepoR +) + + +class TestObjDBPerformance(TestBigRepoR): + + def test_random_access(self): + results = [["Iterate Commits"], ["Iterate Blobs"], ["Retrieve Blob Data"]] + for repo in (self.gitrorepo, self.puregitrorepo): + # GET COMMITS + st = time() + root_commit = repo.commit(repo.head) + commits = list(root_commit.traverse()) + nc = len(commits) + elapsed = time() - st + + print("%s: Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" + % (type(repo.odb), nc, elapsed, nc / elapsed), file=sys.stderr) + results[0].append(elapsed) + + # GET TREES + # walk all trees of all commits + st = time() + blobs_per_commit = [] + nt = 0 + for commit in commits: + tree = commit.tree + blobs = [] + for item in tree.traverse(): + nt += 1 + if item.type == 'blob': + blobs.append(item) + # direct access for speed + # END while trees are there for walking + blobs_per_commit.append(blobs) + # END for each commit + elapsed = time() - st + + print("%s: Retrieved %i objects from %i commits in %g s ( %f objects / s )" + % (type(repo.odb), nt, len(commits), elapsed, nt / elapsed), file=sys.stderr) + results[1].append(elapsed) + + # GET BLOBS + st = time() + nb = 0 + too_many = 15000 + data_bytes = 0 + for blob_list in blobs_per_commit: + for blob in blob_list: + data_bytes += len(blob.data_stream.read()) + # END for each blobsha + nb += len(blob_list) + if nb > too_many: + break + # END for each bloblist + elapsed = time() - st + + msg = "%s: Retrieved %i blob (%i KiB) and their data in %g s ( %f blobs / s, %f KiB / s )"\ + % (type(repo.odb), nb, data_bytes / 1000, elapsed, nb / elapsed, (data_bytes / 1000) / elapsed) + print(msg, file=sys.stderr) + results[2].append(elapsed) + # END for each repo type + + # final results + for test_name, a, b in results: + print("%s: %f s vs %f s, pure is %f times slower" % (test_name, a, b, b / a), file=sys.stderr) + # END for each result diff --git a/test/performance/test_streams.py b/test/performance/test_streams.py new file mode 100644 index 00000000..edf32c91 --- /dev/null +++ b/test/performance/test_streams.py @@ -0,0 +1,149 @@ +"""Performance data streaming performance""" +from __future__ import print_function + +import os +import subprocess +import sys +from time import time + +from test.lib import ( + with_rw_repo +) +from git.util import bin_to_hex +from gitdb import ( + LooseObjectDB, + IStream +) +from gitdb.test.lib import make_memory_file + +import os.path as osp + +from .lib import ( + TestBigRepoR +) + + +class TestObjDBPerformance(TestBigRepoR): + + large_data_size_bytes = 1000 * 1000 * 10 # some MiB should do it + moderate_data_size_bytes = 1000 * 1000 * 1 # just 1 MiB + + @with_rw_repo('HEAD', bare=True) + def test_large_data_streaming(self, rwrepo): + # TODO: This part overlaps with the same file in gitdb.test.performance.test_stream + # It should be shared if possible + ldb = LooseObjectDB(osp.join(rwrepo.git_dir, 'objects')) + + for randomize in range(2): + desc = (randomize and 'random ') or '' + print("Creating %s data ..." % desc, file=sys.stderr) + st = time() + size, stream = make_memory_file(self.large_data_size_bytes, randomize) + elapsed = time() - st + print("Done (in %f s)" % elapsed, file=sys.stderr) + + # writing - due to the compression it will seem faster than it is + st = time() + binsha = ldb.store(IStream('blob', size, stream)).binsha + elapsed_add = time() - st + assert ldb.has_object(binsha) + db_file = ldb.readable_db_object_path(bin_to_hex(binsha)) + fsize_kib = osp.getsize(db_file) / 1000 + + size_kib = size / 1000 + msg = "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" + msg %= (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add) + print(msg, file=sys.stderr) + + # reading all at once + st = time() + ostream = ldb.stream(binsha) + shadata = ostream.read() + elapsed_readall = time() - st + + stream.seek(0) + assert shadata == stream.getvalue() + msg = "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" + msg %= (size_kib, desc, elapsed_readall, size_kib / elapsed_readall) + print(msg, file=sys.stderr) + + # reading in chunks of 1 MiB + cs = 512 * 1000 + chunks = [] + st = time() + ostream = ldb.stream(binsha) + while True: + data = ostream.read(cs) + chunks.append(data) + if len(data) < cs: + break + # END read in chunks + elapsed_readchunks = time() - st + + stream.seek(0) + assert b''.join(chunks) == stream.getvalue() + + cs_kib = cs / 1000 + print("Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" + % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks), file=sys.stderr) + + # del db file so git has something to do + ostream = None + import gc + gc.collect() + os.remove(db_file) + + # VS. CGIT + ########## + # CGIT ! Can using the cgit programs be faster ? + proc = rwrepo.git.hash_object('-w', '--stdin', as_process=True, istream=subprocess.PIPE) + + # write file - pump everything in at once to be a fast as possible + data = stream.getvalue() # cache it + st = time() + proc.stdin.write(data) + proc.stdin.close() + gitsha = proc.stdout.read().strip() + proc.wait() + gelapsed_add = time() - st + del(data) + assert gitsha == bin_to_hex(binsha) # we do it the same way, right ? + + # as its the same sha, we reuse our path + fsize_kib = osp.getsize(db_file) / 1000 + msg = "Added %i KiB (filesize = %i KiB) of %s data to using git-hash-object in %f s ( %f Write KiB / s)" + msg %= (size_kib, fsize_kib, desc, gelapsed_add, size_kib / gelapsed_add) + print(msg, file=sys.stderr) + + # compare ... + print("Git-Python is %f %% faster than git when adding big %s files" + % (100.0 - (elapsed_add / gelapsed_add) * 100, desc), file=sys.stderr) + + # read all + st = time() + _hexsha, _typename, size, data = rwrepo.git.get_object_data(gitsha) + gelapsed_readall = time() - st + print("Read %i KiB of %s data at once using git-cat-file in %f s ( %f Read KiB / s)" + % (size_kib, desc, gelapsed_readall, size_kib / gelapsed_readall), file=sys.stderr) + + # compare + print("Git-Python is %f %% faster than git when reading big %sfiles" + % (100.0 - (elapsed_readall / gelapsed_readall) * 100, desc), file=sys.stderr) + + # read chunks + st = time() + _hexsha, _typename, size, stream = rwrepo.git.stream_object_data(gitsha) + while True: + data = stream.read(cs) + if len(data) < cs: + break + # END read stream + gelapsed_readchunks = time() - st + msg = "Read %i KiB of %s data in %i KiB chunks from git-cat-file in %f s ( %f Read KiB / s)" + msg %= (size_kib, desc, cs_kib, gelapsed_readchunks, size_kib / gelapsed_readchunks) + print(msg, file=sys.stderr) + + # compare + print("Git-Python is %f %% faster than git when reading big %s files in chunks" + % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc), file=sys.stderr) + # END for each randomization factor |