summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorVicent Marti <tanoku@gmail.com>2011-03-05 23:54:49 +0200
committerVicent Marti <tanoku@gmail.com>2011-03-14 23:36:10 +0200
commit26022f0719f652ae8311abea6f6de92bd4a75a87 (patch)
tree20453bf7a2bf36903ae326742e2a69b30355427d /src
parentb760fbf539e1ce17ac2768da755834babf700b9a (diff)
downloadlibgit2-26022f0719f652ae8311abea6f6de92bd4a75a87.tar.gz
Add `git_oid_shorten` (unique OID minimzer)
Set of methods to find the minimal-length to uniquely identify every OID in a list. Useful for GUI applications, commit logs and so on. Includes stress test. Signed-off-by: Vicent Marti <tanoku@gmail.com>
Diffstat (limited to 'src')
-rw-r--r--src/oid.c178
1 files changed, 178 insertions, 0 deletions
diff --git a/src/oid.c b/src/oid.c
index 698d0f927..81b7d6005 100644
--- a/src/oid.c
+++ b/src/oid.c
@@ -27,6 +27,7 @@
#include "git2/oid.h"
#include "repository.h"
#include <string.h>
+#include <limits.h>
static signed char from_hex[] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 00 */
@@ -166,3 +167,180 @@ int git_oid_cmp(const git_oid *a, const git_oid *b)
{
return memcmp(a->id, b->id, sizeof(a->id));
}
+
+
+typedef short node_index;
+
+typedef union {
+ const char *tail;
+ node_index children[16];
+} trie_node;
+
+struct git_oid_shorten {
+ trie_node *nodes;
+ size_t node_count, size;
+ int min_length, full;
+};
+
+static int resize_trie(git_oid_shorten *self, size_t new_size)
+{
+ self->nodes = realloc(self->nodes, new_size * sizeof(trie_node));
+ if (self->nodes == NULL)
+ return GIT_ENOMEM;
+
+ if (new_size > self->size) {
+ memset(&self->nodes[self->size], 0x0, (new_size - self->size) * sizeof(trie_node));
+ }
+
+ self->size = new_size;
+ return GIT_SUCCESS;
+}
+
+static trie_node *push_leaf(git_oid_shorten *os, node_index idx, int push_at, const char *oid)
+{
+ trie_node *node, *leaf;
+ node_index idx_leaf;
+
+ if (os->node_count >= os->size) {
+ if (resize_trie(os, os->size * 2) < GIT_SUCCESS)
+ return NULL;
+ }
+
+ idx_leaf = (node_index)os->node_count++;
+
+ if (os->node_count == SHRT_MAX)
+ os->full = 1;
+
+ node = &os->nodes[idx];
+ node->children[push_at] = -idx_leaf;
+
+ leaf = &os->nodes[idx_leaf];
+ leaf->tail = oid;
+
+ return node;
+}
+
+git_oid_shorten *git_oid_shorten_new(size_t min_length)
+{
+ git_oid_shorten *os;
+
+ os = git__malloc(sizeof(git_oid_shorten));
+ if (os == NULL)
+ return NULL;
+
+ memset(os, 0x0, sizeof(git_oid_shorten));
+
+ if (resize_trie(os, 16) < GIT_SUCCESS) {
+ free(os);
+ return NULL;
+ }
+
+ os->node_count = 1;
+ os->min_length = min_length;
+
+ return os;
+}
+
+void git_oid_shorten_free(git_oid_shorten *os)
+{
+ free(os->nodes);
+ free(os);
+}
+
+
+/*
+ * What wizardry is this?
+ *
+ * This is just a memory-optimized trie: basically a very fancy
+ * 16-ary tree, which is used to store the prefixes of the OID
+ * strings.
+ *
+ * Read more: http://en.wikipedia.org/wiki/Trie
+ *
+ * Magic that happens in this method:
+ *
+ * - Each node in the trie is an union, so it can work both as
+ * a normal node, or as a leaf.
+ *
+ * - Each normal node points to 16 children (one for each possible
+ * character in the oid). This is *not* stored in an array of
+ * pointers, because in a 64-bit arch this would be sucking
+ * 16*sizeof(void*) = 128 bytes of memory per node, which is fucking
+ * insane. What we do is store Node Indexes, and use these indexes
+ * to look up each node in the om->index array. These indexes are
+ * signed shorts, so this limits the amount of unique OIDs that
+ * fit in the structure to about 20000 (assuming a more or less uniform
+ * distribution).
+ *
+ * - All the nodes in om->index array are stored contiguously in
+ * memory, and each of them is 32 bytes, so we fit 2x nodes per
+ * cache line. Convenient for speed.
+ *
+ * - To differentiate the leafs from the normal nodes, we store all
+ * the indexes towards a leaf as a negative index (indexes to normal
+ * nodes are positives). When we find that one of the children for
+ * a node has a negative value, that means it's going to be a leaf.
+ * This reduces the amount of indexes we have by two, but also reduces
+ * the size of each node by 1-4 bytes (the amount we would need to
+ * add a `is_leaf` field): this is good because it allows the nodes
+ * to fit cleanly in cache lines.
+ *
+ * - Once we reach an empty children, instead of continuing to insert
+ * new nodes for each remaining character of the OID, we store a pointer
+ * to the tail in the leaf; if the leaf is reached again, we turn it
+ * into a normal node and use the tail to create a new leaf.
+ *
+ * This is a pretty good balance between performance and memory usage.
+ */
+int git_oid_shorten_add(git_oid_shorten *os, const char *text_oid)
+{
+ int i, is_leaf;
+ node_index idx;
+
+ if (os->full)
+ return GIT_ENOMEM;
+
+ idx = 0;
+ is_leaf = 0;
+
+ for (i = 0; i < GIT_OID_HEXSZ; ++i) {
+ int c = from_hex[(int)text_oid[i]];
+ trie_node *node;
+
+ if (c == -1)
+ return GIT_ENOTOID;
+
+ node = &os->nodes[idx];
+
+ if (is_leaf) {
+ const char *tail;
+
+ tail = node->tail;
+ node->tail = NULL;
+
+ node = push_leaf(os, idx, from_hex[(int)tail[0]], &tail[1]);
+ if (node == NULL)
+ return GIT_ENOMEM;
+ }
+
+ if (node->children[c] == 0) {
+ if (push_leaf(os, idx, c, &text_oid[i + 1]) == NULL)
+ return GIT_ENOMEM;
+ break;
+ }
+
+ idx = node->children[c];
+ is_leaf = 0;
+
+ if (idx < 0) {
+ node->children[c] = idx = -idx;
+ is_leaf = 1;
+ }
+ }
+
+ if (++i > os->min_length)
+ os->min_length = i;
+
+ return os->min_length;
+}
+