Fix LSM index searches.

The main issue was LSM search_near was not always returning the closest key to the search key, which calling code expects. It now tries hard to find the smallest cursor larger than the search key, and only if no larger record exists does it return the largest record smaller than the search key. While in the area, clean up the index search code and make it slightly more efficient.
author: Michael Cahill <michael.cahill@wiredtiger.com> 2012-10-19 14:45:07 +1100
committer: Michael Cahill <michael.cahill@wiredtiger.com> 2012-10-19 14:45:07 +1100
commit: a064c45bfc945152dc178d5c3a35889f87eddc9e (patch)
tree: f7d71cf1815d855021b649e0f493128faefde877
parent: 9eb9313ee2594bc1208511b9416e5c52cd0d75af (diff)
download: mongo-a064c45bfc945152dc178d5c3a35889f87eddc9e.tar.gz
2 files changed, 42 insertions, 25 deletions
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 52f9f01e26c..a3a34105e8c 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -191,31 +191,22 @@ __curindex_reset(WT_CURSOR *cursor)
 static int
 __curindex_search(WT_CURSOR *cursor)
 {
+	WT_CURSOR *child;
 	WT_CURSOR_INDEX *cindex;
 	WT_DECL_RET;
-	WT_ITEM *oldkeyp;
 	WT_SESSION_IMPL *session;
 	int exact;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
+	child = cindex->child;
 	CURSOR_API_CALL(cursor, session, search, NULL);
 
 	/*
 	 * We expect partial matches, but we want the smallest item that
 	 * matches the prefix.  Fail if there is no matching item.
 	 */
-
-	/*
-	 * Take a copy of the search key.
-	 * XXX
-	 * We can avoid this with a cursor flag indicating when the application
-	 * owns the data.
-	 */
-	WT_ERR(__wt_scr_alloc(session, cursor->key.size, &oldkeyp));
-	memcpy(oldkeyp->mem, cursor->key.data, cursor->key.size);
-	oldkeyp->size = cursor->key.size;
-
-	WT_ERR(cursor->search_near(cursor, &exact));
+	__wt_cursor_set_raw_key(child, &cursor->key);
+	WT_ERR(child->search_near(child, &exact));
 
 	/*
 	 * We expect partial matches, and want the smallest record with a key
@@ -225,10 +216,10 @@ __curindex_search(WT_CURSOR *cursor)
 	 * but we don't disallow that (odd) case.
 	 */
 	if (exact < 0)
-		WT_ERR(cursor->next(cursor));
+		WT_ERR(child->next(child));
 
-	if (cursor->key.size < oldkeyp->size ||
-	    memcmp(oldkeyp->data, cursor->key.data, oldkeyp->size) != 0) {
+	if (child->key.size < cursor->key.size ||
+	    memcmp(child->key.data, cursor->key.data, cursor->key.size) != 0) {
 		ret = WT_NOTFOUND;
 		goto err;
 	}
@@ -238,7 +229,6 @@ __curindex_search(WT_CURSOR *cursor)
 	if (0) {
 err:		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
 	}
-	__wt_scr_free(&oldkeyp);
 	API_END(session);
 
 	return (ret);
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index cc10837cc81..a7897ba5c6a 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -236,10 +236,10 @@ __clsm_get_current(
 		if (!F_ISSET(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET))
 			continue;
 		if (current == NULL) {
-			cmp = (smallest ? -1 : 1);
-		} else
-			WT_RET(WT_LSM_CURCMP(session,
-			    clsm->lsm_tree, c, current, cmp));
+			current = c;
+			continue;
+		}
+		WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp));
 		if (smallest ? cmp < 0 : cmp > 0) {
 			current = c;
 			multiple = 0;
@@ -510,6 +510,7 @@ __clsm_search(WT_CURSOR *cursor)
 
 	WT_LSM_ENTER(clsm, cursor, session, search);
 	WT_CURSOR_NEEDKEY(cursor);
+	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
 	FORALL_CURSORS(clsm, c, i) {
 		/* If there is a Bloom filter, see if we can skip the read. */
 		if ((bloom = clsm->blooms[i]) != NULL) {
@@ -568,6 +569,7 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
 
 	WT_LSM_ENTER(clsm, cursor, session, search_near);
 	WT_CURSOR_NEEDKEY(cursor);
+	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
 
 	/*
 	 * search_near is somewhat fiddly: we can't just return a nearby key
@@ -599,6 +601,21 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
 		}
 
 		/*
+		 * Prefer larger cursors.  There are two reasons: (1) we expect
+		 * prefix searches to be a common case (as in our own indices);
+		 * and (2) we need a way to unambiguously know we have the
+		 * "closest" result.
+		 */
+		if (cmp < 0) {
+			if ((ret = c->next(c)) == 0)
+				cmp = 1;
+			else if (ret == WT_NOTFOUND)
+				ret = c->prev(c);
+			if (ret != 0)
+				goto err;
+		}
+
+		/*
 		 * If we land on a deleted item, try going forwards or
 		 * backwards to find one that isn't deleted.
 		 */
@@ -616,6 +633,16 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
 		WT_ERR_NOTFOUND_OK(ret);
 		if (deleted)
 			continue;
+
+		/*
+		 * We are trying to find the smallest cursor greater than the
+		 * search key, or, if there is no larger key, the largest
+		 * cursor smaller than the search key.
+		 *
+		 * It could happen that one cursor contains both of the closest
+		 * records.  In that case, we will track it in "larger", and it
+		 * will be the one we finally choose.
+		 */
 		if (cmp > 0) {
 			if (larger == NULL)
 				larger = c;
@@ -637,12 +664,12 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
 		}
 	}
 
-	if (smaller != NULL) {
-		clsm->current = smaller;
-		*exactp = -1;
-	} else if (larger != NULL) {
+	if (larger != NULL) {
 		clsm->current = larger;
 		*exactp = 1;
+	} else if (smaller != NULL) {
+		clsm->current = smaller;
+		*exactp = -1;
 	} else
 		ret = WT_NOTFOUND;
author	Michael Cahill <michael.cahill@wiredtiger.com>	2012-10-19 14:45:07 +1100
committer	Michael Cahill <michael.cahill@wiredtiger.com>	2012-10-19 14:45:07 +1100
commit	a064c45bfc945152dc178d5c3a35889f87eddc9e (patch)
tree	f7d71cf1815d855021b649e0f493128faefde877
parent	9eb9313ee2594bc1208511b9416e5c52cd0d75af (diff)
download	mongo-a064c45bfc945152dc178d5c3a35889f87eddc9e.tar.gz