SERVER-15576 Floating-point inaccuracy when unhashing GeoHash to box

author: Siyuan Zhou <siyuan.zhou@mongodb.com> 2014-10-14 19:41:33 -0400
committer: Siyuan Zhou <siyuan.zhou@mongodb.com> 2014-10-17 14:23:34 -0400
commit: 4e10def7b1ea9da02a5218d592073ff2587544cd (patch)
tree: db0d4bdd33ce109b61cce425e6f6fc020600f265 /src/mongo/db/geo
parent: cd82c0d6739550c04b75e41742433bf65ff7826a (diff)
download: mongo-4e10def7b1ea9da02a5218d592073ff2587544cd.tar.gz
5 files changed, 357 insertions, 41 deletions
diff --git a/src/mongo/db/geo/hash.cpp b/src/mongo/db/geo/hash.cpp
index 57996e0ba1e..ea481e14e14 100644
--- a/src/mongo/db/geo/hash.cpp
+++ b/src/mongo/db/geo/hash.cpp
@@ -32,6 +32,8 @@
 #include "mongo/db/geo/shapes.h"
 #include "mongo/util/mongoutils/str.h"
 
+#include <algorithm> // for max()
+
 // So we can get at the str namespace.
 using namespace mongoutils;
 
@@ -498,6 +500,21 @@ namespace mongo {
     static BSONField<double> maxField("max", 180.0);
     static BSONField<double> minField("min", -180.0);
 
+    //      a   x     b
+    //      |   |     |
+    // -----|---o-----|---------|--   "|" is a representable double number.
+    //
+    // In the above figure, b is the next representable double number after a, so
+    // |a - b|/|a| = epsilon (ULP) ~= 2.22E-16.
+    //
+    // An exact number x will be represented as the nearest representable double, which is a.
+    // |x - a|/|a| <= 0.5 ULP ~= 1.11e-16
+    //
+    // IEEE floating-point operations have a maximum error of 0.5 ULPS (units in
+    // the last place).  For double-precision numbers, this works out to 2**-53
+    // (about 1.11e-16) times the magnitude of the result.
+    double const GeoHashConverter::kMachinePrecision = 0.5 * std::numeric_limits<double>::epsilon();
+
     Status GeoHashConverter::parseParameters(const BSONObj& paramDoc,
                                              GeoHashConverter::Parameters* params) {
 
@@ -556,6 +573,9 @@ namespace mongo {
 
         // Error in radians
         _errorSphere = deg2rad(_error);
+
+        // 8 * max(|max|, |min|) * u
+        _errorUnhashToBox = calcUnhashToBoxError(_params);
     }
 
     double GeoHashConverter::distanceBetweenHashes(const GeoHash& a, const GeoHash& b) const {
@@ -672,16 +692,25 @@ namespace mongo {
         *y = convertFromHashScale(b);
     }
 
-    Box GeoHashConverter::unhashToBox(const GeoHash &h) const {
-        double sizeEdgeBox = sizeEdge(h);
+    Box GeoHashConverter::unhashToBoxCovering(const GeoHash &h) const {
+        if (h.getBits() == 0) {
+            // Return the result without any error.
+            return Box(Point(_params.min, _params.min), Point(_params.max, _params.max));
+        }
+
+        double sizeEdgeBox = sizeEdge(h.getBits());
         Point min(unhashToPoint(h));
         Point max(min.x + sizeEdgeBox, min.y + sizeEdgeBox);
+
+        // Expand the box by the error bound
         Box box(min, max);
+        box.fudge(_errorUnhashToBox);
         return box;
     }
 
-    Box GeoHashConverter::unhashToBox(const BSONElement &e) const {
-        return unhashToBox(hash(e));
+    double GeoHashConverter::calcUnhashToBoxError(const GeoHashConverter::Parameters& params) {
+        return std::max(fabs(params.min), fabs(params.max))
+                    * GeoHashConverter::kMachinePrecision* 8;
     }
 
     double GeoHashConverter::sizeOfDiag(const GeoHash& a) const {
@@ -690,34 +719,29 @@ namespace mongo {
         return distanceBetweenHashes(a, b);
     }
 
-    // TODO: Use ldexp(max - min, -level)
-    double GeoHashConverter::sizeEdge(const GeoHash& a) const {
-        if (!a.constrains())
-            return _params.max - _params.min;
-
-        double ax, ay, bx, by;
-        GeoHash b = a;
-        b.move(1, 1);
-        unhash(a, &ax, &ay);
-        unhash(b, &bx, &by);
-
-        // _min and _max are a singularity
-        if (bx == _params.min)
-            bx = _params.max;
 
-        return fabs(ax - bx);
+    // Relative error = epsilon_(max-min). ldexp() is just a direct translation to
+    // floating point exponent, and should be exact.
+    double GeoHashConverter::sizeEdge(unsigned level) const {
+        invariant(level >= 0);
+        invariant((int)level <= _params.bits);
+        return ldexp(_params.max - _params.min, -level);
     }
 
-    // Convert from an unsigned in [0, (max-min)*scaling] to [min, max]
-    double GeoHashConverter::convertFromHashScale(unsigned in) const {
-        double x = in;
+    // Convert from a double in [0, (max-min)*scaling] to [min, max]
+    double GeoHashConverter::convertDoubleFromHashScale(double x) const {
         x /= _params.scaling;
         x += _params.min;
         return x;
     }
 
-    // Convert from a double that is [min, max] to an unsigned in [0, (max-min)*scaling]
-    unsigned GeoHashConverter::convertToHashScale(double in) const {
+    // Convert from an unsigned in [0, (max-min)*scaling] to [min, max]
+    double GeoHashConverter::convertFromHashScale(unsigned in) const {
+        return convertDoubleFromHashScale((double)in);
+    }
+
+    // Convert from a double that is [min, max] to a double in [0, (max-min)*scaling]
+    double GeoHashConverter::convertToDoubleHashScale(double in) const {
         verify(in <= _params.max && in >= _params.min);
 
         if (in == _params.max) {
@@ -728,6 +752,13 @@ namespace mongo {
 
         in -= _params.min;
         verify(in >= 0);
-        return static_cast<unsigned>(in * _params.scaling);
+        return in * _params.scaling;
     }
+
+    // Convert from a double that is [min, max] to an unsigned in [0, (max-min)*scaling]
+    unsigned GeoHashConverter::convertToHashScale(double in) const {
+        return static_cast<unsigned>(convertToDoubleHashScale(in));
+    }
+
+
 }  // namespace mongo
diff --git a/src/mongo/db/geo/hash.h b/src/mongo/db/geo/hash.h
index 57d2083c430..dbd357983e9 100644
--- a/src/mongo/db/geo/hash.h
+++ b/src/mongo/db/geo/hash.h
@@ -155,6 +155,8 @@ namespace mongo {
      */
     class GeoHashConverter {
     public:
+        static double const kMachinePrecision; // = 1.1e-16
+
         struct Parameters {
             // How many bits to use for the hash?
             int bits;
@@ -172,6 +174,8 @@ namespace mongo {
          */
         static Status parseParameters(const BSONObj& paramDoc, Parameters* params);
 
+        static double calcUnhashToBoxError(const GeoHashConverter::Parameters& params);
+
         /**
          * Return converter parameterss which can be used to
          * construct an copy of this converter.
@@ -215,17 +219,18 @@ namespace mongo {
         void unhash(const GeoHash &h, double *x, double *y) const;
 
         /**
-         * Generates bounding box from geo hash using converter.
-         * Used in GeoBrowse::fillStack and db/query/explain_plan.cpp
-         * to generate index bounds from
-         * geo hashes in plan stats.
+         * Generates bounding box from geohash, expanded by the error bound
          */
-        Box unhashToBox(const GeoHash &h) const;
-        Box unhashToBox(const BSONElement &e) const;
+        Box unhashToBoxCovering(const GeoHash &h) const;
 
         double sizeOfDiag(const GeoHash& a) const;
-        // XXX: understand/clean this.
-        double sizeEdge(const GeoHash& a) const;
+
+        // Return the sizeEdge of a cell at a given level.
+        double sizeEdge(unsigned level) const;
+
+        // Used by test.
+        double convertDoubleFromHashScale(double in) const;
+        double convertToDoubleHashScale(double in) const;
     private:
 
         void init();
@@ -240,5 +245,9 @@ namespace mongo {
         // We compute these based on the _params:
         double _error;
         double _errorSphere;
+
+        // Error bound of unhashToBox, see hash_test.cpp for its proof.
+        // 8 * max(|max|, |min|) * u
+        double _errorUnhashToBox;
     };
 }  // namespace mongo
diff --git a/src/mongo/db/geo/hash_test.cpp b/src/mongo/db/geo/hash_test.cpp
index dc9be31fe97..7534a2efc04 100644
--- a/src/mongo/db/geo/hash_test.cpp
+++ b/src/mongo/db/geo/hash_test.cpp
@@ -32,9 +32,12 @@
 
 #include <string>
 #include <sstream>
+#include <iomanip>
 #include <cmath>
+#include <algorithm> // For max()
 
 #include "mongo/db/geo/hash.h"
+#include "mongo/db/geo/shapes.h"
 #include "mongo/platform/random.h"
 #include "mongo/unittest/unittest.h"
 #include "mongo/util/assert_util.h"
@@ -97,10 +100,280 @@ namespace {
 
         GeoHashConverter converter(params);
 
-        ASSERT_APPROX_EQUAL(100.0, converter.sizeEdge(GeoHash(0, 0, 0)), kError);
-        ASSERT_APPROX_EQUAL(50.0, converter.sizeEdge(GeoHash(0, 0, 1)), kError);
-        ASSERT_APPROX_EQUAL(25.0, converter.sizeEdge(GeoHash(0, 0, 2)), kError);
-        ASSERT_APPROX_EQUAL(ldexp(100.0, -26), converter.sizeEdge(GeoHash(0, 0, 26)), kError);
-        ASSERT_APPROX_EQUAL(ldexp(100.0, -32), converter.sizeEdge(GeoHash(0, 0, 32)), kError);
+        ASSERT_APPROX_EQUAL(100.0, converter.sizeEdge(0), kError);
+        ASSERT_APPROX_EQUAL(50.0, converter.sizeEdge(1), kError);
+        ASSERT_APPROX_EQUAL(25.0, converter.sizeEdge(2), kError);
+    }
+
+    /**
+     * ==========================
+     * Error Bound of UnhashToBox
+     * ==========================
+     *
+     * Compute the absolute error when unhashing a GeoHash to a box, so that expanding
+     * the box by this absolute error can guarantee a point is always contained by the box
+     * of its GeoHash. Thus, the absolute error of box should consist of 3 components:
+     *
+     * 1) The error introduced by hashing x to GeoHash. The extreme example would be a point
+     * close to the boundary of a cell is hashed to an adjacent box.
+     *
+     * For a hash/unhash functions h(x)/uh(x) and computed functions h'(x),uh'(x):
+     *
+     *          x  uh(h'(x))
+     * |--------|----|--------------------> min-max scale
+     * min       \
+     *            \
+     *             \
+     *              \
+     * |--------|--|-|--------------------> hash scale for cells c
+     * 0      h(x) c h'(x)
+     *
+     * 2) The error introduced by unhashing an (int) GeoHash to its lower left corner in x-y
+     * space.
+     *
+     *            uh(c)
+     *          x  |   uh'(c)
+     * |--------|--|----|-----------------> min-max scale
+     * min       \     /
+     *            \   /
+     *             \ /
+     *              X
+     * |--------|--|-|--------------------> hash scale for cells c
+     * 0      h(x) c h'(x)
+     *
+     * 3) The error introduced by adding the edge length to get the top-right corner of box.
+     * Instead of directly computing uh'(c+1), we add the computed box edge length to the computed
+     * value uh(c), giving us an extra error.
+     *
+     *               |edge(min,max)|
+     *               |             |
+     *               |         uh(c)+edge
+     *              uh(c)          |
+     * |-------------|------[uh(c)+edge']-----------> min-max scale
+     * min
+     *
+     * |-------------|-------------|----------------> hash scale
+     * 0             c            c+1
+     * Hash and unhash definitions
+     * -------------------------
+     * h(x) = (x - min) * scaling = 2^32 * (x - min) / (max - min)
+     * uh(h) = h / scaling + min,
+     * where
+     * scaling = 2^32 / (max - min)
+     *
+     * Again, h(x)/uh(x) are the exact hash functions and h'(x)/uh'(x) are the computational hash
+     * functions which have small rounding errors.
+     *
+     * | h'(x) - h(x) | == | delta_h(x; max, min) |
+     * where delta_fn = the absolute difference between the computed and actual value of a
+     * function.
+     *
+     * Restating the problem, we're looking for:
+     * |delta_box| = | delta_x_{h'(x)=H} + delta_uh(h) + delta_edge_length |
+     *            <= | delta_x_{h'(x)=H} | + | delta_uh(h) | + | delta_edge_length |
+     *
+     * 1. Error bounds calculation
+     * ---------------------------
+     *
+     * 1.1 Error: | delta_x_{h'(x)=H} |
+     * --------------------------------
+     * The first error | delta_x_{h'(x)=H} | means, given GeoHash H, we can find
+     * the range of x and only the range of x that may be mapped to H.
+     * In other words, given H, for any x that is far enough from uh(H) by at least d,
+     * it is impossible for x to be mapped to H.
+     * Mathematical, find d, such that for any x satisfying |x - uh(H)| > d,
+     *    |h(x) - H| >= | delta_h(x) |
+     * => |h(x) - H| - | delta_h(x) | >= 0
+     * => |h(x) - H + delta_h(x) | >= 0         (|a + b| >= |a| - |b|)
+     * => |h'(x) - H| >= 0                      (h'(x) = h(x) + delta_h(x))
+     * which guarantees h'(x) != H.
+     *
+     *
+     *          uh(H)-d
+     *              |
+     *          x   |  uh(H)
+     * |--------|---[----|----]-----------> min-max scale
+     * min     / \   \       /
+     *        /   \   \     /
+     *       /     \   \   /
+     *      /       \   \ /
+     * |---[----|--|-]---|----------------> hash scale for cells c
+     * 0      h(x) |     H
+     *          h'(x)
+     *         =h(x)+delta_h(x)
+     *
+     *
+     * Let's consider one case of the above inequality. We need to find the d,
+     * such that, when
+     *     x < uh(H) - d,                                 (1)
+     * we have
+     *     h(x) + |delta_h(x)| <= H.                      (2)
+     *
+     * Due to the monotonicity of h(x), apply h(x) to both side of inequality (1),
+     * we have
+     *     h(x) < h(uh(H) - d) <= H - |delta_h(x)|     (from (2))
+     *
+     * By solving it, we have
+     *     d = |delta_h(x)| / scaling
+     *      <= 2Mu * (1 + |x-min|/|max-min|)     (see calculation for |delta_h(x)| below)
+     *      <= 4Mu
+     *
+     * | delta_x_{h'(x)=H} | <= d <= 4Mu
+     * The similar calculation applies for the other side of the above inequality.
+     *
+     * 1.2 Error of h(x)
+     * -----------------
+     *
+     * Rules of error propagation
+     * --------------------------
+     * Absolute error of x is |delta_x|
+     * Relative error of x is epsilon_x = |delta_x| / |x|
+     * For any double number x, the relative error of x is bounded by "u". We assume all inputs
+     * have this error to make deduction clear.
+     * epsilon_x <= u = 0.5 * unit of least precision(ULP) ~= 1.1 * 10E-16
+     *
+     * |delta_(x + y)| <= |delta_x| + |delta_y|
+     * |delta_(x - y)| <= |delta_x| + |delta_y|
+     * epsilon_(x * y) <= epsilon_x + epsilon_y
+     * epsilon_(x / y) <= epsilon_x + epsilon_y
+     *
+     * For a given min, max scale, the maximum delta in a computation is bounded by the maximum
+     * value in the scale - M * u = max(|max|, |min|) * u.
+     *
+     * For the hash function h(x)
+     * --------------------------
+     *
+     * epsilon_h(x) = epsilon_(x-min) + epsilon_scaling
+     *
+     * epsilon_(x-min) = (|delta_x| + |delta_min|) / |x - min|
+     *                <= 2Mu / |x - min|
+     *
+     * epsilon_scaling = epsilon_(2^32) + epsilon_(max - min)
+     *                 = 0 + epsilon_(max - min)
+     *                <= 2Mu / |max - min|
+     *
+     * Hence, epsilon_h(x) <= 2Mu * (1/|x - min| + 1/|max - min|)
+     *
+     * |delta_h(x)| = 2Mu * (1 + |x-min|/|max-min|) * 2^32 / |max - min|
+     *             <= 4Mu * 2^32 / |max-min|
+     *
+     * 2. Error: unhashing GeoHash to point
+     * ------------------------------------
+     * Similarly, we can calculate the error for uh(h) function, assuming h is exactly
+     * represented in form of GeoHash, since integer is represented exactly.
+     *
+     * |delta_uh(h)| = epsilon_(h/scaling) * |h/scaling| + delta_min
+     *               = epsilon_(scaling) * |h/scaling| + delta_min
+     *              <= 2Mu / |max-min| * |max-min| + |min| * u
+     *              <= 3Mu
+     *
+     * Thus, the second error |delta_uh(h)| <= 3Mu
+     * Totally, the absolute error we need to add to unhashing to a point <=  4Mu + 3Mu = 7Mu
+     *
+     * 3. Error: edge length
+     * ---------------------
+     * The third part is easy to compute, since ldexp() doesn't introduce extra
+     * relative error.
+     *
+     * edge_length = ldexp(max - min, -level)
+     *
+     * epsilon_edge = epsilon_(max - min) <= 2 * M * u / |max - min|
+     *
+     * | delta_edge | = epsilon_edge * (max - min) * 2^(-level)
+     * = 2Mu * 2^(-level) <= Mu    (level >= 1)
+     *
+     * This error is neglectable when level >> 0.
+     *
+     * In conclusion, | delta_box | <= 8Mu
+     *
+     *
+     * Test
+     * ====
+     * This first two component errors can be simulated by uh'(h'(x)).
+     * Let h = h'(x)
+     * |delta_(uh'(h'(x)))|
+     * = epsilon_(h/scaling) * |h/scaling| + delta_min
+     * = (epsilon_(h) + epsilon_(scaling)) * |h/scaling| + delta_min
+     * = epsilon_(h) * h/scaling + epsilon_(scaling) * |h/scaling| + delta_min
+     * = |delta_h|/scaling + |delta_uh(h)|
+     * ~= |delta_box| when level = 32
+     *
+     * Another way to think about it is the error of uh'(h'(x)) also consists of
+     * the same two components that constitute the error of unhashing to a point,
+     * by substituting c with h'(x).
+     *
+     * | delta_(uh'(h'(x))) | = | x - uh'(h(x)) |
+     *
+     *            uh(h'(x))
+     *              |
+     *          x   | uh'(h(x))
+     * |--------|---|---|----------------> min-max scale
+     * min       \     /
+     *            \   /
+     *             \ /
+     * |--------|---|--------------------> hash scale for cells c
+     * 0      h(x)  h'(x)
+     *
+     *
+     * We can get the maximum of the error by making max very large and min = -min, x -> max
+     */
+    TEST(GeoHashConverter, UnhashToBoxError) {
+        GeoHashConverter::Parameters params;
+        // Test max from 2^-20 to 2^20
+        for (int times = -20; times <= 20; times += 2) {
+            // Construct parameters
+            params.max = ldexp(1 + 0.01 * times, times);
+            params.min = -params.max;
+            params.bits = 32;
+            double numBuckets = (1024 * 1024 * 1024 * 4.0);
+            params.scaling = numBuckets / (params.max - params.min);
+
+            GeoHashConverter converter(params);
+            // Assume level == 32, so we ignore the error of  edge length here.
+            double delta_box = 7.0 / 8.0 * GeoHashConverter::calcUnhashToBoxError(params);
+            double cellEdge = 1 / params.scaling;
+            double x;
+
+            // We are not able to test all the FP numbers to verify the error bound by design,
+            // so we consider the numbers in the cell near the point we are interested in.
+            //
+            // FP numbers starting at max, working downward in minimal increments
+            x = params.max;
+            while (x > params.max - cellEdge) {
+                x = nextafter(x, params.min);
+                double x_prime = converter.convertDoubleFromHashScale(
+                        converter.convertToDoubleHashScale(x));
+                double delta = fabs(x - x_prime);
+                ASSERT_LESS_THAN(delta, delta_box);
+            }
+
+            // FP numbers starting between first and second cell, working downward to min
+            x = params.min + cellEdge;
+            while (x > params.min) {
+                x = nextafter(x, params.min);
+                double x_prime = converter.convertDoubleFromHashScale(
+                        converter.convertToDoubleHashScale(x));
+                double delta = fabs(x - x_prime);
+                ASSERT_LESS_THAN(delta, delta_box);
+            }
+        }
+    }
+
+    // SERVER-15576 Verify a point is contained by its GeoHash box.
+    TEST(GeoHashConverter, GeoHashBox) {
+        GeoHashConverter::Parameters params;
+        params.max = 100000000.3;
+        params.min = -params.max;
+        params.bits = 32;
+        double numBuckets = (1024 * 1024 * 1024 * 4.0);
+        params.scaling = numBuckets / (params.max - params.min);
+
+        GeoHashConverter converter(params);
+
+        // Without expanding the box, the following point is not contained by its GeoHash box.
+        mongo::Point p(-7201198.6497758823, -0.1);
+        mongo::GeoHash hash = converter.hash(p);
+        mongo::Box box = converter.unhashToBoxCovering(hash);
+        ASSERT(box.inside(p));
     }
 }
diff --git a/src/mongo/db/geo/r2_region_coverer.cpp b/src/mongo/db/geo/r2_region_coverer.cpp
index 37dde0ac8e9..22812bccc77 100644
--- a/src/mongo/db/geo/r2_region_coverer.cpp
+++ b/src/mongo/db/geo/r2_region_coverer.cpp
@@ -134,7 +134,7 @@ namespace mongo {
     // Caller owns the returned pointer
     R2RegionCoverer::Candidate* R2RegionCoverer::newCandidate( const GeoHash& cell ) {
         // Exclude the cell that doesn't intersect with the geometry.
-        Box box = _hashConverter->unhashToBox(cell);
+        Box box = _hashConverter->unhashToBoxCovering(cell);
 
         if (_region->fastDisjoint(box)) {
             return NULL;
diff --git a/src/mongo/db/geo/r2_region_coverer_test.cpp b/src/mongo/db/geo/r2_region_coverer_test.cpp
index 15726598b69..5f61d40357a 100644
--- a/src/mongo/db/geo/r2_region_coverer_test.cpp
+++ b/src/mongo/db/geo/r2_region_coverer_test.cpp
@@ -161,7 +161,10 @@ namespace {
             GeoHash id( (long long) rand.nextInt64(),
                 (unsigned) rand.nextInt32( GeoHash::kMaxBits + 1 ) );
             vector<GeoHash> covering;
-            HashBoxRegion region(converter.unhashToBox(id));
+            Box box = converter.unhashToBoxCovering(id);
+            // Since the unhashed box is expanded by the error 8Mu, we need to shrink it.
+            box.fudge(-GeoHashConverter::kMachinePrecision * MAXBOUND * 20);
+            HashBoxRegion region(box);
             coverer.getCovering(region, &covering);
             ASSERT_EQUALS( covering.size(), (size_t)1 );
             ASSERT_EQUALS( covering[0], id );
@@ -183,7 +186,7 @@ namespace {
                        const R2CellUnion& covering,
                        const GeoHash cellId = GeoHash()) {
 
-        Box cell = converter.unhashToBox(cellId);
+        Box cell = converter.unhashToBoxCovering(cellId);
 
         // The covering may or may not contain this disjoint cell, we don't care.
         if (region.fastDisjoint(cell)) return;
author	Siyuan Zhou <siyuan.zhou@mongodb.com>	2014-10-14 19:41:33 -0400
committer	Siyuan Zhou <siyuan.zhou@mongodb.com>	2014-10-17 14:23:34 -0400
commit	4e10def7b1ea9da02a5218d592073ff2587544cd (patch)
tree	db0d4bdd33ce109b61cce425e6f6fc020600f265 /src/mongo/db/geo
parent	cd82c0d6739550c04b75e41742433bf65ff7826a (diff)
download	mongo-4e10def7b1ea9da02a5218d592073ff2587544cd.tar.gz