diff options
author | Siyuan Zhou <siyuan.zhou@mongodb.com> | 2014-10-14 19:41:33 -0400 |
---|---|---|
committer | Siyuan Zhou <siyuan.zhou@mongodb.com> | 2014-10-17 14:23:34 -0400 |
commit | 4e10def7b1ea9da02a5218d592073ff2587544cd (patch) | |
tree | db0d4bdd33ce109b61cce425e6f6fc020600f265 /src/mongo/db/geo | |
parent | cd82c0d6739550c04b75e41742433bf65ff7826a (diff) | |
download | mongo-4e10def7b1ea9da02a5218d592073ff2587544cd.tar.gz |
SERVER-15576 Floating-point inaccuracy when unhashing GeoHash to box
Diffstat (limited to 'src/mongo/db/geo')
-rw-r--r-- | src/mongo/db/geo/hash.cpp | 81 | ||||
-rw-r--r-- | src/mongo/db/geo/hash.h | 25 | ||||
-rw-r--r-- | src/mongo/db/geo/hash_test.cpp | 283 | ||||
-rw-r--r-- | src/mongo/db/geo/r2_region_coverer.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/geo/r2_region_coverer_test.cpp | 7 |
5 files changed, 357 insertions, 41 deletions
diff --git a/src/mongo/db/geo/hash.cpp b/src/mongo/db/geo/hash.cpp index 57996e0ba1e..ea481e14e14 100644 --- a/src/mongo/db/geo/hash.cpp +++ b/src/mongo/db/geo/hash.cpp @@ -32,6 +32,8 @@ #include "mongo/db/geo/shapes.h" #include "mongo/util/mongoutils/str.h" +#include <algorithm> // for max() + // So we can get at the str namespace. using namespace mongoutils; @@ -498,6 +500,21 @@ namespace mongo { static BSONField<double> maxField("max", 180.0); static BSONField<double> minField("min", -180.0); + // a x b + // | | | + // -----|---o-----|---------|-- "|" is a representable double number. + // + // In the above figure, b is the next representable double number after a, so + // |a - b|/|a| = epsilon (ULP) ~= 2.22E-16. + // + // An exact number x will be represented as the nearest representable double, which is a. + // |x - a|/|a| <= 0.5 ULP ~= 1.11e-16 + // + // IEEE floating-point operations have a maximum error of 0.5 ULPS (units in + // the last place). For double-precision numbers, this works out to 2**-53 + // (about 1.11e-16) times the magnitude of the result. + double const GeoHashConverter::kMachinePrecision = 0.5 * std::numeric_limits<double>::epsilon(); + Status GeoHashConverter::parseParameters(const BSONObj& paramDoc, GeoHashConverter::Parameters* params) { @@ -556,6 +573,9 @@ namespace mongo { // Error in radians _errorSphere = deg2rad(_error); + + // 8 * max(|max|, |min|) * u + _errorUnhashToBox = calcUnhashToBoxError(_params); } double GeoHashConverter::distanceBetweenHashes(const GeoHash& a, const GeoHash& b) const { @@ -672,16 +692,25 @@ namespace mongo { *y = convertFromHashScale(b); } - Box GeoHashConverter::unhashToBox(const GeoHash &h) const { - double sizeEdgeBox = sizeEdge(h); + Box GeoHashConverter::unhashToBoxCovering(const GeoHash &h) const { + if (h.getBits() == 0) { + // Return the result without any error. + return Box(Point(_params.min, _params.min), Point(_params.max, _params.max)); + } + + double sizeEdgeBox = sizeEdge(h.getBits()); Point min(unhashToPoint(h)); Point max(min.x + sizeEdgeBox, min.y + sizeEdgeBox); + + // Expand the box by the error bound Box box(min, max); + box.fudge(_errorUnhashToBox); return box; } - Box GeoHashConverter::unhashToBox(const BSONElement &e) const { - return unhashToBox(hash(e)); + double GeoHashConverter::calcUnhashToBoxError(const GeoHashConverter::Parameters& params) { + return std::max(fabs(params.min), fabs(params.max)) + * GeoHashConverter::kMachinePrecision* 8; } double GeoHashConverter::sizeOfDiag(const GeoHash& a) const { @@ -690,34 +719,29 @@ namespace mongo { return distanceBetweenHashes(a, b); } - // TODO: Use ldexp(max - min, -level) - double GeoHashConverter::sizeEdge(const GeoHash& a) const { - if (!a.constrains()) - return _params.max - _params.min; - - double ax, ay, bx, by; - GeoHash b = a; - b.move(1, 1); - unhash(a, &ax, &ay); - unhash(b, &bx, &by); - - // _min and _max are a singularity - if (bx == _params.min) - bx = _params.max; - return fabs(ax - bx); + // Relative error = epsilon_(max-min). ldexp() is just a direct translation to + // floating point exponent, and should be exact. + double GeoHashConverter::sizeEdge(unsigned level) const { + invariant(level >= 0); + invariant((int)level <= _params.bits); + return ldexp(_params.max - _params.min, -level); } - // Convert from an unsigned in [0, (max-min)*scaling] to [min, max] - double GeoHashConverter::convertFromHashScale(unsigned in) const { - double x = in; + // Convert from a double in [0, (max-min)*scaling] to [min, max] + double GeoHashConverter::convertDoubleFromHashScale(double x) const { x /= _params.scaling; x += _params.min; return x; } - // Convert from a double that is [min, max] to an unsigned in [0, (max-min)*scaling] - unsigned GeoHashConverter::convertToHashScale(double in) const { + // Convert from an unsigned in [0, (max-min)*scaling] to [min, max] + double GeoHashConverter::convertFromHashScale(unsigned in) const { + return convertDoubleFromHashScale((double)in); + } + + // Convert from a double that is [min, max] to a double in [0, (max-min)*scaling] + double GeoHashConverter::convertToDoubleHashScale(double in) const { verify(in <= _params.max && in >= _params.min); if (in == _params.max) { @@ -728,6 +752,13 @@ namespace mongo { in -= _params.min; verify(in >= 0); - return static_cast<unsigned>(in * _params.scaling); + return in * _params.scaling; } + + // Convert from a double that is [min, max] to an unsigned in [0, (max-min)*scaling] + unsigned GeoHashConverter::convertToHashScale(double in) const { + return static_cast<unsigned>(convertToDoubleHashScale(in)); + } + + } // namespace mongo diff --git a/src/mongo/db/geo/hash.h b/src/mongo/db/geo/hash.h index 57d2083c430..dbd357983e9 100644 --- a/src/mongo/db/geo/hash.h +++ b/src/mongo/db/geo/hash.h @@ -155,6 +155,8 @@ namespace mongo { */ class GeoHashConverter { public: + static double const kMachinePrecision; // = 1.1e-16 + struct Parameters { // How many bits to use for the hash? int bits; @@ -172,6 +174,8 @@ namespace mongo { */ static Status parseParameters(const BSONObj& paramDoc, Parameters* params); + static double calcUnhashToBoxError(const GeoHashConverter::Parameters& params); + /** * Return converter parameterss which can be used to * construct an copy of this converter. @@ -215,17 +219,18 @@ namespace mongo { void unhash(const GeoHash &h, double *x, double *y) const; /** - * Generates bounding box from geo hash using converter. - * Used in GeoBrowse::fillStack and db/query/explain_plan.cpp - * to generate index bounds from - * geo hashes in plan stats. + * Generates bounding box from geohash, expanded by the error bound */ - Box unhashToBox(const GeoHash &h) const; - Box unhashToBox(const BSONElement &e) const; + Box unhashToBoxCovering(const GeoHash &h) const; double sizeOfDiag(const GeoHash& a) const; - // XXX: understand/clean this. - double sizeEdge(const GeoHash& a) const; + + // Return the sizeEdge of a cell at a given level. + double sizeEdge(unsigned level) const; + + // Used by test. + double convertDoubleFromHashScale(double in) const; + double convertToDoubleHashScale(double in) const; private: void init(); @@ -240,5 +245,9 @@ namespace mongo { // We compute these based on the _params: double _error; double _errorSphere; + + // Error bound of unhashToBox, see hash_test.cpp for its proof. + // 8 * max(|max|, |min|) * u + double _errorUnhashToBox; }; } // namespace mongo diff --git a/src/mongo/db/geo/hash_test.cpp b/src/mongo/db/geo/hash_test.cpp index dc9be31fe97..7534a2efc04 100644 --- a/src/mongo/db/geo/hash_test.cpp +++ b/src/mongo/db/geo/hash_test.cpp @@ -32,9 +32,12 @@ #include <string> #include <sstream> +#include <iomanip> #include <cmath> +#include <algorithm> // For max() #include "mongo/db/geo/hash.h" +#include "mongo/db/geo/shapes.h" #include "mongo/platform/random.h" #include "mongo/unittest/unittest.h" #include "mongo/util/assert_util.h" @@ -97,10 +100,280 @@ namespace { GeoHashConverter converter(params); - ASSERT_APPROX_EQUAL(100.0, converter.sizeEdge(GeoHash(0, 0, 0)), kError); - ASSERT_APPROX_EQUAL(50.0, converter.sizeEdge(GeoHash(0, 0, 1)), kError); - ASSERT_APPROX_EQUAL(25.0, converter.sizeEdge(GeoHash(0, 0, 2)), kError); - ASSERT_APPROX_EQUAL(ldexp(100.0, -26), converter.sizeEdge(GeoHash(0, 0, 26)), kError); - ASSERT_APPROX_EQUAL(ldexp(100.0, -32), converter.sizeEdge(GeoHash(0, 0, 32)), kError); + ASSERT_APPROX_EQUAL(100.0, converter.sizeEdge(0), kError); + ASSERT_APPROX_EQUAL(50.0, converter.sizeEdge(1), kError); + ASSERT_APPROX_EQUAL(25.0, converter.sizeEdge(2), kError); + } + + /** + * ========================== + * Error Bound of UnhashToBox + * ========================== + * + * Compute the absolute error when unhashing a GeoHash to a box, so that expanding + * the box by this absolute error can guarantee a point is always contained by the box + * of its GeoHash. Thus, the absolute error of box should consist of 3 components: + * + * 1) The error introduced by hashing x to GeoHash. The extreme example would be a point + * close to the boundary of a cell is hashed to an adjacent box. + * + * For a hash/unhash functions h(x)/uh(x) and computed functions h'(x),uh'(x): + * + * x uh(h'(x)) + * |--------|----|--------------------> min-max scale + * min \ + * \ + * \ + * \ + * |--------|--|-|--------------------> hash scale for cells c + * 0 h(x) c h'(x) + * + * 2) The error introduced by unhashing an (int) GeoHash to its lower left corner in x-y + * space. + * + * uh(c) + * x | uh'(c) + * |--------|--|----|-----------------> min-max scale + * min \ / + * \ / + * \ / + * X + * |--------|--|-|--------------------> hash scale for cells c + * 0 h(x) c h'(x) + * + * 3) The error introduced by adding the edge length to get the top-right corner of box. + * Instead of directly computing uh'(c+1), we add the computed box edge length to the computed + * value uh(c), giving us an extra error. + * + * |edge(min,max)| + * | | + * | uh(c)+edge + * uh(c) | + * |-------------|------[uh(c)+edge']-----------> min-max scale + * min + * + * |-------------|-------------|----------------> hash scale + * 0 c c+1 + * Hash and unhash definitions + * ------------------------- + * h(x) = (x - min) * scaling = 2^32 * (x - min) / (max - min) + * uh(h) = h / scaling + min, + * where + * scaling = 2^32 / (max - min) + * + * Again, h(x)/uh(x) are the exact hash functions and h'(x)/uh'(x) are the computational hash + * functions which have small rounding errors. + * + * | h'(x) - h(x) | == | delta_h(x; max, min) | + * where delta_fn = the absolute difference between the computed and actual value of a + * function. + * + * Restating the problem, we're looking for: + * |delta_box| = | delta_x_{h'(x)=H} + delta_uh(h) + delta_edge_length | + * <= | delta_x_{h'(x)=H} | + | delta_uh(h) | + | delta_edge_length | + * + * 1. Error bounds calculation + * --------------------------- + * + * 1.1 Error: | delta_x_{h'(x)=H} | + * -------------------------------- + * The first error | delta_x_{h'(x)=H} | means, given GeoHash H, we can find + * the range of x and only the range of x that may be mapped to H. + * In other words, given H, for any x that is far enough from uh(H) by at least d, + * it is impossible for x to be mapped to H. + * Mathematical, find d, such that for any x satisfying |x - uh(H)| > d, + * |h(x) - H| >= | delta_h(x) | + * => |h(x) - H| - | delta_h(x) | >= 0 + * => |h(x) - H + delta_h(x) | >= 0 (|a + b| >= |a| - |b|) + * => |h'(x) - H| >= 0 (h'(x) = h(x) + delta_h(x)) + * which guarantees h'(x) != H. + * + * + * uh(H)-d + * | + * x | uh(H) + * |--------|---[----|----]-----------> min-max scale + * min / \ \ / + * / \ \ / + * / \ \ / + * / \ \ / + * |---[----|--|-]---|----------------> hash scale for cells c + * 0 h(x) | H + * h'(x) + * =h(x)+delta_h(x) + * + * + * Let's consider one case of the above inequality. We need to find the d, + * such that, when + * x < uh(H) - d, (1) + * we have + * h(x) + |delta_h(x)| <= H. (2) + * + * Due to the monotonicity of h(x), apply h(x) to both side of inequality (1), + * we have + * h(x) < h(uh(H) - d) <= H - |delta_h(x)| (from (2)) + * + * By solving it, we have + * d = |delta_h(x)| / scaling + * <= 2Mu * (1 + |x-min|/|max-min|) (see calculation for |delta_h(x)| below) + * <= 4Mu + * + * | delta_x_{h'(x)=H} | <= d <= 4Mu + * The similar calculation applies for the other side of the above inequality. + * + * 1.2 Error of h(x) + * ----------------- + * + * Rules of error propagation + * -------------------------- + * Absolute error of x is |delta_x| + * Relative error of x is epsilon_x = |delta_x| / |x| + * For any double number x, the relative error of x is bounded by "u". We assume all inputs + * have this error to make deduction clear. + * epsilon_x <= u = 0.5 * unit of least precision(ULP) ~= 1.1 * 10E-16 + * + * |delta_(x + y)| <= |delta_x| + |delta_y| + * |delta_(x - y)| <= |delta_x| + |delta_y| + * epsilon_(x * y) <= epsilon_x + epsilon_y + * epsilon_(x / y) <= epsilon_x + epsilon_y + * + * For a given min, max scale, the maximum delta in a computation is bounded by the maximum + * value in the scale - M * u = max(|max|, |min|) * u. + * + * For the hash function h(x) + * -------------------------- + * + * epsilon_h(x) = epsilon_(x-min) + epsilon_scaling + * + * epsilon_(x-min) = (|delta_x| + |delta_min|) / |x - min| + * <= 2Mu / |x - min| + * + * epsilon_scaling = epsilon_(2^32) + epsilon_(max - min) + * = 0 + epsilon_(max - min) + * <= 2Mu / |max - min| + * + * Hence, epsilon_h(x) <= 2Mu * (1/|x - min| + 1/|max - min|) + * + * |delta_h(x)| = 2Mu * (1 + |x-min|/|max-min|) * 2^32 / |max - min| + * <= 4Mu * 2^32 / |max-min| + * + * 2. Error: unhashing GeoHash to point + * ------------------------------------ + * Similarly, we can calculate the error for uh(h) function, assuming h is exactly + * represented in form of GeoHash, since integer is represented exactly. + * + * |delta_uh(h)| = epsilon_(h/scaling) * |h/scaling| + delta_min + * = epsilon_(scaling) * |h/scaling| + delta_min + * <= 2Mu / |max-min| * |max-min| + |min| * u + * <= 3Mu + * + * Thus, the second error |delta_uh(h)| <= 3Mu + * Totally, the absolute error we need to add to unhashing to a point <= 4Mu + 3Mu = 7Mu + * + * 3. Error: edge length + * --------------------- + * The third part is easy to compute, since ldexp() doesn't introduce extra + * relative error. + * + * edge_length = ldexp(max - min, -level) + * + * epsilon_edge = epsilon_(max - min) <= 2 * M * u / |max - min| + * + * | delta_edge | = epsilon_edge * (max - min) * 2^(-level) + * = 2Mu * 2^(-level) <= Mu (level >= 1) + * + * This error is neglectable when level >> 0. + * + * In conclusion, | delta_box | <= 8Mu + * + * + * Test + * ==== + * This first two component errors can be simulated by uh'(h'(x)). + * Let h = h'(x) + * |delta_(uh'(h'(x)))| + * = epsilon_(h/scaling) * |h/scaling| + delta_min + * = (epsilon_(h) + epsilon_(scaling)) * |h/scaling| + delta_min + * = epsilon_(h) * h/scaling + epsilon_(scaling) * |h/scaling| + delta_min + * = |delta_h|/scaling + |delta_uh(h)| + * ~= |delta_box| when level = 32 + * + * Another way to think about it is the error of uh'(h'(x)) also consists of + * the same two components that constitute the error of unhashing to a point, + * by substituting c with h'(x). + * + * | delta_(uh'(h'(x))) | = | x - uh'(h(x)) | + * + * uh(h'(x)) + * | + * x | uh'(h(x)) + * |--------|---|---|----------------> min-max scale + * min \ / + * \ / + * \ / + * |--------|---|--------------------> hash scale for cells c + * 0 h(x) h'(x) + * + * + * We can get the maximum of the error by making max very large and min = -min, x -> max + */ + TEST(GeoHashConverter, UnhashToBoxError) { + GeoHashConverter::Parameters params; + // Test max from 2^-20 to 2^20 + for (int times = -20; times <= 20; times += 2) { + // Construct parameters + params.max = ldexp(1 + 0.01 * times, times); + params.min = -params.max; + params.bits = 32; + double numBuckets = (1024 * 1024 * 1024 * 4.0); + params.scaling = numBuckets / (params.max - params.min); + + GeoHashConverter converter(params); + // Assume level == 32, so we ignore the error of edge length here. + double delta_box = 7.0 / 8.0 * GeoHashConverter::calcUnhashToBoxError(params); + double cellEdge = 1 / params.scaling; + double x; + + // We are not able to test all the FP numbers to verify the error bound by design, + // so we consider the numbers in the cell near the point we are interested in. + // + // FP numbers starting at max, working downward in minimal increments + x = params.max; + while (x > params.max - cellEdge) { + x = nextafter(x, params.min); + double x_prime = converter.convertDoubleFromHashScale( + converter.convertToDoubleHashScale(x)); + double delta = fabs(x - x_prime); + ASSERT_LESS_THAN(delta, delta_box); + } + + // FP numbers starting between first and second cell, working downward to min + x = params.min + cellEdge; + while (x > params.min) { + x = nextafter(x, params.min); + double x_prime = converter.convertDoubleFromHashScale( + converter.convertToDoubleHashScale(x)); + double delta = fabs(x - x_prime); + ASSERT_LESS_THAN(delta, delta_box); + } + } + } + + // SERVER-15576 Verify a point is contained by its GeoHash box. + TEST(GeoHashConverter, GeoHashBox) { + GeoHashConverter::Parameters params; + params.max = 100000000.3; + params.min = -params.max; + params.bits = 32; + double numBuckets = (1024 * 1024 * 1024 * 4.0); + params.scaling = numBuckets / (params.max - params.min); + + GeoHashConverter converter(params); + + // Without expanding the box, the following point is not contained by its GeoHash box. + mongo::Point p(-7201198.6497758823, -0.1); + mongo::GeoHash hash = converter.hash(p); + mongo::Box box = converter.unhashToBoxCovering(hash); + ASSERT(box.inside(p)); } } diff --git a/src/mongo/db/geo/r2_region_coverer.cpp b/src/mongo/db/geo/r2_region_coverer.cpp index 37dde0ac8e9..22812bccc77 100644 --- a/src/mongo/db/geo/r2_region_coverer.cpp +++ b/src/mongo/db/geo/r2_region_coverer.cpp @@ -134,7 +134,7 @@ namespace mongo { // Caller owns the returned pointer R2RegionCoverer::Candidate* R2RegionCoverer::newCandidate( const GeoHash& cell ) { // Exclude the cell that doesn't intersect with the geometry. - Box box = _hashConverter->unhashToBox(cell); + Box box = _hashConverter->unhashToBoxCovering(cell); if (_region->fastDisjoint(box)) { return NULL; diff --git a/src/mongo/db/geo/r2_region_coverer_test.cpp b/src/mongo/db/geo/r2_region_coverer_test.cpp index 15726598b69..5f61d40357a 100644 --- a/src/mongo/db/geo/r2_region_coverer_test.cpp +++ b/src/mongo/db/geo/r2_region_coverer_test.cpp @@ -161,7 +161,10 @@ namespace { GeoHash id( (long long) rand.nextInt64(), (unsigned) rand.nextInt32( GeoHash::kMaxBits + 1 ) ); vector<GeoHash> covering; - HashBoxRegion region(converter.unhashToBox(id)); + Box box = converter.unhashToBoxCovering(id); + // Since the unhashed box is expanded by the error 8Mu, we need to shrink it. + box.fudge(-GeoHashConverter::kMachinePrecision * MAXBOUND * 20); + HashBoxRegion region(box); coverer.getCovering(region, &covering); ASSERT_EQUALS( covering.size(), (size_t)1 ); ASSERT_EQUALS( covering[0], id ); @@ -183,7 +186,7 @@ namespace { const R2CellUnion& covering, const GeoHash cellId = GeoHash()) { - Box cell = converter.unhashToBox(cellId); + Box cell = converter.unhashToBoxCovering(cellId); // The covering may or may not contain this disjoint cell, we don't care. if (region.fastDisjoint(cell)) return; |