summaryrefslogtreecommitdiff
path: root/src/mongo/bson/bson_validate.cpp
blob: f69700696b1e3fc14439d0af090f2e7e47ce6218 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
/**
 *    Copyright (C) 2018-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#include "mongo/bson/bson_validate.h"

#include <cstring>
#include <vector>

#include "mongo/base/data_view.h"
#include "mongo/bson/bson_depth.h"
#include "mongo/bson/bsonelement.h"
#include "mongo/bson/util/bsoncolumn.h"
#include "mongo/logv2/log.h"

#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault


namespace mongo {
namespace {

// Classifies how an element's value is validated once its type byte and field name are consumed.
// NOTE: each kSkipXX enumerator must equal the number of 4-byte words occupied by the value (the
// value's byte count divided by 4), because that number is used directly to compute the size to
// skip: don't change them! The values of the remaining enumerators are arbitrary.
enum ValidationStyle : uint8_t {
    // Fixed-size values, written as <bytes of data after the field name> / 4.
    kSkip0 = 0 / 4,    // The element only consists of the type byte and field name.
    kSkip4 = 4 / 4,    // 4 additional bytes of data.
    kSkip8 = 8 / 4,    // 8 additional bytes of data.
    kSkip12 = 12 / 4,  // 12 additional bytes of data.
    kSkip16 = 16 / 4,  // 16 additional bytes of data.

    // Everything else.
    kString = 5,         // An int32 with the string length (including NUL) follows the field name.
    kObjectOrArray = 6,  // The type starts a new nested object or array.
    kSpecial = 7,        // Handled specially: any cases that don't fall into the above.
};

// Lookup table from a BSON type byte to the ValidationStyle used for its value. Only the types
// \x00 through \x13 are listed; the table is padded to 32 entries and aligned to 32 bytes for more
// efficient lookup, and the unlisted trailing entries are zero-initialized.
static constexpr ValidationStyle kTypeInfoTable alignas(32)[32] = {
    kSpecial,        // \x00 EOO
    kSkip8,          // \x01 NumberDouble
    kString,         // \x02 String
    kObjectOrArray,  // \x03 Object
    kObjectOrArray,  // \x04 Array
    kSpecial,        // \x05 BinData
    kSkip0,          // \x06 Undefined
    kSkip12,         // \x07 OID
    kSpecial,        // \x08 Bool (requires 0/1 false/true validation)
    kSkip8,          // \x09 Date
    kSkip0,          // \x0a Null
    kSpecial,        // \x0b Regex (two nul-terminated strings)
    kSpecial,        // \x0c DBRef
    kString,         // \x0d Code
    kString,         // \x0e Symbol
    kSpecial,        // \x0f CodeWScope
    kSkip4,          // \x10 Int
    kSkip8,          // \x11 Timestamp
    kSkip8,          // \x12 Long
    kSkip16,         // \x13 Decimal
};
// Guards the padding: every entry must stay one byte wide for the table to be exactly 32 bytes.
MONGO_STATIC_ASSERT(sizeof(kTypeInfoTable) == 32);

constexpr ErrorCodes::Error InvalidBSON = ErrorCodes::InvalidBSON;
constexpr ErrorCodes::Error NonConformantBSON = ErrorCodes::NonConformantBSON;

/**
 * Validator hooks for the default mode. Every hook is deliberately a no-op, so a buffer checked
 * with this validator only undergoes the checks performed by the code that invokes the hooks.
 */
class DefaultValidator {
public:
    // Hook invoked with an element's start, the offset to its value and its type byte: no-op.
    void checkNonConformantElem(const char* /*ptr*/,
                                uint32_t /*offsetToValue*/,
                                uint8_t /*type*/) {
    }

    // No-op.
    void checkUTF8Char() {
    }

    // No-op.
    void checkDuplicateFieldName() {
    }

    // No-op: this validator keeps no per-level state.
    void popLevel() {
    }
};

/**
 * Validator hooks for the extended mode. In addition to whatever the invoking code checks, it
 * raises NonConformantBSON for:
 *  - the Undefined, DBRef, Symbol and CodeWScope types;
 *  - array field names that are not the sequential decimal indexes "0", "1", ...;
 *  - regex options that are not drawn from "ilmsux" in strictly ascending order;
 *  - the ByteArrayDeprecated and bdtUUID binary subtypes, and newUUID / MD5Type binary values
 *    whose length is not 16 bytes.
 */
class ExtendedValidator {
public:
    /**
     * Checks a single element.
     *
     * 'ptr' points at the element's type byte, 'offsetToValue' is the distance from 'ptr' to the
     * element's value (that is, past the type byte and the NUL-terminated field name) and 'type'
     * is the type byte. The element's bytes are read without bounds checks, so the element must
     * already be known to be structurally sound.
     *
     * Throws NonConformantBSON through uassert/uasserted when a check fails.
     */
    void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {
        // Checks the field name before the element, if inside array. This has to happen before a
        // nested Array/Object pushes its own level below, so that the name is compared against
        // the counter of the containing level.
        checkArrIndex(ptr);
        // Increments the pointer to the actual element.
        ptr += offsetToValue;
        switch (type) {
            case BSONType::Undefined:
            case BSONType::DBRef:
            case BSONType::Symbol:
            case BSONType::CodeWScope:
                uasserted(NonConformantBSON, fmt::format("Use of deprecated BSON type {}", type));
                break;
            case BSONType::Array:
                addIndexLevel(true /* isArr */);
                break;
            case BSONType::Object:
                addIndexLevel(false /* isArr */);
                break;
            case BSONType::RegEx: {
                // Skips regular expression cstring.
                const char* options = ptr + strlen(ptr) + 1;
                _checkRegexOptions(options);
                break;
            }
            case BSONType::BinData: {
                // The value starts with a 4-byte length, immediately followed by the subtype byte.
                uint8_t subtype =
                    ConstDataView(ptr + sizeof(uint32_t)).read<LittleEndian<uint8_t>>();
                switch (subtype) {
                    case BinDataType::ByteArrayDeprecated:
                    case BinDataType::bdtUUID:
                        uasserted(
                            NonConformantBSON,
                            fmt::format("Use of deprecated BSON binary data subtype {}", subtype));
                        break;
                    case BinDataType::newUUID: {
                        constexpr uint32_t UUIDLength = 16;
                        uint32_t l = ConstDataView(ptr).read<LittleEndian<uint32_t>>();
                        uassert(
                            ErrorCodes::NonConformantBSON,
                            fmt::format("BSON UUID length should be {} bytes. Found {} instead.",
                                        UUIDLength,
                                        l),
                            l == UUIDLength);
                        break;
                    }
                    case BinDataType::MD5Type: {
                        constexpr uint32_t md5Length = 16;
                        auto md5Size = ConstDataView(ptr).read<LittleEndian<uint32_t>>();
                        uassert(NonConformantBSON,
                                fmt::format("MD5 must be 16 bytes, got {} instead.", md5Size),
                                md5Size == md5Length);
                        break;
                    }
                }
                break;
            }
        }
    }

    // No-op.
    void checkUTF8Char() {}

    // No-op.
    void checkDuplicateFieldName() {}

    /**
     * Discards the innermost level pushed by checkNonConformantElem() for an Array or Object.
     * Calling it when no level is recorded is tolerated and does nothing.
     */
    void popLevel() {
        if (!indexCount.empty()) {
            indexCount.pop_back();
        }
    }

private:
    struct Level {
        DecimalCounter<uint32_t> counter;  // Counter used to check whether indexes are sequential.
        bool isArr;                        // Indicates whether level is an array or other (object).
    };

    // Pushes a new level whose index counter starts at 0.
    void addIndexLevel(bool isArr) {
        indexCount.push_back(Level{DecimalCounter<uint32_t>(0), isArr});
    }

    // Returns true when the innermost recorded level is an array.
    bool inArr() const {
        return !indexCount.empty() && indexCount.back().isArr;
    }

    /**
     * When the innermost level is an array, checks that the field name of the element starting at
     * 'ptr' equals the next expected index and advances the counter. Does nothing otherwise.
     */
    void checkArrIndex(const char* ptr) {
        if (!inArr()) {
            return;
        }
        // Checks the actual index, skipping the type byte.
        auto actualIndex = StringData(ptr + sizeof(char));
        uassert(NonConformantBSON,
                fmt::format("Indices of BSON Array are invalid. Expected {}, but got {}.",
                            indexCount.back().counter,
                            actualIndex),
                indexCount.back().counter == actualIndex);
        ++indexCount.back().counter;
    }

    /**
     * Checks that every character of the NUL-terminated 'options' string is a valid regex option
     * and that the options appear in strictly ascending alphabetical order, which also rejects
     * duplicates.
     */
    void _checkRegexOptions(const char* options) {
        const std::string validRegexOptions("ilmsux");
        // Walk the original C string rather than a std::string copy of it: the ordering check has
        // to recognize the first character by its address and look at the preceding character,
        // and both only make sense relative to the buffer that 'options' itself points to.
        for (const char* option = options; *option != '\0'; ++option) {
            uassert(
                NonConformantBSON,
                fmt::format("Valid regex options are [ i, l, m, s, u, x], but found '{}' instead.",
                            *option),
                validRegexOptions.find(*option) != std::string::npos);
            uassert(NonConformantBSON,
                    fmt::format("Regex options should be in ascending alphabetical order. "
                                "Found {} instead.",
                                options),
                    option == options || *option > *(option - 1));
        }
    }

protected:
    // Behaves like a stack, used to validate array index count.
    std::vector<Level> indexCount;
};

/**
 * Validator hooks for the full mode: runs every ExtendedValidator check and additionally makes
 * sure that binary data of the Column subtype can be decompressed.
 */
class FullValidator : private ExtendedValidator {
public:
    /**
     * Same contract as ExtendedValidator::checkNonConformantElem(). On top of it, raises
     * NonConformantBSON when decompressing a BSON column throws anything.
     */
    void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {
        ExtendedValidator::checkNonConformantElem(ptr, offsetToValue, type);

        // Only binary data needs further inspection.
        if (type != BSONType::BinData)
            return;

        // The subtype byte sits right after the 4-byte length at the start of the value.
        const char* subtypePos = ptr + offsetToValue + sizeof(uint32_t);
        uint8_t subtype = ConstDataView(subtypePos).read<LittleEndian<uint8_t>>();
        if (subtype != BinDataType::Column)
            return;

        // Check for exceptions when decompressing.
        // Calling size() decompresses the entire column.
        try {
            BSONColumn(BSONElement(ptr)).size();
        } catch (...) {
            uasserted(NonConformantBSON, "Exception ocurred while decompressing a BSON column.");
        }
    }

    // No-op.
    void checkUTF8Char() {
    }

    // No-op.
    void checkDuplicateFieldName() {
    }

    // Forwards to the privately inherited implementation.
    void popLevel() {
        ExtendedValidator::popLevel();
    }
};

template <bool precise, typename BSONValidator>
class ValidateBuffer {
public:
    ValidateBuffer(const char* data, uint64_t maxLength, BSONValidator validator)
        : _data(data), _maxLength(maxLength), _validator(validator) {
        if constexpr (precise)
            _frames.resize(BSONDepth::getMaxAllowableDepth() + 1);
    }

    Status validate() noexcept {
        try {
            _currFrame = _frames.begin();
            _currElem = nullptr;
            auto maxFrames = BSONDepth::getMaxAllowableDepth() + 1;  // A flat BSON has one frame.
            uassert(InvalidBSON, "Cannot enforce max nesting depth", _frames.size() <= maxFrames);
            uassert(InvalidBSON, "BSON data has to be at least 5 bytes", _maxLength >= 5);

            // Read the length as signed integer, to ensure we limit it to < 2GB.
            // All other lengths are read as unsigned, which makes for easier bounds checking.
            Cursor cursor = {_data, _data + _maxLength};
            int32_t len = cursor.template read<int32_t>();
            uassert(InvalidBSON, "BSON data has to be at least 5 bytes", len >= 5);
            uassert(InvalidBSON,
                    str::stream() << "Incorrect BSON length " << static_cast<size_t>(len)
                                  << " should be less or equal to " << _maxLength,
                    static_cast<size_t>(len) <= _maxLength);
            const char* end = _currFrame->end = _data + len;
            uassert(InvalidBSON, "BSON object not terminated with EOO", end[-1] == 0);
            _validateIterative(Cursor{cursor.ptr, end});
        } catch (const ExceptionForCat<ErrorCategory::ValidationError>& e) {
            return Status(e.code(), str::stream() << e.what() << " " << _context());
        }
        return Status::OK();
    }

private:
    struct Empty {};

    /**
     * Extra information for each nesting level in the precise validation mode.
     */
    struct PreciseFrameInfo {
        BSONElement elem;  // _id for top frame, unchecked Object, Array or CodeWScope otherwise.
    };

    struct Frame : public std::conditional<precise, PreciseFrameInfo, Empty>::type {
        const char* end;  // Used for checking encoded object/array sizes, not bounds checking.
    };

    using Frames =
        typename std::conditional<precise, std::vector<Frame>, std::array<Frame, 32>>::type;

    struct Cursor {
        void skip(size_t len) {
            uassert(InvalidBSON, "BSON size is larger than buffer size", (ptr += len) < end);
        }

        template <typename T>
        T read() {
            auto val = ptr;
            skip(sizeof(T));
            return ConstDataView(val).read<LittleEndian<T>>();
        }

        void skipString() {
            auto len = read<uint32_t>();
            skip(len);
            uassert(InvalidBSON, "Not null terminated string", !ptr[-1] && len > 0);
        }

        size_t strlen() const {
            // This is actually by far the hottest code in all of BSON validation.
            dassert(ptr < end);
            size_t len = 0;
            while (ptr[len])
                ++len;
            return len;
        }

        const char* ptr;
        const char* const end;
    };

    const char* _pushFrame(Cursor cursor) {
        uassert(ErrorCodes::Overflow,
                "BSONObj exceeds maximum nested object depth",
                ++_currFrame != _frames.end());
        auto obj = cursor.ptr;
        auto len = cursor.template read<int32_t>();
        uassert(ErrorCodes::InvalidBSON, "Nested BSON object has to be at least 5 bytes", len >= 5);
        _currFrame->end = obj + len;

        if constexpr (precise) {
            auto nameLen = obj - _currElem;
            _currFrame->elem = BSONElement(_currElem, nameLen, nameLen + len);
        }
        return cursor.ptr;
    }

    bool _popFrame() {
        _validator.popLevel();
        if (_currFrame == _frames.begin())
            return false;
        --_currFrame;
        return true;
    }

    static const char* _validateSpecial(Cursor cursor, uint8_t type) {
        switch (type) {
            case BSONType::BinData:
                cursor.skip(cursor.template read<uint32_t>());  // Like String, but...
                cursor.skip(1);  // ...add extra skip for the subtype byte to avoid overflow.
                break;
            case BSONType::Bool:
                if (auto value = cursor.template read<uint8_t>())  // If not 0, must be 1.
                    uassert(InvalidBSON, "BSON bool is neither false nor true", value == 1);
                break;
            case BSONType::RegEx:
                cursor.skip(0);  // Force validation of the ptr after skipping past the field name.
                cursor.skip(cursor.strlen() + 1);  // Skip regular expression cstring.
                cursor.skip(cursor.strlen() + 1);  // Skip options cstring.
                break;
            case BSONType::DBRef:
                cursor.skipString();  // Like String, but...
                cursor.skip(12);      // ...also skip the 12-byte ObjectId.
                break;
            case static_cast<uint8_t>(BSONType::MinKey):  // Need to cast, as MinKey is negative.
            case BSONType::MaxKey:
                cursor.skip(0);  // Force validation of the ptr after skipping past the field name.
                break;
            default:
                uasserted(InvalidBSON, str::stream() << "Unrecognized BSON type " << type);
        }
        return cursor.ptr;
    }

    const char* _pushCodeWithScope(Cursor cursor) {
        cursor.ptr = _pushFrame(cursor);  // Push a dummy frame to check the CodeWScope size.
        cursor.skipString();              // Now skip the BSON UTF8 string containing the code.
        _currElem = cursor.ptr - 1;       // Use the terminating NUL as a dummy scope element.
        return _pushFrame(cursor);
    }

    void _maybePopCodeWithScope(Cursor cursor) {
        if constexpr (precise) {
            // When ending the scope of a CodeWScope, pop the extra dummy frame and check its size.
            if (_currFrame != _frames.begin() && (_currFrame - 1)->elem.type() == CodeWScope) {
                invariant(_popFrame());
                uassert(InvalidBSON, "incorrect BSON length", cursor.ptr == _currFrame->end);
            }
        }
    }

    const char* _validateElem(Cursor cursor, uint8_t type) {
        if (MONGO_unlikely(type > JSTypeMax))
            return _validateSpecial(cursor, type);

        auto style = kTypeInfoTable[type];
        if (MONGO_likely(style <= kSkip16))
            cursor.skip(style * 4);
        else if (MONGO_likely(style == kString))
            cursor.skipString();
        else if (MONGO_likely(style == kObjectOrArray))
            cursor.ptr = _pushFrame(cursor);
        else if (MONGO_unlikely(precise && type == CodeWScope))
            cursor.ptr = _pushCodeWithScope(cursor);
        else
            cursor.ptr = _validateSpecial(cursor, type);

        return cursor.ptr;
    }

    MONGO_COMPILER_NOINLINE void _validateIterative(Cursor cursor) {
        do {
            // Use the fact that the EOO byte is 0, just like the end of string, so checking for EOO
            // is same as finding len == 0. The cursor cannot point past EOO, so the strlen is safe.
            uassert(InvalidBSON, "BSON size is larger than buffer size", cursor.ptr < cursor.end);
            while (size_t len = cursor.strlen()) {
                uint8_t type = *cursor.ptr;
                _currElem = cursor.ptr;
                cursor.ptr += len + 1;
                cursor.ptr = _validateElem(cursor, type);

                // Check if the data is compliant to other BSON specifications if the element is
                // structurally correct.
                _validator.checkNonConformantElem(_currElem, len + 1, type);

                if constexpr (precise) {
                    // See if the _id field was just validated. If so, set the global scope element.
                    if (_currFrame == _frames.begin() && StringData(_currElem + 1) == "_id"_sd)
                        _currFrame->elem = BSONElement(_currElem);  // This is fully validated now.
                }
                dassert(cursor.ptr <= cursor.end);
            }

            // Got the EOO byte: skip it and compare its location with the expected frame end.
            uassert(InvalidBSON, "incorrect BSON length", ++cursor.ptr == _currFrame->end);
            _maybePopCodeWithScope(cursor);
        } while (_popFrame());  // Finished when there are no frames left.
    }

    /**
     * Returns a string qualifying the context in which an exception occurred. Example return is
     * "in element with field name 'foo.bar' in object with _id: 1".
     */
    std::string _context() {
        str::stream ctx;
        ctx << "in element with field name '";
        if constexpr (precise) {
            std::for_each(_frames.begin() + 1,
                          _currFrame + (_currFrame != _frames.end()),
                          [&](auto& frame) { ctx << frame.elem.fieldName() << "."; });
        }
        ctx << (_currElem ? _currElem + 1 : "?") << "'";

        if constexpr (precise) {
            auto _id = _frames.begin()->elem;
            ctx << " in object with " << (_id ? BSONElement(_id).toString() : "unknown _id");
        }
        return str::escape(ctx);
    }

    const char* const _data;  // The data buffer to check.
    const size_t _maxLength;  // The size of the data buffer. The BSON object may be smaller.
    const char* _currElem = nullptr;  // Element to validate: only the name is known to be good.
    typename Frames::iterator _currFrame;  // Frame currently being validated.
    Frames _frames;  // Has end pointers to check and the containing element for precise mode.
    BSONValidator _validator;
};

/**
 * Validates the buffer with the given validator in up to two passes and returns the resulting
 * Status.
 *
 * The first pass is the fast but less precise one. It returns a not-OK status for objects with
 * CodeWScope or nesting exceeding 32 levels, as well as for actual failures. Whenever it does not
 * succeed, the buffer is validated again from scratch by the precise version, which gives a
 * detailed error context, and the outcome of that second pass is what gets returned.
 */
template <typename BSONValidator>
Status _doValidate(const char* originalBuffer, uint64_t maxLength, BSONValidator validator) {
    {
        ValidateBuffer<false, BSONValidator> fastPass(originalBuffer, maxLength, validator);
        const bool fastPassSucceeded = fastPass.validate().isOK();
        if (MONGO_likely(fastPassSucceeded))
            return Status::OK();
    }

    ValidateBuffer<true, BSONValidator> precisePass(originalBuffer, maxLength, validator);
    return precisePass.validate();
}
}  // namespace

/**
 * Validates the BSON object stored in the first 'maxLength' bytes of 'originalBuffer' using the
 * validator that corresponds to 'mode'. The default and extended modes try the fast validation
 * before the precise one; the full mode runs the precise validation directly. Any other mode is
 * treated as unreachable.
 */
Status validateBSON(const char* originalBuffer,
                    uint64_t maxLength,
                    BSONValidateMode mode) noexcept {
    // The default mode is expected to be the common case, so test for it first.
    if (MONGO_likely(mode == BSONValidateMode::kDefault))
        return _doValidate(originalBuffer, maxLength, DefaultValidator());

    switch (mode) {
        case BSONValidateMode::kExtended:
            return _doValidate(originalBuffer, maxLength, ExtendedValidator());
        case BSONValidateMode::kFull:
            return ValidateBuffer<true, FullValidator>(originalBuffer, maxLength, FullValidator())
                .validate();
        default:
            MONGO_UNREACHABLE;
    }
}

/**
 * Convenience overload: validates the bytes owned by 'obj', using the size the object reports
 * for itself as the buffer length.
 */
Status validateBSON(const BSONObj& obj, BSONValidateMode mode) {
    const char* const rawBytes = obj.objdata();
    const auto reportedSize = obj.objsize();
    return validateBSON(rawBytes, reportedSize, mode);
}
}  // namespace mongo