/** * Copyright (C) 2018-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/platform/basic.h" #include #include "mongo/base/init.h" #include "mongo/db/bson/dotted_path_support.h" #include "mongo/db/fts/fts_index_format.h" #include "mongo/db/fts/fts_spec.h" #include "mongo/db/server_options.h" #include "mongo/util/hex.h" #include "mongo/util/md5.hpp" #include "mongo/util/str.h" namespace mongo { namespace fts { using std::string; using std::vector; namespace dps = ::mongo::dotted_path_support; namespace { BSONObj nullObj; BSONElement nullElt; // New in textIndexVersion 2. // If the term is longer than 32 characters, it may // result in the generated key being too large // for the index. In that case, we generate a 64-character key // from the concatenation of the first 32 characters // and the hex string of the murmur3 hash value of the entire // term value. const size_t termKeyPrefixLengthV2 = 32U; // 128-bit hash value expressed in hex = 32 characters const size_t termKeySuffixLengthV2 = 32U; const size_t termKeyLengthV2 = termKeyPrefixLengthV2 + termKeySuffixLengthV2; // TextIndexVersion 3. // If the term is longer than 256 characters, it may // result in the generated key being too large // for the index. In that case, we generate a 256-character key // from the concatenation of the first 224 characters // and the hex string of the md5 hash value of the entire // term value. const size_t termKeyPrefixLengthV3 = 224U; // 128-bit hash value expressed in hex = 32 characters const size_t termKeySuffixLengthV3 = 32U; const size_t termKeyLengthV3 = termKeyPrefixLengthV3 + termKeySuffixLengthV3; /** * Given an object being indexed, 'obj', and a path through 'obj', returns the corresponding BSON * element, according to the indexing rules for the non-text fields of an FTS index key pattern. * * Specifically, throws a user assertion if an array is encountered while traversing the 'path'. It * is not legal for there to be an array along the path of the non-text prefix or suffix fields of a * text index, unless a particular array index is specified, as in "a.3". */ BSONElement extractNonFTSKeyElement(const BSONObj& obj, StringData path) { BSONElementSet indexedElements; const bool expandArrayOnTrailingField = true; MultikeyComponents arrayComponents; dps::extractAllElementsAlongPath( obj, path, indexedElements, expandArrayOnTrailingField, &arrayComponents); uassert(ErrorCodes::CannotBuildIndexKeys, str::stream() << "Field '" << path << "' of text index contains an array in document: " << obj, arrayComponents.empty()); // Since there aren't any arrays, there cannot be more than one extracted element on 'path'. invariant(indexedElements.size() <= 1U); return indexedElements.empty() ? nullElt : *indexedElements.begin(); } } // namespace MONGO_INITIALIZER(FTSIndexFormat)(InitializerContext* context) { BSONObjBuilder b; b.appendNull(""); nullObj = b.obj(); nullElt = nullObj.firstElement(); } void FTSIndexFormat::getKeys(SharedBufferFragmentBuilder& pooledBufferBuilder, const FTSSpec& spec, const BSONObj& obj, KeyStringSet* keys, KeyString::Version keyStringVersion, Ordering ordering, const boost::optional& id) { vector extrasBefore; vector extrasAfter; // Compute the non FTS key elements for the prefix. for (unsigned i = 0; i < spec.numExtraBefore(); i++) { auto indexedElement = extractNonFTSKeyElement(obj, spec.extraBefore(i)); extrasBefore.push_back(indexedElement); } // Compute the non FTS key elements for the suffix. for (unsigned i = 0; i < spec.numExtraAfter(); i++) { auto indexedElement = extractNonFTSKeyElement(obj, spec.extraAfter(i)); extrasAfter.push_back(indexedElement); } TermFrequencyMap term_freqs; spec.scoreDocument(obj, &term_freqs); auto sequence = keys->extract_sequence(); for (TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i) { const string& term = i->first; double weight = i->second; KeyString::PooledBuilder keyString(pooledBufferBuilder, keyStringVersion, ordering); for (const auto& elem : extrasBefore) { keyString.appendBSONElement(elem); } _appendIndexKey(keyString, weight, term, spec.getTextIndexVersion()); for (const auto& elem : extrasAfter) { keyString.appendBSONElement(elem); } if (id) { keyString.appendRecordId(*id); } sequence.push_back(keyString.release()); } keys->adopt_sequence(std::move(sequence)); } BSONObj FTSIndexFormat::getIndexKey(double weight, const string& term, const BSONObj& indexPrefix, TextIndexVersion textIndexVersion) { BSONObjBuilder b; BSONObjIterator i(indexPrefix); while (i.more()) { b.appendAs(i.next(), ""); } KeyString::Builder keyString(KeyString::Version::kLatestVersion, KeyString::ALL_ASCENDING); _appendIndexKey(keyString, weight, term, textIndexVersion); auto key = KeyString::toBson(keyString, KeyString::ALL_ASCENDING); return b.appendElements(key).obj(); } template void FTSIndexFormat::_appendIndexKey(KeyStringBuilder& keyString, double weight, const string& term, TextIndexVersion textIndexVersion) { invariant(weight >= 0 && weight <= MAX_WEIGHT); // FTSmaxweight = defined in fts_header // Terms are added to index key verbatim. if (TEXT_INDEX_VERSION_1 == textIndexVersion) { keyString.appendString(term); } // See comments at the top of file for termKeyPrefixLengthV2. // Apply hash for text index version 2 to long terms (longer than 32 characters). else if (TEXT_INDEX_VERSION_2 == textIndexVersion) { if (term.size() <= termKeyPrefixLengthV2) { keyString.appendString(term); } else { union { uint64_t hash[2]; char data[16]; } t; uint32_t seed = 0; MurmurHash3_x64_128(term.data(), term.size(), seed, t.hash); string keySuffix = hexblob::encodeLower(t.data, sizeof(t.data)); invariant(termKeySuffixLengthV2 == keySuffix.size()); keyString.appendString(term.substr(0, termKeyPrefixLengthV2) + keySuffix); } } else { invariant(TEXT_INDEX_VERSION_3 == textIndexVersion); if (term.size() <= termKeyPrefixLengthV3) { keyString.appendString(term); } else { string keySuffix = md5simpledigest(term); invariant(termKeySuffixLengthV3 == keySuffix.size()); keyString.appendString(term.substr(0, termKeyPrefixLengthV3) + keySuffix); } } keyString.appendNumberDouble(weight); } } // namespace fts } // namespace mongo