// fts_index_format.cpp /** * Copyright (C) 2012 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the GNU Affero General Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/pch.h" #include #include "mongo/base/init.h" #include "mongo/db/fts/fts_index_format.h" #include "mongo/util/hex.h" #include "mongo/util/mongoutils/str.h" namespace mongo { namespace fts { namespace { BSONObj nullObj; BSONElement nullElt; // New in textIndexVersion 2. // If the term is longer than 32 characters, it may // result in the generated key being too large // for the index. In that case, we generate a 64-character key // from the concatenation of the first 32 characters // and the hex string of the murmur3 hash value of the entire // term value. const size_t termKeyPrefixLength = 32U; // 128-bit hash value expressed in hex = 32 characters const size_t termKeySuffixLength = 32U; const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength; /** * Returns size of buffer required to store term in index key. * In version 1, terms are stored verbatim in key. * In version 2, terms longer than 32 characters are hashed and combined * with a prefix. */ int guessTermSize( const std::string& term, TextIndexVersion textIndexVersion ) { if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) { return term.size(); } else { invariant( TEXT_INDEX_VERSION_2 == textIndexVersion ); if ( term.size() <= termKeyPrefixLength ) { return term.size(); } return termKeyLength; } } } MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) { BSONObjBuilder b; b.appendNull( "" ); nullObj = b.obj(); nullElt = nullObj.firstElement(); return Status::OK(); } void FTSIndexFormat::getKeys( const FTSSpec& spec, const BSONObj& obj, BSONObjSet* keys ) { int extraSize = 0; vector extrasBefore; vector extrasAfter; // compute the non FTS key elements for ( unsigned i = 0; i < spec.numExtraBefore(); i++ ) { BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); if ( e.eoo() ) e = nullElt; uassert( 16675, "cannot have a multi-key as a prefix to a text index", e.type() != Array ); extrasBefore.push_back(e); extraSize += e.size(); } for ( unsigned i = 0; i < spec.numExtraAfter(); i++ ) { BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); if ( e.eoo() ) e = nullElt; extrasAfter.push_back(e); extraSize += e.size(); } TermFrequencyMap term_freqs; spec.scoreDocument( obj, &term_freqs ); // create index keys from raw scores // only 1 per string uassert( 16732, mongoutils::str::stream() << "too many unique keys for a single document to" << " have a text index, max is " << term_freqs.size() << obj["_id"], term_freqs.size() <= 400000 ); long long keyBSONSize = 0; const int MaxKeyBSONSizeMB = 4; for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i ) { const string& term = i->first; double weight = i->second; // guess the total size of the btree entry based on the size of the weight, term tuple int guess = 5 /* bson overhead */ + 10 /* weight */ + 8 /* term overhead */ + /* term size (could be truncated/hashed) */ guessTermSize( term, spec.getTextIndexVersion() ) + extraSize; BSONObjBuilder b(guess); // builds a BSON object with guess length. for ( unsigned k = 0; k < extrasBefore.size(); k++ ) { b.appendAs( extrasBefore[k], "" ); } _appendIndexKey( b, weight, term, spec.getTextIndexVersion() ); for ( unsigned k = 0; k < extrasAfter.size(); k++ ) { b.appendAs( extrasAfter[k], "" ); } BSONObj res = b.obj(); verify( guess >= res.objsize() ); keys->insert( res ); keyBSONSize += res.objsize(); uassert( 16733, mongoutils::str::stream() << "trying to index text where term list is too big, max is " << MaxKeyBSONSizeMB << "mb " << obj["_id"], keyBSONSize <= ( MaxKeyBSONSizeMB * 1024 * 1024 ) ); } } BSONObj FTSIndexFormat::getIndexKey( double weight, const string& term, const BSONObj& indexPrefix, TextIndexVersion textIndexVersion ) { BSONObjBuilder b; BSONObjIterator i( indexPrefix ); while ( i.more() ) { b.appendAs( i.next(), "" ); } _appendIndexKey( b, weight, term, textIndexVersion ); return b.obj(); } void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term, TextIndexVersion textIndexVersion ) { verify( weight >= 0 && weight <= MAX_WEIGHT ); // FTSmaxweight = defined in fts_header // Terms are added to index key verbatim. if ( TEXT_INDEX_VERSION_1 == textIndexVersion ) { b.append( "", term ); b.append( "", weight ); } // See comments at the top of file for termKeyPrefixLength. // Apply hash for text index version 2 to long terms (longer than 32 characters). else { invariant( TEXT_INDEX_VERSION_2 == textIndexVersion ); if ( term.size() <= termKeyPrefixLength ) { b.append( "", term ); } else { union { uint64_t hash[2]; char data[16]; } t; uint32_t seed = 0; MurmurHash3_x64_128( term.data(), term.size(), seed, t.hash ); string keySuffix = mongo::toHexLower( t.data, sizeof( t.data ) ); invariant( termKeySuffixLength == keySuffix.size() ); b.append( "", term.substr( 0, termKeyPrefixLength ) + keySuffix ); } b.append( "", weight ); } } } }