diff options
author | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-06-20 00:22:50 -0400 |
---|---|---|
committer | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-06-20 10:56:02 -0400 |
commit | 9c2ed42daa8fbbef4a919c21ec564e2db55e8d60 (patch) | |
tree | 3814f79c10d7b490948d8cb7b112ac1dd41ceff1 /src/mongo/db/fts/fts_spec_legacy.cpp | |
parent | 01965cf52bce6976637ecb8f4a622aeb05ab256a (diff) | |
download | mongo-9c2ed42daa8fbbef4a919c21ec564e2db55e8d60.tar.gz |
SERVER-18579: Clang-Format - reformat code, no comment reflow
Diffstat (limited to 'src/mongo/db/fts/fts_spec_legacy.cpp')
-rw-r--r-- | src/mongo/db/fts/fts_spec_legacy.cpp | 470 |
1 files changed, 224 insertions, 246 deletions
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index a2dc1dc2489..4a161c8614a 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -33,290 +33,268 @@ namespace mongo { - namespace fts { +namespace fts { - // - // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 - // text indexes. - // +// +// This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 +// text indexes. +// - using std::map; - using std::string; - using namespace mongoutils; +using std::map; +using std::string; +using namespace mongoutils; - namespace { - void _addFTSStuff( BSONObjBuilder* b ) { - b->append( "_fts", INDEX_NAME ); - b->append( "_ftsx", 1 ); - } - } +namespace { +void _addFTSStuff(BSONObjBuilder* b) { + b->append("_fts", INDEX_NAME); + b->append("_ftsx", 1); +} +} - const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const { - BSONElement e = userDoc[_languageOverrideField]; - if ( e.type() == String ) { - const char * x = e.valuestrsafe(); - if ( strlen( x ) > 0 ) { - StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 ); - dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. - return *swl.getValue(); - } - } - return *_defaultLanguage; +const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const { + BSONElement e = userDoc[_languageOverrideField]; + if (e.type() == String) { + const char* x = e.valuestrsafe(); + if (strlen(x) > 0) { + StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1); + dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. + return *swl.getValue(); } + } + return *_defaultLanguage; +} - void FTSSpec::_scoreStringV1( const Tools& tools, - StringData raw, - TermFrequencyMap* docScores, - double weight ) const { - - ScoreHelperMap terms; +void FTSSpec::_scoreStringV1(const Tools& tools, + StringData raw, + TermFrequencyMap* docScores, + double weight) const { + ScoreHelperMap terms; - unsigned numTokens = 0; + unsigned numTokens = 0; - Tokenizer i( &tools.language, raw ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) - continue; + Tokenizer i(&tools.language, raw); + while (i.more()) { + Token t = i.next(); + if (t.type != Token::TEXT) + continue; - string term = tolowerString( t.data ); - if ( tools.stopwords->isStopWord( term ) ) - continue; - term = tools.stemmer->stem( term ); + string term = tolowerString(t.data); + if (tools.stopwords->isStopWord(term)) + continue; + term = tools.stemmer->stem(term); - ScoreHelperStruct& data = terms[term]; + ScoreHelperStruct& data = terms[term]; - if ( data.exp ) - data.exp *= 2; - else - data.exp = 1; - data.count += 1; - data.freq += ( 1 / data.exp ); + if (data.exp) + data.exp *= 2; + else + data.exp = 1; + data.count += 1; + data.freq += (1 / data.exp); - numTokens++; - } + numTokens++; + } - for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { + const string& term = i->first; + const ScoreHelperStruct& data = i->second; - const string& term = i->first; - const ScoreHelperStruct& data = i->second; + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? - // in order to adjust weights as a function of term count as it - // relates to total field length. ie. is this the only word or - // a frequently occuring term? or does it only show up once in - // a long block of text? + double coeff = (0.5 * data.count / numTokens) + 0.5; - double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. + double adjustment = 1; + if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) + adjustment += 0.1; - // if term is identical to the raw form of the - // field (untokenized) give it a small boost. - double adjustment = 1; - if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) - adjustment += 0.1; + double& score = (*docScores)[term]; + score += (weight * data.freq * coeff * adjustment); + verify(score <= MAX_WEIGHT); + } +} - double& score = (*docScores)[term]; - score += ( weight * data.freq * coeff * adjustment ); - verify( score <= MAX_WEIGHT ); - } - } +bool FTSSpec::_weightV1(StringData field, double* out) const { + Weights::const_iterator i = _weights.find(field.toString()); + if (i == _weights.end()) + return false; + *out = i->second; + return true; +} - bool FTSSpec::_weightV1( StringData field, double* out ) const { - Weights::const_iterator i = _weights.find( field.toString() ); - if ( i == _weights.end() ) - return false; - *out = i->second; - return true; +/* + * Recurses over all fields of an obj (document in collection) + * and fills term,score map term_freqs + * @param tokenizer, tokenizer to tokenize a string into terms + * @param obj, object being parsed + * term_freqs, map <term,score> to be filled up + */ +void FTSSpec::_scoreRecurseV1(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs) const { + BSONObjIterator j(obj); + while (j.more()) { + BSONElement x = j.next(); + + if (languageOverrideField() == x.fieldName()) + continue; + + if (x.type() == String) { + double w = 1; + _weightV1(x.fieldName(), &w); + _scoreStringV1(tools, x.valuestr(), term_freqs, w); + } else if (x.isABSONObj()) { + _scoreRecurseV1(tools, x.Obj(), term_freqs); } + } +} - /* - * Recurses over all fields of an obj (document in collection) - * and fills term,score map term_freqs - * @param tokenizer, tokenizer to tokenize a string into terms - * @param obj, object being parsed - * term_freqs, map <term,score> to be filled up - */ - void FTSSpec::_scoreRecurseV1( const Tools& tools, - const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { - BSONObjIterator j( obj ); - while ( j.more() ) { - BSONElement x = j.next(); +void FTSSpec::_scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const { + const FTSLanguage& language = _getLanguageToUseV1(obj); - if ( languageOverrideField() == x.fieldName() ) - continue; + Stemmer stemmer(&language); + Tools tools(language, &stemmer, StopWords::getStopWords(&language)); - if (x.type() == String) { - double w = 1; - _weightV1( x.fieldName(), &w ); - _scoreStringV1(tools, x.valuestr(), term_freqs, w); - } - else if ( x.isABSONObj() ) { - _scoreRecurseV1( tools, x.Obj(), term_freqs); - } + if (wildcard()) { + // if * is specified for weight, we can recurse over all fields. + _scoreRecurseV1(tools, obj, term_freqs); + return; + } + // otherwise, we need to remember the different weights for each field + // and act accordingly (in other words, call _score) + for (Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++) { + const char* leftOverName = i->first.c_str(); + // name of field + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + // weight associated to name of field + double weight = i->second; + + if (e.eoo()) { + // do nothing + } else if (e.type() == Array) { + BSONObjIterator j(e.Obj()); + while (j.more()) { + BSONElement x = j.next(); + if (leftOverName[0] && x.isABSONObj()) + x = x.Obj().getFieldDotted(leftOverName); + if (x.type() == String) + _scoreStringV1(tools, x.valuestr(), term_freqs, weight); } + } else if (e.type() == String) { + _scoreStringV1(tools, e.valuestr(), term_freqs, weight); } + } +} - void FTSSpec::_scoreDocumentV1( const BSONObj& obj, - TermFrequencyMap* term_freqs ) const { - - const FTSLanguage& language = _getLanguageToUseV1( obj ); - - Stemmer stemmer(&language); - Tools tools(language, &stemmer, StopWords::getStopWords( &language )); - - if ( wildcard() ) { - // if * is specified for weight, we can recurse over all fields. - _scoreRecurseV1(tools, obj, term_freqs); - return; - } - - // otherwise, we need to remember the different weights for each field - // and act accordingly (in other words, call _score) - for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { - const char * leftOverName = i->first.c_str(); - // name of field - BSONElement e = obj.getFieldDottedOrArray(leftOverName); - // weight associated to name of field - double weight = i->second; - - if ( e.eoo() ) { - // do nothing - } - else if ( e.type() == Array ) { - BSONObjIterator j( e.Obj() ); - while ( j.more() ) { - BSONElement x = j.next(); - if ( leftOverName[0] && x.isABSONObj() ) - x = x.Obj().getFieldDotted( leftOverName ); - if ( x.type() == String ) - _scoreStringV1( tools, x.valuestr(), term_freqs, weight ); - } - } - else if ( e.type() == String ) { - _scoreStringV1( tools, e.valuestr(), term_freqs, weight ); +BSONObj FTSSpec::_fixSpecV1(const BSONObj& spec) { + map<string, int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; + bool addedFtsStuff = false; + + BSONObjIterator i(spec["key"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) { + addedFtsStuff = true; + b.append(e); + } else if (e.type() == String && + (str::equals("fts", e.valuestr()) || str::equals("text", e.valuestr()))) { + if (!addedFtsStuff) { + _addFTSStuff(&b); + addedFtsStuff = true; } + m[e.fieldName()] = 1; + } else { + b.append(e); } } - BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) { - map<string,int> m; - - BSONObj keyPattern; - { - BSONObjBuilder b; - bool addedFtsStuff = false; - - BSONObjIterator i( spec["key"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "_fts" ) || - str::equals( e.fieldName(), "_ftsx" ) ) { - addedFtsStuff = true; - b.append( e ); - } - else if ( e.type() == String && - ( str::equals( "fts", e.valuestr() ) || - str::equals( "text", e.valuestr() ) ) ) { - - if ( !addedFtsStuff ) { - _addFTSStuff( &b ); - addedFtsStuff = true; - } - - m[e.fieldName()] = 1; - } - else { - b.append( e ); - } - } - - if ( !addedFtsStuff ) - _addFTSStuff( &b ); - - keyPattern = b.obj(); - } - - if ( spec["weights"].isABSONObj() ) { - BSONObjIterator i( spec["weights"].Obj() ); - while ( i.more() ) { - BSONElement e = i.next(); - m[e.fieldName()] = e.numberInt(); - } - } - else if ( spec["weights"].str() == WILDCARD ) { - m[WILDCARD] = 1; - } - - BSONObj weights; - { - BSONObjBuilder b; - for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) { - uassert( 17365, "score for word too high", - i->second > 0 && i->second < MAX_WORD_WEIGHT ); - b.append( i->first, i->second ); - } - weights = b.obj(); - } + if (!addedFtsStuff) + _addFTSStuff(&b); - string default_language(spec.getStringField("default_language")); - if ( default_language.empty() ) - default_language = "english"; + keyPattern = b.obj(); + } - string language_override(spec.getStringField("language_override")); - if ( language_override.empty() ) - language_override = "language"; + if (spec["weights"].isABSONObj()) { + BSONObjIterator i(spec["weights"].Obj()); + while (i.more()) { + BSONElement e = i.next(); + m[e.fieldName()] = e.numberInt(); + } + } else if (spec["weights"].str() == WILDCARD) { + m[WILDCARD] = 1; + } - int version = -1; - int textIndexVersion = 1; + BSONObj weights; + { + BSONObjBuilder b; + for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) { + uassert(17365, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT); + b.append(i->first, i->second); + } + weights = b.obj(); + } - BSONObjBuilder b; - BSONObjIterator i( spec ); - while ( i.more() ) { - BSONElement e = i.next(); - if ( str::equals( e.fieldName(), "key" ) ) { - b.append( "key", keyPattern ); - } - else if ( str::equals( e.fieldName(), "weights" ) ) { - b.append( "weights", weights ); - weights = BSONObj(); - } - else if ( str::equals( e.fieldName(), "default_language" ) ) { - b.append( "default_language", default_language); - default_language = ""; - } - else if ( str::equals( e.fieldName(), "language_override" ) ) { - b.append( "language_override", language_override); - language_override = ""; - } - else if ( str::equals( e.fieldName(), "v" ) ) { - version = e.numberInt(); - } - else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) { - textIndexVersion = e.numberInt(); - uassert( 17366, - str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == 1 ); - } - else { - b.append( e ); - } - } + string default_language(spec.getStringField("default_language")); + if (default_language.empty()) + default_language = "english"; + + string language_override(spec.getStringField("language_override")); + if (language_override.empty()) + language_override = "language"; + + int version = -1; + int textIndexVersion = 1; + + BSONObjBuilder b; + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + if (str::equals(e.fieldName(), "key")) { + b.append("key", keyPattern); + } else if (str::equals(e.fieldName(), "weights")) { + b.append("weights", weights); + weights = BSONObj(); + } else if (str::equals(e.fieldName(), "default_language")) { + b.append("default_language", default_language); + default_language = ""; + } else if (str::equals(e.fieldName(), "language_override")) { + b.append("language_override", language_override); + language_override = ""; + } else if (str::equals(e.fieldName(), "v")) { + version = e.numberInt(); + } else if (str::equals(e.fieldName(), "textIndexVersion")) { + textIndexVersion = e.numberInt(); + uassert(17366, + str::stream() << "bad textIndexVersion: " << textIndexVersion, + textIndexVersion == 1); + } else { + b.append(e); + } + } - if ( !weights.isEmpty() ) - b.append( "weights", weights ); - if ( !default_language.empty() ) - b.append( "default_language", default_language); - if ( !language_override.empty() ) - b.append( "language_override", language_override); + if (!weights.isEmpty()) + b.append("weights", weights); + if (!default_language.empty()) + b.append("default_language", default_language); + if (!language_override.empty()) + b.append("language_override", language_override); - if ( version >= 0 ) - b.append( "v", version ); + if (version >= 0) + b.append("v", version); - b.append( "textIndexVersion", textIndexVersion ); + b.append("textIndexVersion", textIndexVersion); - return b.obj(); - } - } + return b.obj(); +} +} } |