summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts/fts_spec_legacy.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/fts/fts_spec_legacy.cpp')
-rw-r--r--src/mongo/db/fts/fts_spec_legacy.cpp470
1 files changed, 224 insertions, 246 deletions
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index a2dc1dc2489..4a161c8614a 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -33,290 +33,268 @@
namespace mongo {
- namespace fts {
+namespace fts {
- //
- // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
- // text indexes.
- //
+//
+// This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
+// text indexes.
+//
- using std::map;
- using std::string;
- using namespace mongoutils;
+using std::map;
+using std::string;
+using namespace mongoutils;
- namespace {
- void _addFTSStuff( BSONObjBuilder* b ) {
- b->append( "_fts", INDEX_NAME );
- b->append( "_ftsx", 1 );
- }
- }
+namespace {  // file-local helper, not visible outside this translation unit
+void _addFTSStuff(BSONObjBuilder* b) {  // appends the two fixed text-index key entries to *b
+ b->append("_fts", INDEX_NAME);  // "_fts" is written with the INDEX_NAME value
+ b->append("_ftsx", 1);  // "_ftsx" is written with the integer 1
+}
+}  // namespace
- const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const {
- BSONElement e = userDoc[_languageOverrideField];
- if ( e.type() == String ) {
- const char * x = e.valuestrsafe();
- if ( strlen( x ) > 0 ) {
- StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 );
- dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
- return *swl.getValue();
- }
- }
- return *_defaultLanguage;
+const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const {  // picks the language for one document: its override field if usable, else the index default
+ BSONElement e = userDoc[_languageOverrideField];  // look up the per-document override by its configured field name
+ if (e.type() == String) {  // any non-string override value is ignored
+ const char* x = e.valuestrsafe();
+ if (strlen(x) > 0) {  // an empty override string is ignored as well
+ StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1);
+ dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
+ return *swl.getValue();  // NOTE(review): assumes the pointee outlives the local swl -- confirm against FTSLanguage::make
+ }
+ }
+ return *_defaultLanguage;  // fallback: missing, non-string or empty override
+}
- void FTSSpec::_scoreStringV1( const Tools& tools,
- StringData raw,
- TermFrequencyMap* docScores,
- double weight ) const {
-
- ScoreHelperMap terms;
+void FTSSpec::_scoreStringV1(const Tools& tools,  // supplies the language, stemmer and stop-word list used below
+ StringData raw,  // text of a single field value to tokenize and score
+ TermFrequencyMap* docScores,  // out: term -> score; this call adds to existing entries
+ double weight) const {  // multiplier applied to every term found in raw
+ ScoreHelperMap terms;  // per-term exp/count/freq bookkeeping for this string only
- unsigned numTokens = 0;
+ unsigned numTokens = 0;  // counts tokens that pass both the TEXT and the stop-word filter
- Tokenizer i( &tools.language, raw );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT )
- continue;
+ Tokenizer i(&tools.language, raw);
+ while (i.more()) {
+ Token t = i.next();
+ if (t.type != Token::TEXT)  // only TEXT tokens are scored
+ continue;
- string term = tolowerString( t.data );
- if ( tools.stopwords->isStopWord( term ) )
- continue;
- term = tools.stemmer->stem( term );
+ string term = tolowerString(t.data);
+ if (tools.stopwords->isStopWord(term))  // stop-word test is made on the lowercased, not yet stemmed form
+ continue;
+ term = tools.stemmer->stem(term);  // from here on the stem is the map key
- ScoreHelperStruct& data = terms[term];
+ ScoreHelperStruct& data = terms[term];
- if ( data.exp )
- data.exp *= 2;
- else
- data.exp = 1;
- data.count += 1;
- data.freq += ( 1 / data.exp );
+ if (data.exp)  // repeat occurrence: double the divisor used for this term
+ data.exp *= 2;
+ else  // exp is still zero, i.e. first occurrence of this term
+ data.exp = 1;
+ data.count += 1;
+ data.freq += (1 / data.exp);  // NOTE(review): yields 1 + 1/2 + 1/4 ... only if exp is floating point; an integer exp would truncate to 0 after the first hit -- confirm the type
- numTokens++;
- }
+ numTokens++;
+ }
- for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {
+ for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) {  // never entered when no token survived, so numTokens is non-zero inside
+ const string& term = i->first;
+ const ScoreHelperStruct& data = i->second;
- const string& term = i->first;
- const ScoreHelperStruct& data = i->second;
+ // in order to adjust weights as a function of term count as it
+ // relates to total field length. ie. is this the only word or
+ // a frequently occurring term? or does it only show up once in
+ // a long block of text?
- // in order to adjust weights as a function of term count as it
- // relates to total field length. ie. is this the only word or
- // a frequently occuring term? or does it only show up once in
- // a long block of text?
+ double coeff = (0.5 * data.count / numTokens) + 0.5;  // between 0.5 (rare term) and 1.0 (the only term)
- double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
+ // if term is identical to the raw form of the
+ // field (untokenized) give it a small boost.
+ double adjustment = 1;
+ if (raw.size() == term.length() && raw.equalCaseInsensitive(term))
+ adjustment += 0.1;
- // if term is identical to the raw form of the
- // field (untokenized) give it a small boost.
- double adjustment = 1;
- if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
- adjustment += 0.1;
+ double& score = (*docScores)[term];  // accumulates across every string scored into docScores
+ score += (weight * data.freq * coeff * adjustment);
+ verify(score <= MAX_WEIGHT);  // checked against the accumulated score, not just this increment
+ }
+}
- double& score = (*docScores)[term];
- score += ( weight * data.freq * coeff * adjustment );
- verify( score <= MAX_WEIGHT );
- }
- }
+bool FTSSpec::_weightV1(StringData field, double* out) const {  // looks up the weight stored in _weights for field; returns whether one exists
+ Weights::const_iterator i = _weights.find(field.toString());
+ if (i == _weights.end())
+ return false;  // not found: *out is deliberately left untouched so callers can pre-load a default
+ *out = i->second;
+ return true;
+}
- bool FTSSpec::_weightV1( StringData field, double* out ) const {
- Weights::const_iterator i = _weights.find( field.toString() );
- if ( i == _weights.end() )
- return false;
- *out = i->second;
- return true;
+/*
+ * Recurses over all fields of an obj (document in collection)
+ * and fills term,score map term_freqs. The language override field
+ * is skipped, string values are scored with the weight registered
+ * for their own field name (1 when none is registered), and nested
+ * objects are recursed into. Values of any other type are ignored.
+ * @param tools, language/stemmer/stop-word bundle handed on to _scoreStringV1
+ * @param obj, object being parsed
+ * @param term_freqs, map <term,score> to be filled up
+ */
+void FTSSpec::_scoreRecurseV1(const Tools& tools,
+ const BSONObj& obj,
+ TermFrequencyMap* term_freqs) const {
+ BSONObjIterator j(obj);
+ while (j.more()) {
+ BSONElement x = j.next();
+
+ if (languageOverrideField() == x.fieldName())  // the override field selects the language; it is not indexed text
+ continue;
+
+ if (x.type() == String) {
+ double w = 1;  // default weight, kept when _weightV1 finds nothing
+ _weightV1(x.fieldName(), &w);  // return value ignored on purpose; lookup uses the element's own name, not a dotted path
+ _scoreStringV1(tools, x.valuestr(), term_freqs, w);
+ } else if (x.isABSONObj()) {  // presumably true for arrays as well as sub-documents -- confirm against BSONElement
+ _scoreRecurseV1(tools, x.Obj(), term_freqs);
+ }
+ }
+}
- /*
- * Recurses over all fields of an obj (document in collection)
- * and fills term,score map term_freqs
- * @param tokenizer, tokenizer to tokenize a string into terms
- * @param obj, object being parsed
- * term_freqs, map <term,score> to be filled up
- */
- void FTSSpec::_scoreRecurseV1( const Tools& tools,
- const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const {
- BSONObjIterator j( obj );
- while ( j.more() ) {
- BSONElement x = j.next();
+void FTSSpec::_scoreDocumentV1(const BSONObj& obj, TermFrequencyMap* term_freqs) const {  // scores every indexed string of obj into *term_freqs
+ const FTSLanguage& language = _getLanguageToUseV1(obj);  // per-document override or the index default
- if ( languageOverrideField() == x.fieldName() )
- continue;
+ Stemmer stemmer(&language);
+ Tools tools(language, &stemmer, StopWords::getStopWords(&language));  // tools holds a pointer to the local stemmer, so it must not outlive this call
- if (x.type() == String) {
- double w = 1;
- _weightV1( x.fieldName(), &w );
- _scoreStringV1(tools, x.valuestr(), term_freqs, w);
- }
- else if ( x.isABSONObj() ) {
- _scoreRecurseV1( tools, x.Obj(), term_freqs);
- }
+ if (wildcard()) {
+ // if * is specified for weight, we can recurse over all fields.
+ _scoreRecurseV1(tools, obj, term_freqs);
+ return;
+ }
+ // otherwise, we need to remember the different weights for each field
+ // and act accordingly (in other words, call _score)
+ for (Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++) {
+ const char* leftOverName = i->first.c_str();
+ // name of field
+ BSONElement e = obj.getFieldDottedOrArray(leftOverName);  // presumably advances leftOverName past the path consumed before an array was hit -- confirm
+ // weight associated to name of field
+ double weight = i->second;
+
+ if (e.eoo()) {
+ // do nothing
+ } else if (e.type() == Array) {
+ BSONObjIterator j(e.Obj());
+ while (j.more()) {
+ BSONElement x = j.next();
+ if (leftOverName[0] && x.isABSONObj())  // unresolved path remains: resolve it inside each array member
+ x = x.Obj().getFieldDotted(leftOverName);
+ if (x.type() == String)  // non-string array members are skipped
+ _scoreStringV1(tools, x.valuestr(), term_freqs, weight);
+ }
+ } else if (e.type() == String) {
+ _scoreStringV1(tools, e.valuestr(), term_freqs, weight);
+ }
+ }
+}
- void FTSSpec::_scoreDocumentV1( const BSONObj& obj,
- TermFrequencyMap* term_freqs ) const {
-
- const FTSLanguage& language = _getLanguageToUseV1( obj );
-
- Stemmer stemmer(&language);
- Tools tools(language, &stemmer, StopWords::getStopWords( &language ));
-
- if ( wildcard() ) {
- // if * is specified for weight, we can recurse over all fields.
- _scoreRecurseV1(tools, obj, term_freqs);
- return;
- }
-
- // otherwise, we need to remember the different weights for each field
- // and act accordingly (in other words, call _score)
- for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) {
- const char * leftOverName = i->first.c_str();
- // name of field
- BSONElement e = obj.getFieldDottedOrArray(leftOverName);
- // weight associated to name of field
- double weight = i->second;
-
- if ( e.eoo() ) {
- // do nothing
- }
- else if ( e.type() == Array ) {
- BSONObjIterator j( e.Obj() );
- while ( j.more() ) {
- BSONElement x = j.next();
- if ( leftOverName[0] && x.isABSONObj() )
- x = x.Obj().getFieldDotted( leftOverName );
- if ( x.type() == String )
- _scoreStringV1( tools, x.valuestr(), term_freqs, weight );
- }
- }
- else if ( e.type() == String ) {
- _scoreStringV1( tools, e.valuestr(), term_freqs, weight );
+BSONObj FTSSpec::_fixSpecV1(const BSONObj& spec) {  // returns a rewritten copy of a version-1 text index spec with key, weights and language options filled in
+ map<string, int> m;  // field name -> integer weight, gathered from the key pattern and from "weights"
+
+ BSONObj keyPattern;
+ {
+ BSONObjBuilder b;
+ bool addedFtsStuff = false;  // true once the _fts/_ftsx entries are present in b
+
+ BSONObjIterator i(spec["key"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "_fts") || str::equals(e.fieldName(), "_ftsx")) {  // already-rewritten key entries are copied as they are
+ addedFtsStuff = true;
+ b.append(e);
+ } else if (e.type() == String &&
+ (str::equals("fts", e.valuestr()) || str::equals("text", e.valuestr()))) {  // user entry of the form {field: "text"} or {field: "fts"}
+ if (!addedFtsStuff) {  // the pair is inserted once, at the position of the first text field
+ _addFTSStuff(&b);
+ addedFtsStuff = true;
+ }
+ m[e.fieldName()] = 1;  // the field leaves the key pattern and gets weight 1 for now
+ } else {
+ b.append(e);  // non-text key entries are kept unchanged
+ }
+ }
- BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) {
- map<string,int> m;
-
- BSONObj keyPattern;
- {
- BSONObjBuilder b;
- bool addedFtsStuff = false;
-
- BSONObjIterator i( spec["key"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "_fts" ) ||
- str::equals( e.fieldName(), "_ftsx" ) ) {
- addedFtsStuff = true;
- b.append( e );
- }
- else if ( e.type() == String &&
- ( str::equals( "fts", e.valuestr() ) ||
- str::equals( "text", e.valuestr() ) ) ) {
-
- if ( !addedFtsStuff ) {
- _addFTSStuff( &b );
- addedFtsStuff = true;
- }
-
- m[e.fieldName()] = 1;
- }
- else {
- b.append( e );
- }
- }
-
- if ( !addedFtsStuff )
- _addFTSStuff( &b );
-
- keyPattern = b.obj();
- }
-
- if ( spec["weights"].isABSONObj() ) {
- BSONObjIterator i( spec["weights"].Obj() );
- while ( i.more() ) {
- BSONElement e = i.next();
- m[e.fieldName()] = e.numberInt();
- }
- }
- else if ( spec["weights"].str() == WILDCARD ) {
- m[WILDCARD] = 1;
- }
-
- BSONObj weights;
- {
- BSONObjBuilder b;
- for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) {
- uassert( 17365, "score for word too high",
- i->second > 0 && i->second < MAX_WORD_WEIGHT );
- b.append( i->first, i->second );
- }
- weights = b.obj();
- }
+ if (!addedFtsStuff)  // no text field and no _fts/_ftsx entry seen: append the pair at the end
+ _addFTSStuff(&b);
- string default_language(spec.getStringField("default_language"));
- if ( default_language.empty() )
- default_language = "english";
+ keyPattern = b.obj();
+ }
- string language_override(spec.getStringField("language_override"));
- if ( language_override.empty() )
- language_override = "language";
+ if (spec["weights"].isABSONObj()) {
+ BSONObjIterator i(spec["weights"].Obj());
+ while (i.more()) {
+ BSONElement e = i.next();
+ m[e.fieldName()] = e.numberInt();  // explicit weights replace the 1 assigned above and may add new fields
+ }
+ } else if (spec["weights"].str() == WILDCARD) {  // "weights" given as the wildcard string instead of an object
+ m[WILDCARD] = 1;
+ }
- int version = -1;
- int textIndexVersion = 1;
+ BSONObj weights;
+ {
+ BSONObjBuilder b;
+ for (map<string, int>::iterator i = m.begin(); i != m.end(); ++i) {
+ uassert(17365, "score for word too high", i->second > 0 && i->second < MAX_WORD_WEIGHT);  // also rejects weights <= 0, although the message only says "too high"
+ b.append(i->first, i->second);
+ }
+ weights = b.obj();
+ }
- BSONObjBuilder b;
- BSONObjIterator i( spec );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName(), "key" ) ) {
- b.append( "key", keyPattern );
- }
- else if ( str::equals( e.fieldName(), "weights" ) ) {
- b.append( "weights", weights );
- weights = BSONObj();
- }
- else if ( str::equals( e.fieldName(), "default_language" ) ) {
- b.append( "default_language", default_language);
- default_language = "";
- }
- else if ( str::equals( e.fieldName(), "language_override" ) ) {
- b.append( "language_override", language_override);
- language_override = "";
- }
- else if ( str::equals( e.fieldName(), "v" ) ) {
- version = e.numberInt();
- }
- else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
- textIndexVersion = e.numberInt();
- uassert( 17366,
- str::stream() << "bad textIndexVersion: " << textIndexVersion,
- textIndexVersion == 1 );
- }
- else {
- b.append( e );
- }
- }
+ string default_language(spec.getStringField("default_language"));
+ if (default_language.empty())  // absent or empty option falls back to "english"
+ default_language = "english";
+
+ string language_override(spec.getStringField("language_override"));
+ if (language_override.empty())  // absent or empty option falls back to "language"
+ language_override = "language";
+
+ int version = -1;  // -1 means the spec carried no "v" field
+ int textIndexVersion = 1;
+
+ BSONObjBuilder b;
+ BSONObjIterator i(spec);
+ while (i.more()) {  // rebuild the spec, swapping in the computed values at the positions of the originals
+ BSONElement e = i.next();
+ if (str::equals(e.fieldName(), "key")) {
+ b.append("key", keyPattern);
+ } else if (str::equals(e.fieldName(), "weights")) {
+ b.append("weights", weights);
+ weights = BSONObj();  // emptied so the tail below does not append it a second time
+ } else if (str::equals(e.fieldName(), "default_language")) {
+ b.append("default_language", default_language);
+ default_language = "";  // emptied for the same reason
+ } else if (str::equals(e.fieldName(), "language_override")) {
+ b.append("language_override", language_override);
+ language_override = "";  // emptied for the same reason
+ } else if (str::equals(e.fieldName(), "v")) {
+ version = e.numberInt();  // held back and re-appended after the loop
+ } else if (str::equals(e.fieldName(), "textIndexVersion")) {
+ textIndexVersion = e.numberInt();
+ uassert(17366,
+ str::stream() << "bad textIndexVersion: " << textIndexVersion,
+ textIndexVersion == 1);  // this legacy path accepts version 1 only
+ } else {
+ b.append(e);  // every other option passes through untouched
+ }
+ }
- if ( !weights.isEmpty() )
- b.append( "weights", weights );
- if ( !default_language.empty() )
- b.append( "default_language", default_language);
- if ( !language_override.empty() )
- b.append( "language_override", language_override);
+ if (!weights.isEmpty())  // the spec had no "weights" field of its own: add the computed one
+ b.append("weights", weights);
+ if (!default_language.empty())
+ b.append("default_language", default_language);
+ if (!language_override.empty())
+ b.append("language_override", language_override);
- if ( version >= 0 )
- b.append( "v", version );
+ if (version >= 0)  // a missing (or negative) "v" is not written back
+ b.append("v", version);
- b.append( "textIndexVersion", textIndexVersion );
+ b.append("textIndexVersion", textIndexVersion);  // always written, 1 when the spec did not name a version
- return b.obj();
- }
- }
+ return b.obj();
+}
+}
}