/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/fts/fts_spec.h"
#include "mongo/util/mongoutils/str.h"
namespace mongo {
namespace fts {
//
// This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
// text indexes.
//
using namespace mongoutils;
namespace {
void _addFTSStuff( BSONObjBuilder* b ) {
b->append( "_fts", INDEX_NAME );
b->append( "_ftsx", 1 );
}
}
const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const {
BSONElement e = userDoc[_languageOverrideField];
if ( e.type() == String ) {
const char * x = e.valuestrsafe();
if ( strlen( x ) > 0 ) {
StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 );
dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
return *swl.getValue();
}
}
return *_defaultLanguage;
}
void FTSSpec::_scoreStringV1( const Tools& tools,
const StringData& raw,
TermFrequencyMap* docScores,
double weight ) const {
ScoreHelperMap terms;
unsigned numTokens = 0;
Tokenizer i( tools.language, raw );
while ( i.more() ) {
Token t = i.next();
if ( t.type != Token::TEXT )
continue;
string term = t.data.toString();
makeLower( &term );
if ( tools.stopwords->isStopWord( term ) )
continue;
term = tools.stemmer->stem( term );
ScoreHelperStruct& data = terms[term];
if ( data.exp )
data.exp *= 2;
else
data.exp = 1;
data.count += 1;
data.freq += ( 1 / data.exp );
numTokens++;
}
for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {
const string& term = i->first;
const ScoreHelperStruct& data = i->second;
// in order to adjust weights as a function of term count as it
// relates to total field length. ie. is this the only word or
// a frequently occuring term? or does it only show up once in
// a long block of text?
double coeff = ( 0.5 * data.count / numTokens ) + 0.5;
// if term is identical to the raw form of the
// field (untokenized) give it a small boost.
double adjustment = 1;
if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
adjustment += 0.1;
double& score = (*docScores)[term];
score += ( weight * data.freq * coeff * adjustment );
verify( score <= MAX_WEIGHT );
}
}
bool FTSSpec::_weightV1( const StringData& field, double* out ) const {
Weights::const_iterator i = _weights.find( field.toString() );
if ( i == _weights.end() )
return false;
*out = i->second;
return true;
}
/*
* Recurses over all fields of an obj (document in collection)
* and fills term,score map term_freqs
* @param tokenizer, tokenizer to tokenize a string into terms
* @param obj, object being parsed
* term_freqs, map to be filled up
*/
void FTSSpec::_scoreRecurseV1( const Tools& tools,
const BSONObj& obj,
TermFrequencyMap* term_freqs ) const {
BSONObjIterator j( obj );
while ( j.more() ) {
BSONElement x = j.next();
if ( languageOverrideField() == x.fieldName() )
continue;
if (x.type() == String) {
double w = 1;
_weightV1( x.fieldName(), &w );
_scoreStringV1(tools, x.valuestr(), term_freqs, w);
}
else if ( x.isABSONObj() ) {
_scoreRecurseV1( tools, x.Obj(), term_freqs);
}
}
}
void FTSSpec::_scoreDocumentV1( const BSONObj& obj,
TermFrequencyMap* term_freqs ) const {
const FTSLanguage& language = _getLanguageToUseV1( obj );
Stemmer stemmer(language);
Tools tools(language, &stemmer, StopWords::getStopWords( language ));
if ( wildcard() ) {
// if * is specified for weight, we can recurse over all fields.
_scoreRecurseV1(tools, obj, term_freqs);
return;
}
// otherwise, we need to remember the different weights for each field
// and act accordingly (in other words, call _score)
for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) {
const char * leftOverName = i->first.c_str();
// name of field
BSONElement e = obj.getFieldDottedOrArray(leftOverName);
// weight associated to name of field
double weight = i->second;
if ( e.eoo() ) {
// do nothing
}
else if ( e.type() == Array ) {
BSONObjIterator j( e.Obj() );
while ( j.more() ) {
BSONElement x = j.next();
if ( leftOverName[0] && x.isABSONObj() )
x = x.Obj().getFieldDotted( leftOverName );
if ( x.type() == String )
_scoreStringV1( tools, x.valuestr(), term_freqs, weight );
}
}
else if ( e.type() == String ) {
_scoreStringV1( tools, e.valuestr(), term_freqs, weight );
}
}
}
BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) {
map m;
BSONObj keyPattern;
{
BSONObjBuilder b;
bool addedFtsStuff = false;
BSONObjIterator i( spec["key"].Obj() );
while ( i.more() ) {
BSONElement e = i.next();
if ( str::equals( e.fieldName(), "_fts" ) ||
str::equals( e.fieldName(), "_ftsx" ) ) {
addedFtsStuff = true;
b.append( e );
}
else if ( e.type() == String &&
( str::equals( "fts", e.valuestr() ) ||
str::equals( "text", e.valuestr() ) ) ) {
if ( !addedFtsStuff ) {
_addFTSStuff( &b );
addedFtsStuff = true;
}
m[e.fieldName()] = 1;
}
else {
b.append( e );
}
}
if ( !addedFtsStuff )
_addFTSStuff( &b );
keyPattern = b.obj();
}
if ( spec["weights"].isABSONObj() ) {
BSONObjIterator i( spec["weights"].Obj() );
while ( i.more() ) {
BSONElement e = i.next();
m[e.fieldName()] = e.numberInt();
}
}
else if ( spec["weights"].str() == WILDCARD ) {
m[WILDCARD] = 1;
}
BSONObj weights;
{
BSONObjBuilder b;
for ( map::iterator i = m.begin(); i != m.end(); ++i ) {
uassert( 17365, "score for word too high",
i->second > 0 && i->second < MAX_WORD_WEIGHT );
b.append( i->first, i->second );
}
weights = b.obj();
}
string default_language(spec.getStringField("default_language"));
if ( default_language.empty() )
default_language = "english";
string language_override(spec.getStringField("language_override"));
if ( language_override.empty() )
language_override = "language";
int version = -1;
int textIndexVersion = 1;
BSONObjBuilder b;
BSONObjIterator i( spec );
while ( i.more() ) {
BSONElement e = i.next();
if ( str::equals( e.fieldName(), "key" ) ) {
b.append( "key", keyPattern );
}
else if ( str::equals( e.fieldName(), "weights" ) ) {
b.append( "weights", weights );
weights = BSONObj();
}
else if ( str::equals( e.fieldName(), "default_language" ) ) {
b.append( "default_language", default_language);
default_language = "";
}
else if ( str::equals( e.fieldName(), "language_override" ) ) {
b.append( "language_override", language_override);
language_override = "";
}
else if ( str::equals( e.fieldName(), "v" ) ) {
version = e.numberInt();
}
else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
textIndexVersion = e.numberInt();
uassert( 17366,
str::stream() << "bad textIndexVersion: " << textIndexVersion,
textIndexVersion == 1 );
}
else {
b.append( e );
}
}
if ( !weights.isEmpty() )
b.append( "weights", weights );
if ( !default_language.empty() )
b.append( "default_language", default_language);
if ( !language_override.empty() )
b.append( "language_override", language_override);
if ( version >= 0 )
b.append( "v", version );
b.append( "textIndexVersion", textIndexVersion );
return b.obj();
}
}
}