// fts_query_impl.cpp /** * Copyright (C) 2012 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the GNU Affero General Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/platform/basic.h" #include "mongo/db/fts/fts_query_impl.h" #include "mongo/db/fts/fts_query_parser.h" #include "mongo/db/fts/fts_spec.h" #include "mongo/db/fts/fts_tokenizer.h" #include "mongo/stdx/memory.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" namespace mongo { namespace fts { using namespace mongoutils; using std::set; using std::string; using std::stringstream; using std::vector; Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) { StatusWithFTSLanguage ftsLanguage = FTSLanguage::make(getLanguage(), textIndexVersion); if (!ftsLanguage.getStatus().isOK()) { return ftsLanguage.getStatus(); } // Build a space delimited list of words to have the FtsTokenizer tokenize string positiveTermSentence; string negativeTermSentence; bool inNegation = false; bool inPhrase = false; unsigned quoteOffset = 0; FTSQueryParser i(getQuery()); while (i.more()) { QueryToken t = i.next(); if (t.type == QueryToken::TEXT) { string s = t.data.toString(); if (inPhrase && inNegation) { // don't add term } else { // A negation should only continue until the next whitespace character. For example, // "-foo" should negate "foo", "- foo" should not negate "foo", and "-foo-bar" // should negate both "foo" and "bar". if (inNegation && t.previousWhiteSpace) { inNegation = false; } if (inNegation) { negativeTermSentence.append(s); negativeTermSentence.push_back(' '); } else { positiveTermSentence.append(s); positiveTermSentence.push_back(' '); } } } else if (t.type == QueryToken::DELIMITER) { char c = t.data[0]; if (c == '-') { if (!inPhrase && t.previousWhiteSpace) { // phrases can be negated, and terms not in phrases can be negated. // terms in phrases can not be negated. inNegation = true; } } else if (c == '"') { if (inPhrase) { // end of a phrase unsigned phraseStart = quoteOffset + 1; unsigned phraseLength = t.offset - phraseStart; StringData phrase = StringData(getQuery()).substr(phraseStart, phraseLength); if (inNegation) { _negatedPhrases.push_back(phrase.toString()); } else { _positivePhrases.push_back(phrase.toString()); } // Do not reset 'inNegation' here, since a negation should continue until the // next whitespace character. For example, '-"foo bar"-"baz quux"' should negate // both the phrase "foo bar" and the phrase "baz quux". inPhrase = false; } else { // start of a phrase inPhrase = true; // A "-" should only be treated as a negation if there is no whitespace between // the "-" and the start of the phrase. if (inNegation && t.previousWhiteSpace) { inNegation = false; } quoteOffset = t.offset; } } } else { invariant(false); } } std::unique_ptr tokenizer(ftsLanguage.getValue()->createTokenizer()); _addTerms(tokenizer.get(), positiveTermSentence, false); _addTerms(tokenizer.get(), negativeTermSentence, true); return Status::OK(); } std::unique_ptr FTSQueryImpl::clone() const { auto clonedQuery = stdx::make_unique(); clonedQuery->setQuery(getQuery()); clonedQuery->setLanguage(getLanguage()); clonedQuery->setCaseSensitive(getCaseSensitive()); clonedQuery->setDiacriticSensitive(getDiacriticSensitive()); clonedQuery->_positiveTerms = _positiveTerms; clonedQuery->_negatedTerms = _negatedTerms; clonedQuery->_positivePhrases = _positivePhrases; clonedQuery->_negatedPhrases = _negatedPhrases; clonedQuery->_termsForBounds = _termsForBounds; return std::move(clonedQuery); } void FTSQueryImpl::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool negated) { tokenizer->reset(sentence.c_str(), FTSTokenizer::kFilterStopWords); auto& activeTerms = negated ? _negatedTerms : _positiveTerms; // First, get all the terms for indexing, ie, lower cased words // If we are case-insensitive, we can also used this for positive, and negative terms // Some terms may be expanded into multiple words in some non-English languages while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); if (!negated) { _termsForBounds.insert(word); } // Compute the string corresponding to 'token' that will be used for the matcher. // For case and diacritic insensitive queries, this is the same string as 'boundsTerm' // computed above. if (!getCaseSensitive() && !getDiacriticSensitive()) { activeTerms.insert(word); } } if (!getCaseSensitive() && !getDiacriticSensitive()) { return; } FTSTokenizer::Options newOptions = FTSTokenizer::kFilterStopWords; if (getCaseSensitive()) { newOptions |= FTSTokenizer::kGenerateCaseSensitiveTokens; } if (getDiacriticSensitive()) { newOptions |= FTSTokenizer::kGenerateDiacriticSensitiveTokens; } tokenizer->reset(sentence.c_str(), newOptions); // If we want case-sensitivity or diacritic sensitivity, get the correct token. while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); activeTerms.insert(word); } } BSONObj FTSQueryImpl::toBSON() const { BSONObjBuilder bob; bob.append("terms", getPositiveTerms()); bob.append("negatedTerms", getNegatedTerms()); bob.append("phrases", getPositivePhr()); bob.append("negatedPhrases", getNegatedPhr()); return bob.obj(); } } }