diff options
author | Adam Chelminski <adam.chelminski@mongodb.com> | 2015-07-15 10:14:09 -0400 |
---|---|---|
committer | Adam Chelminski <adam.chelminski@mongodb.com> | 2015-07-29 16:59:04 -0400 |
commit | 852258ef9ec37856e22a8e1089507899a2396b00 (patch) | |
tree | 624155f8a779b29109f92eaec4982c94e9303e59 /src/mongo/db | |
parent | f1bdf1bb55f93c63297f36ef22ed40d6d84872c9 (diff) | |
download | mongo-852258ef9ec37856e22a8e1089507899a2396b00.tar.gz |
SERVER-19421 Add abstractions for phrase matching in FTS
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/fts/SConscript | 4 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_basic_phrase_matcher.cpp | 49 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_basic_phrase_matcher.h | 53 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp | 77 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.h | 13 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher.cpp | 20 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_phrase_matcher.h | 65 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.h | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_tokenizer.h | 6 |
11 files changed, 280 insertions, 33 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript index 5c9c28dcd05..25f3b467c57 100644 --- a/src/mongo/db/fts/SConscript +++ b/src/mongo/db/fts/SConscript @@ -34,6 +34,7 @@ baseEnv.Library('base', [ 'fts_spec.cpp', 'fts_spec_legacy.cpp', 'fts_language.cpp', + 'fts_basic_phrase_matcher.cpp', 'fts_basic_tokenizer.cpp', 'fts_util.cpp', 'fts_element_iterator.cpp', @@ -93,3 +94,6 @@ env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp", env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp", LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp", + LIBDEPS=["base"] ) diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp new file mode 100644 index 00000000000..e32174d3b15 --- /dev/null +++ b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp @@ -0,0 +1,49 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_basic_phrase_matcher.h" + +#include "mongo/platform/strcasestr.h" + +namespace mongo { +namespace fts { + +using std::string; + +bool BasicFTSPhraseMatcher::phraseMatches(const string& phrase, + const string& haystack, + PhraseMatcherOptions options) const { + if (options & kCaseSensitive) { + return haystack.find(phrase) != string::npos; + } + + return strcasestr(haystack.c_str(), phrase.c_str()) != NULL; +} + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.h b/src/mongo/db/fts/fts_basic_phrase_matcher.h new file mode 100644 index 00000000000..c595b9c3bf9 --- /dev/null +++ b/src/mongo/db/fts/fts_basic_phrase_matcher.h @@ -0,0 +1,53 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/base/disallow_copying.h" +#include "mongo/db/fts/fts_phrase_matcher.h" + +namespace mongo { +namespace fts { + +/** + * A phrase matcher that looks for exact substring matches with optional ASCII-aware case + * insensitivity. + */ +class BasicFTSPhraseMatcher final : public FTSPhraseMatcher { + MONGO_DISALLOW_COPYING(BasicFTSPhraseMatcher); + +public: + BasicFTSPhraseMatcher() = default; + + bool phraseMatches(const std::string& phrase, + const std::string& haystack, + PhraseMatcherOptions options) const final; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp new file mode 100644 index 00000000000..9f193e836ad --- /dev/null +++ b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp @@ -0,0 +1,77 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_basic_phrase_matcher.h" + +#include "mongo/unittest/unittest.h" + +namespace mongo { +namespace fts { + +// Case insensitive match. 
+TEST(FtsBasicPhraseMatcher, CaseInsensitive) { + std::string str1 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + std::string find1 = "Consectetur adipiscing"; + std::string nofind1 = "dolor amet"; + + std::string str2 = "Duis aute irure dolor in reprehenderit in Voluptate velit esse cillum."; + std::string find2 = "In Voluptate"; + std::string nofind2 = "dolor velit"; + + BasicFTSPhraseMatcher phraseMatcher; + FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kNone; + + ASSERT(phraseMatcher.phraseMatches(find1, str1, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str2, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str1, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str2, options)); +} + +// Case sensitive match. +TEST(FtsBasicPhraseMatcher, CaseSensitive) { + std::string str1 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + std::string find1 = "Lorem ipsum"; + std::string nofind1 = "Sit amet"; + + std::string str2 = "Duis aute irure dolor in reprehenderit in Voluptate velit esse cillum."; + std::string find2 = "in Voluptate"; + std::string nofind2 = "Irure dolor"; + + BasicFTSPhraseMatcher phraseMatcher; + FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kCaseSensitive; + + ASSERT(phraseMatcher.phraseMatches(find1, str1, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str2, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str1, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str2, options)); +} + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index 7a0c64ab1cf..1180cfa17b1 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -33,6 +33,7 @@ #include <string> #include "mongo/base/init.h" +#include "mongo/db/fts/fts_basic_phrase_matcher.h" #include "mongo/db/fts/fts_basic_tokenizer.h" #include 
"mongo/stdx/memory.h" #include "mongo/util/assert_util.h" @@ -85,6 +86,10 @@ std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { return stdx::make_unique<BasicFTSTokenizer>(this); } +const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const { + return _basicPhraseMatcher; +} + MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS); // diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index facdb8c9ce0..6c986f5de6e 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -30,6 +30,8 @@ #pragma once +#include "mongo/db/fts/fts_basic_phrase_matcher.h" +#include "mongo/db/fts/fts_phrase_matcher.h" #include "mongo/db/fts/fts_util.h" #include "mongo/base/status_with.h" @@ -87,6 +89,11 @@ public: virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0; /** + * Returns a reference to the phrase matcher instance that this language owns. + */ + virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0; + + /** * Register std::string 'languageName' as a new language with text index version * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. 
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language @@ -133,7 +140,11 @@ typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; class BasicFTSLanguage : public FTSLanguage { public: - std::unique_ptr<FTSTokenizer> createTokenizer() const override; + std::unique_ptr<FTSTokenizer> createTokenizer() const final; + const FTSPhraseMatcher& getPhraseMatcher() const final; + +private: + BasicFTSPhraseMatcher _basicPhraseMatcher; }; extern BasicFTSLanguage languagePorterV1; diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index 544ef93cf36..52c67e337e9 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -31,9 +31,9 @@ #include "mongo/platform/basic.h" #include "mongo/db/fts/fts_matcher.h" +#include "mongo/db/fts/fts_phrase_matcher.h" #include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_element_iterator.h" -#include "mongo/platform/strcasestr.h" namespace mongo { @@ -41,17 +41,6 @@ namespace fts { using std::string; -/** - * Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if - * 'caseSensitive' is false; otherwise, an exact substring match is performed. - */ -static bool phraseMatches(const string& phrase, const string& haystack, bool caseSensitive) { - if (caseSensitive) { - return haystack.find(phrase) != string::npos; - } - return strcasestr(haystack.c_str(), phrase.c_str()) != NULL; -} - FTSMatcher::FTSMatcher(const FTSQuery& query, const FTSSpec& spec) : _query(query), _spec(spec) {} bool FTSMatcher::matches(const BSONObj& obj) const { @@ -163,7 +152,12 @@ bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const { while (it.more()) { FTSIteratorValue val = it.next(); - if (phraseMatches(phrase, val._text, _query.getCaseSensitive())) { + + if (val._language->getPhraseMatcher().phraseMatches(phrase, + val._text, + _query.getCaseSensitive() + ? 
FTSPhraseMatcher::kCaseSensitive + : FTSPhraseMatcher::kNone)) { return true; } } diff --git a/src/mongo/db/fts/fts_phrase_matcher.h b/src/mongo/db/fts/fts_phrase_matcher.h new file mode 100644 index 00000000000..ae7b8c8a9f9 --- /dev/null +++ b/src/mongo/db/fts/fts_phrase_matcher.h @@ -0,0 +1,65 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <cstdint> +#include <string> + +namespace mongo { +namespace fts { + +/** + * An interface for substring matching routines. 
+ */ +class FTSPhraseMatcher { +public: + virtual ~FTSPhraseMatcher() = default; + + using PhraseMatcherOptions = uint8_t; + + /** + * Use no options. + */ + static const int kNone = 0; + + /** + * Match phrases case-sensitively; when this bit is unset, matching is case-insensitive. + */ + static const int kCaseSensitive = 1 << 0; + + /** + * Does the string 'phrase' occur in the string 'haystack'? + */ + virtual bool phraseMatches(const std::string& phrase, + const std::string& haystack, + PhraseMatcherOptions options) const = 0; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index 8dec8e29204..4fa332266a0 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -106,10 +106,11 @@ Status FTSQuery::parse(const string& query, unsigned phraseStart = quoteOffset + 1; unsigned phraseLength = t.offset - phraseStart; StringData phrase = StringData(query).substr(phraseStart, phraseLength); - if (inNegation) - _negatedPhrases.push_back(normalizeString(phrase)); - else - _positivePhrases.push_back(normalizeString(phrase)); + if (inNegation) { + _negatedPhrases.push_back(phrase.toString()); + } else { + _positivePhrases.push_back(phrase.toString()); + } inNegation = false; inPhrase = false; } else { @@ -170,13 +171,6 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n } } -string FTSQuery::normalizeString(StringData str) const { - if (_caseSensitive) { - return str.toString(); - } - return tolowerString(str); -} - namespace { void _debugHelp(stringstream& ss, const set<string>& s, const string& sep) { bool first = true; diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index 10e0cd2faaf..cac73425ffb 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -84,11 +84,6 @@ public: BSONObj toBSON() const; - /** - * Lowercases "str" if _caseSensitive is set, else returns a copy of "str" unchanged.
- */ - std::string normalizeString(StringData str) const; - static const bool caseSensitiveDefault; private: diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h index 40cdbde2cb8..4c99506eae2 100644 --- a/src/mongo/db/fts/fts_tokenizer.h +++ b/src/mongo/db/fts/fts_tokenizer.h @@ -40,9 +40,9 @@ class StopWords; /** * FTSTokenizer - * A iterator of "documents" where a document contains space delimited words. - * For each word returns a stem or lemma version of a word optimized for full text indexing. - * Supports various options to control how tokens are generated. + * An iterator of "documents" where a document contains space delimited words. For each word returns + * a stem or lemma version of a word optimized for full text indexing. Supports various options to + * control how tokens are generated. */ class FTSTokenizer { public: |