summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Chelminski <adam.chelminski@mongodb.com>2015-07-15 10:14:09 -0400
committerAdam Chelminski <adam.chelminski@mongodb.com>2015-07-29 16:59:04 -0400
commit852258ef9ec37856e22a8e1089507899a2396b00 (patch)
tree624155f8a779b29109f92eaec4982c94e9303e59
parentf1bdf1bb55f93c63297f36ef22ed40d6d84872c9 (diff)
downloadmongo-852258ef9ec37856e22a8e1089507899a2396b00.tar.gz
SERVER-19421 Add abstractions for phrase matching in FTS
-rw-r--r--src/mongo/db/fts/SConscript4
-rw-r--r--src/mongo/db/fts/fts_basic_phrase_matcher.cpp49
-rw-r--r--src/mongo/db/fts/fts_basic_phrase_matcher.h53
-rw-r--r--src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp77
-rw-r--r--src/mongo/db/fts/fts_language.cpp5
-rw-r--r--src/mongo/db/fts/fts_language.h13
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp20
-rw-r--r--src/mongo/db/fts/fts_phrase_matcher.h65
-rw-r--r--src/mongo/db/fts/fts_query.cpp16
-rw-r--r--src/mongo/db/fts/fts_query.h5
-rw-r--r--src/mongo/db/fts/fts_tokenizer.h6
11 files changed, 280 insertions, 33 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 5c9c28dcd05..25f3b467c57 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -34,6 +34,7 @@ baseEnv.Library('base', [
'fts_spec.cpp',
'fts_spec_legacy.cpp',
'fts_language.cpp',
+ 'fts_basic_phrase_matcher.cpp',
'fts_basic_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
@@ -93,3 +94,6 @@ env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp",
LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp",
+ LIBDEPS=["base"] )
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
new file mode 100644
index 00000000000..e32174d3b15
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_basic_phrase_matcher.h"
+
+#include "mongo/platform/strcasestr.h"
+
+namespace mongo {
+namespace fts {
+
+using std::string;
+
+bool BasicFTSPhraseMatcher::phraseMatches(const string& phrase,
+ const string& haystack,
+ PhraseMatcherOptions options) const {
+ if (options & kCaseSensitive) {
+ return haystack.find(phrase) != string::npos;
+ }
+
+ return strcasestr(haystack.c_str(), phrase.c_str()) != NULL;
+}
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.h b/src/mongo/db/fts/fts_basic_phrase_matcher.h
new file mode 100644
index 00000000000..c595b9c3bf9
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/db/fts/fts_phrase_matcher.h"
+
+namespace mongo {
+namespace fts {
+
+/**
+ * A phrase matcher that looks for exact substring matches with optional ASCII-aware case
+ * insensitivity.
+ */
+class BasicFTSPhraseMatcher final : public FTSPhraseMatcher {
+ MONGO_DISALLOW_COPYING(BasicFTSPhraseMatcher);
+
+public:
+ BasicFTSPhraseMatcher() = default;
+
+ bool phraseMatches(const std::string& phrase,
+ const std::string& haystack,
+ PhraseMatcherOptions options) const final;
+};
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
new file mode 100644
index 00000000000..9f193e836ad
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
@@ -0,0 +1,77 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_basic_phrase_matcher.h"
+
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+namespace fts {
+
+// Case insensitive match.
+TEST(FtsBasicPhraseMatcher, CaseInsensitive) {
+ std::string str1 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
+ std::string find1 = "Consectetur adipiscing";
+ std::string nofind1 = "dolor amet";
+
+ std::string str2 = "Duis aute irure dolor in reprehenderit in Voluptate velit esse cillum.";
+ std::string find2 = "In Voluptate";
+ std::string nofind2 = "dolor velit";
+
+ BasicFTSPhraseMatcher phraseMatcher;
+ FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kNone;
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str1, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str2, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str1, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str2, options));
+}
+
+// Case sensitive match.
+TEST(FtsBasicPhraseMatcher, CaseSensitive) {
+ std::string str1 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
+ std::string find1 = "Lorem ipsum";
+ std::string nofind1 = "Sit amet";
+
+ std::string str2 = "Duis aute irure dolor in reprehenderit in Voluptate velit esse cillum.";
+ std::string find2 = "in Voluptate";
+ std::string nofind2 = "Irure dolor";
+
+ BasicFTSPhraseMatcher phraseMatcher;
+ FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kCaseSensitive;
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str1, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str2, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str1, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str2, options));
+}
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index 7a0c64ab1cf..1180cfa17b1 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -33,6 +33,7 @@
#include <string>
#include "mongo/base/init.h"
+#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_basic_tokenizer.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
@@ -85,6 +86,10 @@ std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
return stdx::make_unique<BasicFTSTokenizer>(this);
}
+const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const {
+ return _basicPhraseMatcher;
+}
+
MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS);
//
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index facdb8c9ce0..6c986f5de6e 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -30,6 +30,8 @@
#pragma once
+#include "mongo/db/fts/fts_basic_phrase_matcher.h"
+#include "mongo/db/fts/fts_phrase_matcher.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/base/status_with.h"
@@ -87,6 +89,11 @@ public:
virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
/**
+ * Returns a reference to the phrase matcher instance that this language owns.
+ */
+ virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0;
+
+ /**
* Register std::string 'languageName' as a new language with text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
@@ -133,7 +140,11 @@ typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
class BasicFTSLanguage : public FTSLanguage {
public:
- std::unique_ptr<FTSTokenizer> createTokenizer() const override;
+ std::unique_ptr<FTSTokenizer> createTokenizer() const final;
+ const FTSPhraseMatcher& getPhraseMatcher() const final;
+
+private:
+ BasicFTSPhraseMatcher _basicPhraseMatcher;
};
extern BasicFTSLanguage languagePorterV1;
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 544ef93cf36..52c67e337e9 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -31,9 +31,9 @@
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/db/fts/fts_phrase_matcher.h"
#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_element_iterator.h"
-#include "mongo/platform/strcasestr.h"
namespace mongo {
@@ -41,17 +41,6 @@ namespace fts {
using std::string;
-/**
- * Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if
- * 'caseSensitive' is false; otherwise, an exact substring match is performed.
- */
-static bool phraseMatches(const string& phrase, const string& haystack, bool caseSensitive) {
- if (caseSensitive) {
- return haystack.find(phrase) != string::npos;
- }
- return strcasestr(haystack.c_str(), phrase.c_str()) != NULL;
-}
-
FTSMatcher::FTSMatcher(const FTSQuery& query, const FTSSpec& spec) : _query(query), _spec(spec) {}
bool FTSMatcher::matches(const BSONObj& obj) const {
@@ -163,7 +152,12 @@ bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const {
while (it.more()) {
FTSIteratorValue val = it.next();
- if (phraseMatches(phrase, val._text, _query.getCaseSensitive())) {
+
+ if (val._language->getPhraseMatcher().phraseMatches(phrase,
+ val._text,
+ _query.getCaseSensitive()
+ ? FTSPhraseMatcher::kCaseSensitive
+ : FTSPhraseMatcher::kNone)) {
return true;
}
}
diff --git a/src/mongo/db/fts/fts_phrase_matcher.h b/src/mongo/db/fts/fts_phrase_matcher.h
new file mode 100644
index 00000000000..ae7b8c8a9f9
--- /dev/null
+++ b/src/mongo/db/fts/fts_phrase_matcher.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+namespace mongo {
+namespace fts {
+
+/**
+ * An interface for substring matching routines.
+ */
+class FTSPhraseMatcher {
+public:
+ virtual ~FTSPhraseMatcher() = default;
+
+ using PhraseMatcherOptions = uint8_t;
+
+ /**
+ * Use no options.
+ */
+ static const int kNone = 0;
+
+ /**
+ * Match case-sensitively; when this option is unset, matching ignores case.
+ */
+ static const int kCaseSensitive = 1 << 0;
+
+ /**
+ * Does the string 'phrase' occur in the string 'haystack'?
+ */
+ virtual bool phraseMatches(const std::string& phrase,
+ const std::string& haystack,
+ PhraseMatcherOptions options) const = 0;
+};
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 8dec8e29204..4fa332266a0 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -106,10 +106,11 @@ Status FTSQuery::parse(const string& query,
unsigned phraseStart = quoteOffset + 1;
unsigned phraseLength = t.offset - phraseStart;
StringData phrase = StringData(query).substr(phraseStart, phraseLength);
- if (inNegation)
- _negatedPhrases.push_back(normalizeString(phrase));
- else
- _positivePhrases.push_back(normalizeString(phrase));
+ if (inNegation) {
+ _negatedPhrases.push_back(phrase.toString());
+ } else {
+ _positivePhrases.push_back(phrase.toString());
+ }
inNegation = false;
inPhrase = false;
} else {
@@ -170,13 +171,6 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n
}
}
-string FTSQuery::normalizeString(StringData str) const {
- if (_caseSensitive) {
- return str.toString();
- }
- return tolowerString(str);
-}
-
namespace {
void _debugHelp(stringstream& ss, const set<string>& s, const string& sep) {
bool first = true;
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index 10e0cd2faaf..cac73425ffb 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -84,11 +84,6 @@ public:
BSONObj toBSON() const;
- /**
- * Lowercases "str" if _caseSensitive is set, else returns a copy of "str" unchanged.
- */
- std::string normalizeString(StringData str) const;
-
static const bool caseSensitiveDefault;
private:
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
index 40cdbde2cb8..4c99506eae2 100644
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -40,9 +40,9 @@ class StopWords;
/**
* FTSTokenizer
- * A iterator of "documents" where a document contains space delimited words.
- * For each word returns a stem or lemma version of a word optimized for full text indexing.
- * Supports various options to control how tokens are generated.
+ * An iterator of "documents" where a document contains space delimited words. For each word,
+ * returns a stem or lemma version of the word optimized for full text indexing. Supports various
+ * options to control how tokens are generated.
*/
class FTSTokenizer {
public: