/** * Copyright (C) 2015 MongoDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the GNU Affero General Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file.
*/ #include "mongo/platform/basic.h" #include "mongo/db/fts/fts_language.h" #include "mongo/db/fts/fts_unicode_tokenizer.h" #include "mongo/unittest/unittest.h" namespace mongo { namespace fts { std::vector tokenizeString(const char* str, const char* language, FTSTokenizer::Options options) { StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3); ASSERT_OK(swl); UnicodeFTSTokenizer tokenizer(swl.getValue()); tokenizer.reset(str, options); std::vector terms; while (tokenizer.moveNext()) { terms.push_back(tokenizer.get().toString()); } return terms; } // Ensure punctuation is filtered out of the indexed document and the 's is not separated TEST(FtsUnicodeTokenizer, English) { std::vector terms = tokenizeString("Do you see Mark's dog running?", "english", FTSTokenizer::kNone); ASSERT_EQUALS(6U, terms.size()); ASSERT_EQUALS("do", terms[0]); ASSERT_EQUALS("you", terms[1]); ASSERT_EQUALS("see", terms[2]); ASSERT_EQUALS("mark", terms[3]); ASSERT_EQUALS("dog", terms[4]); ASSERT_EQUALS("run", terms[5]); } // Ensure that the tokenization still works correctly when there are leading and/or trailing // delimiters. TEST(FtsUnicodeTokenizer, EnglishLeadingAndTrailingDelimiters) { std::vector terms = tokenizeString(" , Do you see Mark's dog running? ", "english", FTSTokenizer::kNone); ASSERT_EQUALS(6U, terms.size()); ASSERT_EQUALS("do", terms[0]); ASSERT_EQUALS("you", terms[1]); ASSERT_EQUALS("see", terms[2]); ASSERT_EQUALS("mark", terms[3]); ASSERT_EQUALS("dog", terms[4]); ASSERT_EQUALS("run", terms[5]); } // Ensure that strings containing only delimiters are properly handled. TEST(FtsUnicodeTokenizer, OnlyDelimiters) { std::vector terms = tokenizeString(" ", "english", FTSTokenizer::kNone); ASSERT_EQUALS(0U, terms.size()); } // Ensure punctuation is filtered out of the indexed document and the 'est is separated. TEST(FtsUnicodeTokenizer, FrenchAndNonAsciiPunctuation) { std::vector terms = tokenizeString( "Voyez-vous «le chien» de Mark courante? 
C'est bien!", "french", FTSTokenizer::kNone); ASSERT_EQUALS(10U, terms.size()); ASSERT_EQUALS("voi", terms[0]); ASSERT_EQUALS("vous", terms[1]); ASSERT_EQUALS("le", terms[2]); ASSERT_EQUALS("chien", terms[3]); ASSERT_EQUALS("de", terms[4]); ASSERT_EQUALS("mark", terms[5]); ASSERT_EQUALS("cour", terms[6]); ASSERT_EQUALS("c", terms[7]); ASSERT_EQUALS("est", terms[8]); ASSERT_EQUALS("bien", terms[9]); } // Ensure punctuation is filtered out of the indexed document and the 'est is separated. TEST(FtsUnicodeTokenizer, FrenchDiacriticStemming) { std::vector terms = tokenizeString("parlames, parlates, parlerent, parlâmes, parlâtes, parlèrent", "french", FTSTokenizer::kNone); ASSERT_EQUALS(6U, terms.size()); ASSERT_EQUALS("parlam", terms[0]); ASSERT_EQUALS("parlat", terms[1]); ASSERT_EQUALS("parlerent", terms[2]); ASSERT_EQUALS("parl", terms[3]); ASSERT_EQUALS("parl", terms[4]); ASSERT_EQUALS("parl", terms[5]); } // Ensure punctuation is filtered out of the indexed document and that diacritics are not in the // resulting tokens. TEST(FtsUnicodeTokenizer, Turkish) { std::vector terms = tokenizeString( "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", "turkish", FTSTokenizer::kNone); ASSERT_EQUALS(7U, terms.size()); ASSERT_EQUALS("kac", terms[0]); ASSERT_EQUALS("yas", terms[1]); ASSERT_EQUALS("sen", terms[2]); ASSERT_EQUALS("ve", terms[3]); ASSERT_EQUALS("sen", terms[4]); ASSERT_EQUALS("nere", terms[5]); ASSERT_EQUALS("var", terms[6]); } // Ensure punctuation is filtered out of the indexed document, that diacritics are not in the // resulting tokens, and that the generated tokens are not lowercased. 
TEST(FtsUnicodeTokenizer, TurkishCaseSensitive) {
    std::vector<std::string> terms =
        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
                       "turkish",
                       FTSTokenizer::kGenerateCaseSensitiveTokens);

    // The expected tokens keep the input's upper case; Ç and Ş appear as C and S.
    ASSERT_EQUALS(7U, terms.size());
    ASSERT_EQUALS("KAC", terms[0]);
    ASSERT_EQUALS("YASINDASIN", terms[1]);
    ASSERT_EQUALS("SEN", terms[2]);
    ASSERT_EQUALS("VE", terms[3]);
    ASSERT_EQUALS("SEN", terms[4]);
    ASSERT_EQUALS("NEREDEN", terms[5]);
    ASSERT_EQUALS("VARDIR", terms[6]);
}

// Ensure punctuation is filtered out of the indexed document, that diacritics are in the
// resulting tokens, and that the generated tokens are lowercased.
TEST(FtsUnicodeTokenizer, TurkishDiacriticSensitive) {
    std::vector<std::string> terms =
        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
                       "turkish",
                       FTSTokenizer::kGenerateDiacriticSensitiveTokens);

    ASSERT_EQUALS(7U, terms.size());
    ASSERT_EQUALS("kaç", terms[0]);
    ASSERT_EQUALS("yaş", terms[1]);
    ASSERT_EQUALS("sen", terms[2]);
    ASSERT_EQUALS("ve", terms[3]);
    ASSERT_EQUALS("sen", terms[4]);
    ASSERT_EQUALS("nere", terms[5]);
    ASSERT_EQUALS("var", terms[6]);
}

// Ensure punctuation is filtered out of the indexed document, that diacritics are in the
// resulting tokens, and that the generated tokens are not lowercased.
TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitive) {
    std::vector<std::string> terms =
        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
                       "turkish",
                       FTSTokenizer::kGenerateDiacriticSensitiveTokens |
                           FTSTokenizer::kGenerateCaseSensitiveTokens);

    ASSERT_EQUALS(7U, terms.size());
    ASSERT_EQUALS("KAÇ", terms[0]);
    ASSERT_EQUALS("YAŞINDASIN", terms[1]);
    ASSERT_EQUALS("SEN", terms[2]);
    ASSERT_EQUALS("VE", terms[3]);
    ASSERT_EQUALS("SEN", terms[4]);
    ASSERT_EQUALS("NEREDEN", terms[5]);
    ASSERT_EQUALS("VARDIR", terms[6]);
}

// Ensure punctuation is filtered out of the indexed document, that diacritics are in the
// resulting tokens, that the generated tokens are not lowercased, and that stop words are removed.
TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitiveAndStopWords) { std::vector terms = tokenizeString( "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", "turkish", FTSTokenizer::kGenerateDiacriticSensitiveTokens | FTSTokenizer::kGenerateCaseSensitiveTokens | FTSTokenizer::kFilterStopWords); ASSERT_EQUALS(4U, terms.size()); ASSERT_EQUALS("KAÇ", terms[0]); ASSERT_EQUALS("YAŞINDASIN", terms[1]); ASSERT_EQUALS("NEREDEN", terms[2]); ASSERT_EQUALS("VARDIR", terms[3]); } // Ensure that stop words are only removed if they contain the correct diacritics. TEST(FtsUnicodeTokenizer, FrenchStopWords) { std::vector terms = tokenizeString("Je ne vais pas etre énervé. Je vais être excité.", "french", FTSTokenizer::kFilterStopWords); ASSERT_EQUALS(5U, terms.size()); ASSERT_EQUALS("vais", terms[0]); ASSERT_EQUALS("etre", terms[1]); ASSERT_EQUALS("enerv", terms[2]); ASSERT_EQUALS("vais", terms[3]); ASSERT_EQUALS("excit", terms[4]); } // Ensure that stop words are only removed if they contain the correct diacritics. TEST(FtsUnicodeTokenizer, FrenchStopWordsAndDiacriticSensitive) { std::vector terms = tokenizeString( "Je ne vais pas etre énervé. Je vais être excité.", "french", FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateDiacriticSensitiveTokens); ASSERT_EQUALS(5U, terms.size()); ASSERT_EQUALS("vais", terms[0]); ASSERT_EQUALS("etre", terms[1]); ASSERT_EQUALS("énerv", terms[2]); ASSERT_EQUALS("vais", terms[3]); ASSERT_EQUALS("excit", terms[4]); } } // namespace fts } // namespace mongo