// fts_language.cpp
/**
* Copyright (C) 2013 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/fts/fts_language.h"
#include
#include "mongo/base/init.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/string_map.h"
#include "mongo/util/stringutils.h"
namespace mongo {
namespace fts {
namespace {
/**
* Case-insensitive StringData comparator.
*/
struct LanguageStringCompare {
/** Returns true if lhs < rhs. */
bool operator()( const StringData& lhs, const StringData& rhs ) const {
size_t minSize = std::min( lhs.size(), rhs.size() );
for ( size_t x = 0; x < minSize; x++ ) {
char a = tolower( lhs[x] );
char b = tolower( rhs[x] );
if ( a < b ) {
return true;
}
if ( a > b ) {
return false;
}
}
return lhs.size() < rhs.size();
}
};
// Lookup table from user language string (case-insensitive) to FTSLanguage. Populated
// by initializers in group FTSAllLanguagesRegistered and initializer
// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only.
typedef std::map LanguageMapV2;
LanguageMapV2 languageMapV2;
// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes.
// Case-sensitive by lookup key.
typedef std::map LanguageMapV1;
LanguageMapV1 languageMapV1;
}
MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
MONGO_NO_DEPENDENTS );
//
// Register supported languages' canonical names for TEXT_INDEX_VERSION_2.
//
MONGO_FTS_LANGUAGE_DECLARE( languageNoneV2, "none", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageDanishV2, "danish", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageDutchV2, "dutch", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV2, "english", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV2, "french", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageGermanV2, "german", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageItalianV2, "italian", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageRussianV2, "russian", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2 );
MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2 );
//
// Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full
// names are recognized by the StopWords class (as such, the language string "dan" in
// TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list).
//
MONGO_FTS_LANGUAGE_DECLARE( languageNoneV1, "none", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDaV1, "da", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDanV1, "dan", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDanishV1, "danish", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDeV1, "de", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDeuV1, "deu", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDutV1, "dut", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageDutchV1, "dutch", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageEnV1, "en", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageEngV1, "eng", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV1, "english", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageEsV1, "es", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageEslV1, "esl", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFiV1, "fi", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFinV1, "fin", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFrV1, "fr", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFraV1, "fra", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFreV1, "fre", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV1, "french", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageGerV1, "ger", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageGermanV1, "german", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageHuV1, "hu", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageHunV1, "hun", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageItV1, "it", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageItaV1, "ita", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageItalianV1, "italian", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageNlV1, "nl", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageNldV1, "nld", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageNoV1, "no", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageNorV1, "nor", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languagePorV1, "por", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languagePorterV1, "porter", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languagePtV1, "pt", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRoV1, "ro", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRonV1, "ron", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRuV1, "ru", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRumV1, "rum", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRusV1, "rus", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageRussianV1, "russian", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageSpaV1, "spa", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageSvV1, "sv", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageSweV1, "swe", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageTrV1, "tr", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageTurV1, "tur", TEXT_INDEX_VERSION_1 );
MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1 );
MONGO_INITIALIZER_WITH_PREREQUISITES( FTSRegisterLanguageAliases,
( "FTSAllLanguagesRegistered" ) )
( InitializerContext* context ) {
// Register language aliases for TEXT_INDEX_VERSION_2.
FTSLanguage::registerLanguageAlias( &languageDanishV2, "da", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageDutchV2, "nl", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageEnglishV2, "en", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageFinnishV2, "fi", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageFrenchV2, "fr", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageGermanV2, "de", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageHungarianV2, "hu", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageItalianV2, "it", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageRomanianV2, "ro", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageRussianV2, "ru", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageSpanishV2, "es", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageSwedishV2, "sv", TEXT_INDEX_VERSION_2 );
FTSLanguage::registerLanguageAlias( &languageTurkishV2, "tr", TEXT_INDEX_VERSION_2 );
return Status::OK();
}
// static
void FTSLanguage::registerLanguage( const StringData& languageName,
TextIndexVersion textIndexVersion,
FTSLanguage* language ) {
verify( !languageName.empty() );
language->_canonicalName = languageName.toString();
switch ( textIndexVersion ) {
case TEXT_INDEX_VERSION_2:
verify( languageMapV2.find( languageName ) == languageMapV2.end() );
languageMapV2[ languageName ] = language;
return;
case TEXT_INDEX_VERSION_1:
verify( languageMapV1.find( languageName ) == languageMapV1.end() );
languageMapV1[ languageName ] = language;
return;
}
verify( false );
}
// static
void FTSLanguage::registerLanguageAlias( const FTSLanguage* language,
const StringData& alias,
TextIndexVersion textIndexVersion ) {
switch ( textIndexVersion ) {
case TEXT_INDEX_VERSION_2:
verify( languageMapV2.find( alias ) == languageMapV2.end() );
languageMapV2[ alias ] = language;
return;
case TEXT_INDEX_VERSION_1:
verify( languageMapV1.find( alias ) == languageMapV1.end() );
languageMapV1[ alias ] = language;
return;
}
verify( false );
}
FTSLanguage::FTSLanguage() : _canonicalName() {
}
const std::string& FTSLanguage::str() const {
verify( !_canonicalName.empty() );
return _canonicalName;
}
// static
StatusWithFTSLanguage FTSLanguage::make( const StringData& langName,
TextIndexVersion textIndexVersion ) {
switch ( textIndexVersion ) {
case TEXT_INDEX_VERSION_2: {
LanguageMapV2::const_iterator it = languageMapV2.find( langName );
if ( it == languageMapV2.end() ) {
// TEXT_INDEX_VERSION_2 rejects unrecognized language strings.
Status status = Status( ErrorCodes::BadValue,
mongoutils::str::stream() <<
"unsupported language: \"" << langName <<
"\"" );
return StatusWithFTSLanguage( status );
}
return StatusWithFTSLanguage( it->second );
}
case TEXT_INDEX_VERSION_1: {
LanguageMapV1::const_iterator it = languageMapV1.find( langName );
if ( it == languageMapV1.end() ) {
// TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
return StatusWithFTSLanguage( &languageNoneV1 );
}
return StatusWithFTSLanguage( it->second );
}
}
verify( false );
return StatusWithFTSLanguage( Status::OK() );
}
}
}