Diffstat (limited to 'chromium/chrome/common/string_matching')
21 files changed, 0 insertions, 2338 deletions
diff --git a/chromium/chrome/common/string_matching/BUILD.gn b/chromium/chrome/common/string_matching/BUILD.gn deleted file mode 100644 index 183a9ede09b..00000000000 --- a/chromium/chrome/common/string_matching/BUILD.gn +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2019 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import("//build/config/ui.gni") -import("//chrome/common/features.gni") - -source_set("string_matching") { - sources = [ - "fuzzy_tokenized_string_match.cc", - "fuzzy_tokenized_string_match.h", - "sequence_matcher.cc", - "sequence_matcher.h", - "term_break_iterator.cc", - "term_break_iterator.h", - "tokenized_string.cc", - "tokenized_string.h", - "tokenized_string_char_iterator.cc", - "tokenized_string_char_iterator.h", - "tokenized_string_match.cc", - "tokenized_string_match.h", - ] - - deps = [ - "//base", - "//base:i18n", - "//cc", - ] - - public_deps = [ - "//base", - "//ui/gfx", - ] -} diff --git a/chromium/chrome/common/string_matching/OWNERS b/chromium/chrome/common/string_matching/OWNERS deleted file mode 100644 index cc83b874e21..00000000000 --- a/chromium/chrome/common/string_matching/OWNERS +++ /dev/null @@ -1,2 +0,0 @@ -jiameng@chromium.org -thanhdng@chromium.org diff --git a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc b/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc deleted file mode 100644 index f26a37650cf..00000000000 --- a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/common/string_matching/fuzzy_tokenized_string_match.h" - -#include <algorithm> -#include <cmath> -#include <iterator> - -#include "base/i18n/case_conversion.h" -#include "base/metrics/field_trial_params.h" -#include "base/strings/strcat.h" -#include "base/strings/string_util.h" -#include "base/strings/utf_string_conversions.h" -#include "chrome/common/string_matching/sequence_matcher.h" - -namespace { -constexpr double kMinScore = 0.0; -constexpr double kMaxScore = 1.0; -constexpr double kFirstCharacterMatchPenalty = 0.2; -constexpr double kPrefixMatchPenalty = 0.1; - -// Returns sorted tokens from a TokenizedString. -std::vector<base::string16> ProcessAndSort(const TokenizedString& text) { - std::vector<base::string16> result; - for (const auto& token : text.tokens()) { - result.emplace_back(token); - } - std::sort(result.begin(), result.end()); - return result; -} -} // namespace - -FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() {} -FuzzyTokenizedStringMatch::FuzzyTokenizedStringMatch() {} - -double FuzzyTokenizedStringMatch::FirstCharacterMatch( - const TokenizedString& query, - const TokenizedString& text) { - const base::string16 query_lower = base::i18n::ToLower(query.text()); - size_t query_index = 0; - for (size_t text_index = 0; text_index < text.tokens().size(); text_index++) { - if (query_index < query_lower.size() && - text.tokens()[text_index][0] == query_lower[query_index]) { - query_index++; - if (query_index == query_lower.size()) { - // Penalizes the score using the number of text's tokens that are - // needed. 
- return std::max(kMinScore, - kMaxScore - kFirstCharacterMatchPenalty * - (text_index + 1 - query_lower.size())); - } - } - } - return kMinScore; -} - -double FuzzyTokenizedStringMatch::PrefixMatch(const TokenizedString& query, - const TokenizedString& text) { - const std::vector<base::string16> query_tokens(query.tokens()); - const std::vector<base::string16> text_tokens(text.tokens()); - double match_score = kMaxScore; - int previous_matched_index = -1; - // For every query token, check if it is a prefix of a text token. The newly - // matching text token must have higher index than the previous matched token. - for (const auto& query_token : query_tokens) { - bool matched = false; - for (size_t text_index = previous_matched_index + 1; - text_index < text_tokens.size(); text_index++) { - if (query_token.size() <= text_tokens[text_index].size() && - query_token == - text_tokens[text_index].substr(0, query_token.size())) { - matched = true; - // Penalizes the score based on the number of skipped tokens. - match_score -= - kPrefixMatchPenalty * (text_index - previous_matched_index - 1); - previous_matched_index = text_index; - break; - } - } - if (!matched) { - return kMinScore; - } - } - return std::max(kMinScore, match_score); -} - -double FuzzyTokenizedStringMatch::TokenSetRatio( - const TokenizedString& query, - const TokenizedString& text, - bool partial, - double partial_match_penalty_rate, - bool use_edit_distance) { - std::set<base::string16> query_token(query.tokens().begin(), - query.tokens().end()); - std::set<base::string16> text_token(text.tokens().begin(), - text.tokens().end()); - - std::vector<base::string16> intersection; - std::vector<base::string16> query_diff_text; - std::vector<base::string16> text_diff_query; - - // Find the intersection and the differences between two set of tokens. - std::set_intersection(query_token.begin(), query_token.end(), - text_token.begin(), text_token.end(), - std::back_inserter(intersection)); - std::set_difference(query_token.begin(), query_token.end(), - text_token.begin(), text_token.end(), - std::back_inserter(query_diff_text)); - std::set_difference(text_token.begin(), text_token.end(), query_token.begin(), - query_token.end(), std::back_inserter(text_diff_query)); - - const base::string16 intersection_string = - base::JoinString(intersection, base::UTF8ToUTF16(" ")); - const base::string16 query_rewritten = - intersection.empty() - ? base::JoinString(query_diff_text, base::UTF8ToUTF16(" ")) - : base::StrCat( - {intersection_string, base::UTF8ToUTF16(" "), - base::JoinString(query_diff_text, base::UTF8ToUTF16(" "))}); - const base::string16 text_rewritten = - intersection.empty() - ? 
base::JoinString(text_diff_query, base::UTF8ToUTF16(" ")) - : base::StrCat( - {intersection_string, base::UTF8ToUTF16(" "), - base::JoinString(text_diff_query, base::UTF8ToUTF16(" "))}); - - if (partial) { - return std::max( - {PartialRatio(intersection_string, query_rewritten, - partial_match_penalty_rate, use_edit_distance), - PartialRatio(intersection_string, text_rewritten, - partial_match_penalty_rate, use_edit_distance), - PartialRatio(query_rewritten, text_rewritten, - partial_match_penalty_rate, use_edit_distance)}); - } - - return std::max( - {SequenceMatcher(intersection_string, query_rewritten, use_edit_distance) - .Ratio(), - SequenceMatcher(intersection_string, text_rewritten, use_edit_distance) - .Ratio(), - SequenceMatcher(query_rewritten, text_rewritten, use_edit_distance) - .Ratio()}); -} - -double FuzzyTokenizedStringMatch::TokenSortRatio( - const TokenizedString& query, - const TokenizedString& text, - bool partial, - double partial_match_penalty_rate, - bool use_edit_distance) { - const base::string16 query_sorted = - base::JoinString(ProcessAndSort(query), base::UTF8ToUTF16(" ")); - const base::string16 text_sorted = - base::JoinString(ProcessAndSort(text), base::UTF8ToUTF16(" ")); - - if (partial) { - return PartialRatio(query_sorted, text_sorted, partial_match_penalty_rate, - use_edit_distance); - } - return SequenceMatcher(query_sorted, text_sorted, use_edit_distance).Ratio(); -} - -double FuzzyTokenizedStringMatch::PartialRatio( - const base::string16& query, - const base::string16& text, - double partial_match_penalty_rate, - bool use_edit_distance) { - if (query.empty() || text.empty()) { - return kMinScore; - } - base::string16 shorter = query; - base::string16 longer = text; - - if (shorter.size() > longer.size()) { - shorter = text; - longer = query; - } - - const auto matching_blocks = - SequenceMatcher(shorter, longer, use_edit_distance).GetMatchingBlocks(); - double partial_ratio = 0; - - for (const auto& block : matching_blocks) { - const int long_start = - block.pos_second_string > block.pos_first_string - ? block.pos_second_string - block.pos_first_string - : 0; - - // Penalizes the match if it is not close to the beginning of a token. - int current = long_start - 1; - while (current >= 0 && - !base::EqualsCaseInsensitiveASCII(longer.substr(current, 1), - base::UTF8ToUTF16(" "))) { - current--; - } - const double penalty = - std::pow(partial_match_penalty_rate, long_start - current - 1); - // TODO(crbug/990684): currently this part re-calculate the ratio for every - // pair. Improve this to reduce latency. - partial_ratio = std::max( - partial_ratio, - SequenceMatcher(shorter, longer.substr(long_start, shorter.size()), - use_edit_distance) - .Ratio() * - penalty); - - if (partial_ratio > 0.995) { - return kMaxScore; - } - } - return partial_ratio; -} - -double FuzzyTokenizedStringMatch::WeightedRatio( - const TokenizedString& query, - const TokenizedString& text, - double partial_match_penalty_rate, - bool use_edit_distance) { - const double unbase_scale = 0.95; - // Since query.text() and text.text() is not normalized, we use query.tokens() - // and text.tokens() instead. 
- const base::string16 query_normalized( - base::JoinString(query.tokens(), base::UTF8ToUTF16(" "))); - const base::string16 text_normalized( - base::JoinString(text.tokens(), base::UTF8ToUTF16(" "))); - double weighted_ratio = - SequenceMatcher(query_normalized, text_normalized, use_edit_distance) - .Ratio(); - const double length_ratio = - static_cast<double>( - std::max(query_normalized.size(), text_normalized.size())) / - std::min(query_normalized.size(), text_normalized.size()); - - // Use partial if two strings are quite different in sizes. - const bool use_partial = length_ratio >= 1.5; - double partial_scale = 1; - - if (use_partial) { - // If one string is much much shorter than the other, set |partial_scale| to - // be 0.6, otherwise set it to be 0.9. - partial_scale = length_ratio > 8 ? 0.6 : 0.9; - weighted_ratio = - std::max(weighted_ratio, - PartialRatio(query_normalized, text_normalized, - partial_match_penalty_rate, use_edit_distance) * - partial_scale); - } - weighted_ratio = - std::max(weighted_ratio, - TokenSortRatio(query, text, use_partial /*partial*/, - partial_match_penalty_rate, use_edit_distance) * - unbase_scale * partial_scale); - weighted_ratio = - std::max(weighted_ratio, - TokenSetRatio(query, text, use_partial /*partial*/, - partial_match_penalty_rate, use_edit_distance) * - unbase_scale * partial_scale); - return weighted_ratio; -} - -double FuzzyTokenizedStringMatch::PrefixMatcher(const TokenizedString& query, - const TokenizedString& text) { - return std::max(PrefixMatch(query, text), FirstCharacterMatch(query, text)); -} - -bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query, - const TokenizedString& text, - double relevance_threshold, - bool use_prefix_only, - bool use_weighted_ratio, - bool use_edit_distance, - double partial_match_penalty_rate) { - // If there is an exact match, relevance will be 1.0 and there is only 1 hit - // that is the entire text/query. - const auto& query_text = query.text(); - const auto& text_text = text.text(); - const auto query_size = query_text.size(); - const auto text_size = text_text.size(); - if (query_size > 0 && query_size == text_size && - base::EqualsCaseInsensitiveASCII(query_text, text_text)) { - hits_.push_back(gfx::Range(0, query_size)); - relevance_ = 1.0; - return true; - } - - // Find |hits_| using SequenceMatcher on original query and text. - for (const auto& match : - SequenceMatcher(query_text, text_text, use_edit_distance) - .GetMatchingBlocks()) { - if (match.length > 0) { - hits_.push_back(gfx::Range(match.pos_second_string, - match.pos_second_string + match.length)); - } - } - - // If the query is much longer than the text then it's often not a match. - if (query_size >= text_size * 2) { - return false; - } - - const double prefix_score = PrefixMatcher(query, text); - - if (use_prefix_only && prefix_score >= relevance_threshold) { - // If the prefix score is already higher than |relevance_threshold|, use - // prefix score as final score. - relevance_ = prefix_score; - return true; - } - - if (use_weighted_ratio) { - // If WeightedRatio is used, |relevance_| is the average of WeightedRatio - // and PrefixMatcher scores. - relevance_ = (WeightedRatio(query, text, partial_match_penalty_rate, - use_edit_distance) + - prefix_score) / - 2; - } else { - // Use simple algorithm to calculate match ratio. 
- relevance_ = - (SequenceMatcher(base::i18n::ToLower(query_text), - base::i18n::ToLower(text_text), use_edit_distance) - .Ratio() + - prefix_score) / - 2; - } - - return relevance_ >= relevance_threshold; -} diff --git a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h b/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h deleted file mode 100644 index 01bfbde6078..00000000000 --- a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_COMMON_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_ -#define CHROME_COMMON_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_ - -#include "base/gtest_prod_util.h" -#include "base/macros.h" -#include "chrome/common/string_matching/tokenized_string.h" -#include "ui/gfx/range/range.h" - -// FuzzyTokenizedStringMatch takes two tokenized strings: one as the text and -// the other one as the query. It matches the query against the text, -// calculates a relevance score between [0, 1] and marks the matched portions -// of text. A relevance of zero means the two are completely different to each -// other. The higher the relevance score, the better the two strings are -// matched. Matched portions of text are stored as index ranges. -// TODO(crbug.com/1018613): each of these functions have too many input params, -// we should revise the structure and remove unnecessary ones. -class FuzzyTokenizedStringMatch { - public: - typedef std::vector<gfx::Range> Hits; - - FuzzyTokenizedStringMatch(); - ~FuzzyTokenizedStringMatch(); - - // Check if the query only contains first characters of the text, - // e.g. "coc" is a match of "Clash of Clan". Range of the score is [0, 1]. - static double FirstCharacterMatch(const TokenizedString& query, - const TokenizedString& text); - - // Check if tokens of query are prefixes of text's tokens. Range of score is - // [0, 1]. - static double PrefixMatch(const TokenizedString& query, - const TokenizedString& text); - - // TokenSetRatio takes two sets of tokens, finds their intersection and - // differences. From the intersection and differences, it rewrites the |query| - // and |text| and find the similarity ratio between them. This function - // assumes that TokenizedString is already normalized (converted to lower - // case). Duplicates tokens will be removed for ratio computation. - static double TokenSetRatio(const TokenizedString& query, - const TokenizedString& text, - bool partial, - double partial_match_penalty_rate, - bool use_edit_distance); - - // TokenSortRatio takes two set of tokens, sorts them and find the similarity - // between two sorted strings. This function assumes that TokenizedString is - // already normalized (converted to lower case) - static double TokenSortRatio(const TokenizedString& query, - const TokenizedString& text, - bool partial, - double partial_match_penalty_rate, - bool use_edit_distance); - - // Finds the best ratio of shorter text with a part of longer text. - // This function assumes that TokenizedString is already normalized (converted - // to lower case). The return score is in range of [0, 1]. - static double PartialRatio(const base::string16& query, - const base::string16& text, - double partial_match_penalty_rate, - bool use_edit_distance); - - // Combines scores from different ratio functions. 
This function assumes that - // TokenizedString is already normalized (converted to lower cases). - // The return score is in range of [0, 1]. - static double WeightedRatio(const TokenizedString& query, - const TokenizedString& text, - double partial_match_penalty_rate, - bool use_edit_distance); - // Since prefix match should always be favored over other matches, this - // function is dedicated to calculate a prefix match score in range of [0, 1]. - // This score has two components: first character match and whole prefix - // match. - static double PrefixMatcher(const TokenizedString& query, - const TokenizedString& text); - - // Calculates the relevance of two strings. Returns true if two strings are - // somewhat matched, i.e. relevance score is greater than a threshold. - bool IsRelevant(const TokenizedString& query, - const TokenizedString& text, - double relevance_threshold, - bool use_prefix_only, - bool use_weighted_ratio, - bool use_edit_distance, - double partial_match_penalty_rate); - double relevance() const { return relevance_; } - const Hits& hits() const { return hits_; } - - private: - // Score in range of [0,1] representing how well the query matches the text. - double relevance_ = 0; - Hits hits_; - - DISALLOW_COPY_AND_ASSIGN(FuzzyTokenizedStringMatch); -}; - -#endif // CHROME_COMMON_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_ diff --git a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc b/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc deleted file mode 100644 index 9413a36cb9c..00000000000 --- a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/common/string_matching/fuzzy_tokenized_string_match.h" - -#include "base/macros.h" -#include "base/strings/utf_string_conversions.h" -#include "chrome/common/string_matching/sequence_matcher.h" -#include "chrome/common/string_matching/tokenized_string.h" -#include "testing/gtest/include/gtest/gtest.h" - -namespace { -constexpr double kPartialMatchPenaltyRate = 0.9; - -class FuzzyTokenizedStringMatchTest : public testing::Test {}; - -// TODO(crbug.com/1018613): update the tests once params are consolidated. 
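For orientation, a minimal usage sketch of the matcher declared above, assuming the Chromium build environment these files belong to; the wrapper function name QueryMatchesText is illustrative and not part of the deleted sources, and the parameter values mirror those exercised by the tests below.

// Illustrative only: matches a UTF-8 query against a UTF-8 text.
#include "base/strings/utf_string_conversions.h"
#include "chrome/common/string_matching/fuzzy_tokenized_string_match.h"
#include "chrome/common/string_matching/tokenized_string.h"

bool QueryMatchesText(const std::string& query_utf8,
                      const std::string& text_utf8) {
  const TokenizedString query(base::UTF8ToUTF16(query_utf8));
  const TokenizedString text(base::UTF8ToUTF16(text_utf8));
  FuzzyTokenizedStringMatch match;
  // Arguments: relevance_threshold, use_prefix_only, use_weighted_ratio,
  // use_edit_distance, partial_match_penalty_rate.
  return match.IsRelevant(query, text, 0.4, false, true, false, 0.9);
}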
-TEST_F(FuzzyTokenizedStringMatchTest, PartialRatioTest) { - FuzzyTokenizedStringMatch match; - EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("abcde"), - base::UTF8ToUTF16("ababcXXXbcdeY"), - kPartialMatchPenaltyRate, false), - 0.6, 0.01); - EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("big string"), - base::UTF8ToUTF16("strength"), - kPartialMatchPenaltyRate, false), - 0.71, 0.01); - EXPECT_EQ(match.PartialRatio(base::UTF8ToUTF16("abc"), base::UTF8ToUTF16(""), - kPartialMatchPenaltyRate, false), - 0); - EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("different in order"), - base::UTF8ToUTF16("order text"), - kPartialMatchPenaltyRate, false), - 0.67, 0.01); -} - -TEST_F(FuzzyTokenizedStringMatchTest, TokenSetRatioTest) { - FuzzyTokenizedStringMatch match; - { - base::string16 query(base::UTF8ToUTF16("order different in")); - base::string16 text(base::UTF8ToUTF16("text order")); - EXPECT_EQ(match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - true, kPartialMatchPenaltyRate, false), - 1); - EXPECT_NEAR( - match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.67, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("short text")); - base::string16 text( - base::UTF8ToUTF16("this text is really really really long")); - EXPECT_EQ(match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - true, kPartialMatchPenaltyRate, false), - 1); - EXPECT_NEAR( - match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.57, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("common string")); - base::string16 text(base::UTF8ToUTF16("nothing is shared")); - EXPECT_NEAR( - match.TokenSetRatio(TokenizedString(query), TokenizedString(text), true, - kPartialMatchPenaltyRate, false), - 0.38, 0.01); - EXPECT_NEAR( - match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.33, 0.01); - } - { - base::string16 query( - base::UTF8ToUTF16("token shared token same shared same")); - base::string16 text(base::UTF8ToUTF16("token shared token text text long")); - EXPECT_EQ(match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - true, kPartialMatchPenaltyRate, false), - 1); - EXPECT_NEAR( - match.TokenSetRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.83, 0.01); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, TokenSortRatioTest) { - FuzzyTokenizedStringMatch match; - { - base::string16 query(base::UTF8ToUTF16("order different in")); - base::string16 text(base::UTF8ToUTF16("text order")); - EXPECT_NEAR( - match.TokenSortRatio(TokenizedString(query), TokenizedString(text), - true, kPartialMatchPenaltyRate, false), - 0.67, 0.01); - EXPECT_NEAR( - match.TokenSortRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.36, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("short text")); - base::string16 text( - base::UTF8ToUTF16("this text is really really really long")); - EXPECT_EQ( - match.TokenSortRatio(TokenizedString(query), TokenizedString(text), - true, kPartialMatchPenaltyRate, false), - 0.5 * std::pow(0.9, 1)); - EXPECT_NEAR( - match.TokenSortRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.33, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("common string")); - base::string16 text(base::UTF8ToUTF16("nothing is shared")); - EXPECT_NEAR( - 
match.TokenSortRatio(TokenizedString(query), TokenizedString(text), - true, kPartialMatchPenaltyRate, false), - 0.38, 0.01); - EXPECT_NEAR( - match.TokenSortRatio(TokenizedString(query), TokenizedString(text), - false, kPartialMatchPenaltyRate, false), - 0.33, 0.01); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, WeightedRatio) { - FuzzyTokenizedStringMatch match; - { - base::string16 query(base::UTF8ToUTF16("anonymous")); - base::string16 text(base::UTF8ToUTF16("famous")); - EXPECT_NEAR( - match.WeightedRatio(TokenizedString(query), TokenizedString(text), - kPartialMatchPenaltyRate, false), - 0.67, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("Clash.of.clan")); - base::string16 text(base::UTF8ToUTF16("ClashOfTitan")); - EXPECT_NEAR( - match.WeightedRatio(TokenizedString(query), TokenizedString(text), - kPartialMatchPenaltyRate, false), - 0.81, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("final fantasy")); - base::string16 text(base::UTF8ToUTF16("finalfantasy")); - EXPECT_NEAR( - match.WeightedRatio(TokenizedString(query), TokenizedString(text), - kPartialMatchPenaltyRate, false), - 0.96, 0.01); - } - { - base::string16 query(base::UTF8ToUTF16("short text!!!")); - base::string16 text( - base::UTF8ToUTF16("this sentence is much much much much much longer " - "than the text before")); - EXPECT_NEAR( - match.WeightedRatio(TokenizedString(query), TokenizedString(text), - kPartialMatchPenaltyRate, false), - 0.85, 0.01); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, FirstCharacterMatchTest) { - { - base::string16 query(base::UTF8ToUTF16("COC")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch( - TokenizedString(query), TokenizedString(text)), - 1.0); - } - { - base::string16 query(base::UTF8ToUTF16("CC")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch( - TokenizedString(query), TokenizedString(text)), - 0.8); - } - { - base::string16 query(base::UTF8ToUTF16("C o C")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch( - TokenizedString(query), TokenizedString(text)), - 0.0); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, PrefixMatchTest) { - { - base::string16 query(base::UTF8ToUTF16("clas")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query), - TokenizedString(text)), - 1.0); - } - { - base::string16 query(base::UTF8ToUTF16("clash clan")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query), - TokenizedString(text)), - 0.9); - } - { - base::string16 query(base::UTF8ToUTF16("c o c")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query), - TokenizedString(text)), - 1.0); - } - { - base::string16 query(base::UTF8ToUTF16("clam")); - base::string16 text(base::UTF8ToUTF16("Clash of Clan")); - EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query), - TokenizedString(text)), - 0.0); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, ParamThresholdTest1) { - FuzzyTokenizedStringMatch match; - { - base::string16 query(base::UTF8ToUTF16("anonymous")); - base::string16 text(base::UTF8ToUTF16("famous")); - EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.4, false, true, false, - 
kPartialMatchPenaltyRate)); - } - { - base::string16 query(base::UTF8ToUTF16("CC")); - base::string16 text(base::UTF8ToUTF16("Clash Of Clan")); - EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.4, false, true, false, - kPartialMatchPenaltyRate)); - } - { - base::string16 query(base::UTF8ToUTF16("Clash.of.clan")); - base::string16 text(base::UTF8ToUTF16("ClashOfTitan")); - EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.4, false, true, false, - kPartialMatchPenaltyRate)); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, ParamThresholdTest2) { - FuzzyTokenizedStringMatch match; - { - base::string16 query(base::UTF8ToUTF16("anonymous")); - base::string16 text(base::UTF8ToUTF16("famous")); - EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.5, false, true, false, - kPartialMatchPenaltyRate)); - } - { - base::string16 query(base::UTF8ToUTF16("CC")); - base::string16 text(base::UTF8ToUTF16("Clash Of Clan")); - EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.5, false, true, false, - kPartialMatchPenaltyRate)); - } - { - base::string16 query(base::UTF8ToUTF16("Clash.of.clan")); - base::string16 text(base::UTF8ToUTF16("ClashOfTitan")); - EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.5, false, true, false, - kPartialMatchPenaltyRate)); - } -} - -TEST_F(FuzzyTokenizedStringMatchTest, OtherParamTest) { - FuzzyTokenizedStringMatch match; - base::string16 query(base::UTF8ToUTF16("anonymous")); - base::string16 text(base::UTF8ToUTF16("famous")); - EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.35, false, false, true, - kPartialMatchPenaltyRate)); - EXPECT_NEAR(match.relevance(), 0.33 / 2, 0.01); -} - -TEST_F(FuzzyTokenizedStringMatchTest, ExactTextMatchTest) { - FuzzyTokenizedStringMatch match; - base::string16 query(base::UTF8ToUTF16("yat")); - base::string16 text(base::UTF8ToUTF16("YaT")); - EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text), - 0.35, false, false, true, - kPartialMatchPenaltyRate)); - EXPECT_DOUBLE_EQ(match.relevance(), 1.0); - EXPECT_EQ(match.hits().size(), 1u); - EXPECT_EQ(match.hits()[0].start(), 0u); - EXPECT_EQ(match.hits()[0].end(), 3u); -} - -} // namespace diff --git a/chromium/chrome/common/string_matching/sequence_matcher.cc b/chromium/chrome/common/string_matching/sequence_matcher.cc deleted file mode 100644 index 0d42146f087..00000000000 --- a/chromium/chrome/common/string_matching/sequence_matcher.cc +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
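To make the prefix scoring above concrete, two of the test cases work out as follows (using kFirstCharacterMatchPenalty = 0.2 and kPrefixMatchPenalty = 0.1 from fuzzy_tokenized_string_match.cc): FirstCharacterMatch("CC", "Clash of Clan") only consumes the second 'C' at the third token, so the score is 1.0 - 0.2 * (3 - 2) = 0.8; PrefixMatch("clash clan", "Clash of Clan") skips the single token "of" before matching "clan", so the score is 1.0 - 0.1 * 1 = 0.9.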
- -#include "chrome/common/string_matching/sequence_matcher.h" - -#include <algorithm> -#include <queue> - -#include "base/check_op.h" - -namespace { -using Match = SequenceMatcher::Match; -using Matches = std::vector<Match>; - -bool CompareMatches(const Match& m1, const Match& m2) { - return m1.pos_first_string < m2.pos_first_string; -} -} // namespace - -SequenceMatcher::Match::Match() = default; -SequenceMatcher::Match::Match(int pos_first, int pos_second, int len) - : pos_first_string(pos_first), pos_second_string(pos_second), length(len) { - DCHECK_GE(pos_first_string, 0); - DCHECK_GE(pos_second_string, 0); - DCHECK_GE(length, 0); -} - -SequenceMatcher::SequenceMatcher(const base::string16& first_string, - const base::string16& second_string, - bool use_edit_distance) - : first_string_(first_string), - second_string_(second_string), - dp_common_string_(second_string.size() + 1, 0) { - DCHECK(!first_string_.empty() || !second_string_.empty()); - - for (size_t i = 0; i < second_string_.size(); i++) { - char_to_positions_[second_string_[i]].emplace_back(i); - } - use_edit_distance_ = use_edit_distance; -} - -Match SequenceMatcher::FindLongestMatch(int first_start, - int first_end, - int second_start, - int second_end) { - Match match(first_start, second_start, 0); - - // These two vectors are used to do "fast update". - // |dp_values_to_erase| contains the values should be erased from - // |dp_common_string_|. - // |dp_values_to_affect| contains the values should be updated from - // |dp_common_string_|. - std::vector<std::pair<int, int>> dp_values_to_erase; - std::vector<std::pair<int, int>> dp_values_to_affect; - - for (int i = first_start; i < first_end; i++) { - dp_values_to_affect.clear(); - for (auto j : char_to_positions_[first_string_[i]]) { - if (j < second_start) { - continue; - } - if (j >= second_end) { - break; - } - // dp_commong_string_[j + 1] is length of longest common substring - // ends at first_string_[i] and second_string_[j + 1] - const int length = dp_common_string_[j] + 1; - dp_values_to_affect.emplace_back(j + 1, length); - if (length > match.length) { - match.pos_first_string = i - length + 1; - match.pos_second_string = j - length + 1; - match.length = length; - } - } - // Updates dp_common_string_ - for (auto const& element : dp_values_to_erase) { - dp_common_string_[element.first] = 0; - } - for (auto const& element : dp_values_to_affect) { - dp_common_string_[element.first] = element.second; - } - std::swap(dp_values_to_erase, dp_values_to_affect); - } - // Erases all updated value for the next call. - std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0); - - return match; -} - -Matches SequenceMatcher::GetMatchingBlocks() { - if (!matching_blocks_.empty()) { - return matching_blocks_; - } - - // This queue contains a tuple of 4 integers that represent 2 substrings to - // find the longest match in the following order: first_start, first_end, - // second_start, second_end. - std::queue<std::tuple<int, int, int, int>> queue_block; - queue_block.emplace(0, first_string_.size(), 0, second_string_.size()); - - // Find all matching blocks recursively. 
- while (!queue_block.empty()) { - int first_start, first_end, second_start, second_end; - std::tie(first_start, first_end, second_start, second_end) = - queue_block.front(); - queue_block.pop(); - const Match match = - FindLongestMatch(first_start, first_end, second_start, second_end); - if (match.length > 0) { - if (first_start < match.pos_first_string && - second_start < match.pos_second_string) { - queue_block.emplace(first_start, match.pos_first_string, second_start, - match.pos_second_string); - } - if (match.pos_first_string + match.length < first_end && - match.pos_second_string + match.length < second_end) { - queue_block.emplace(match.pos_first_string + match.length, first_end, - match.pos_second_string + match.length, second_end); - } - matching_blocks_.push_back(match); - } - } - - matching_blocks_.push_back( - Match(first_string_.size(), second_string_.size(), 0)); - sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches); - return matching_blocks_; -} - -int SequenceMatcher::EditDistance() { - // If edit distance is already calculated - if (edit_distance_ >= 0) { - return edit_distance_; - } - - const int len_first = first_string_.size(); - const int len_second = second_string_.size(); - if (len_first == 0 || len_second == 0) { - edit_distance_ = std::max(len_first, len_second); - return edit_distance_; - } - - // Memory for the dynamic programming: - // dp[i + 1][j + 1] is the edit distane of first |i| characters of - // |first_string_| and first |j| characters of |second_string_| - int dp[len_first + 1][len_second + 1]; - - // Initialize memory - for (int i = 0; i < len_first + 1; i++) { - dp[i][0] = i; - } - for (int j = 0; j < len_second + 1; j++) { - dp[0][j] = j; - } - - // Calculate the edit distance - for (int i = 1; i < len_first + 1; i++) { - for (int j = 1; j < len_second + 1; j++) { - const int cost = first_string_[i - 1] == second_string_[j - 1] ? 0 : 1; - // Insertion and deletion - dp[i][j] = std::min(dp[i - 1][j], dp[i][j - 1]) + 1; - // Substitution - dp[i][j] = std::min(dp[i][j], dp[i - 1][j - 1] + cost); - // Transposition - if (i > 1 && j > 1 && first_string_[i - 2] == second_string_[j - 1] && - first_string_[i - 1] == second_string_[j - 2]) { - dp[i][j] = std::min(dp[i][j], dp[i - 2][j - 2] + cost); - } - } - } - edit_distance_ = dp[len_first][len_second]; - return edit_distance_; -} - -double SequenceMatcher::Ratio() { - if (use_edit_distance_) { - if (edit_distance_ratio_ < 0) { - const int edit_distance = EditDistance(); - edit_distance_ratio_ = std::max( - 0.0, 1.0 - static_cast<double>(edit_distance) * 2 / - (first_string_.size() + second_string_.size())); - } - return edit_distance_ratio_; - } - - // Uses block matching to calculate ratio. - if (block_matching_ratio_ < 0) { - int sum_match = 0; - const int sum_length = first_string_.size() + second_string_.size(); - DCHECK_NE(sum_length, 0); - for (const auto& match : GetMatchingBlocks()) { - sum_match += match.length; - } - block_matching_ratio_ = 2.0 * sum_match / sum_length; - } - return block_matching_ratio_; -} diff --git a/chromium/chrome/common/string_matching/sequence_matcher.h b/chromium/chrome/common/string_matching/sequence_matcher.h deleted file mode 100644 index 71bc10b6892..00000000000 --- a/chromium/chrome/common/string_matching/sequence_matcher.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#ifndef CHROME_COMMON_STRING_MATCHING_SEQUENCE_MATCHER_H_ -#define CHROME_COMMON_STRING_MATCHING_SEQUENCE_MATCHER_H_ - -#include <string> -#include <unordered_map> -#include <vector> - -#include "base/logging.h" -#include "base/macros.h" -#include "base/strings/string16.h" - -// Performs the calculation of similarity level between 2 strings. This class's -// functionality is inspired by python's difflib.SequenceMatcher library. -// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher) -class SequenceMatcher { - public: - // Representing a common substring between |first_string_| and - // |second_string_|. - struct Match { - Match(); - Match(int pos_first, int pos_second, int len); - // Starting position of the common substring in |first_string_|. - int pos_first_string; - // Starting position of the common substring in |second_string_|. - int pos_second_string; - // Length of the common substring. - int length; - }; - SequenceMatcher(const base::string16& first_string, - const base::string16& second_string, - bool use_edit_distance); - - ~SequenceMatcher() = default; - - // Calculates similarity ratio of |first_string_| and |second_string_|. - double Ratio(); - // Calculates the Damerau–Levenshtein distance between |first_string_| and - // |second_string_|. - // See https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance for more - // details. - int EditDistance(); - // Finds the longest common substring between - // |first_string_[first_start:first_end]| and - // |second_string_[second_start:second_end]|. - Match FindLongestMatch(int first_start, - int first_end, - int second_start, - int second_end); - // Get all matching blocks of |first_string_| and |second_string_|. - // All blocks will be sorted by starting position of them in the - // |first_string_|. The last matching block will always be - // Match(first_string_.size(), second_string_.size(), 0). - std::vector<Match> GetMatchingBlocks(); - - private: - base::string16 first_string_; - base::string16 second_string_; - double edit_distance_ratio_ = -1.0; - double block_matching_ratio_ = -1.0; - std::vector<Match> matching_blocks_; - - // Controls whether to use edit distance to calculate ratio. - bool use_edit_distance_; - int edit_distance_ = -1; - // For each character |c| in |second_string_|, this variable - // |char_to_positions_| stores all positions where |c| occurs in - // |second_string_|. - std::unordered_map<char, std::vector<int>> char_to_positions_; - // Memory for dynamic programming algorithm used in FindLongestMatch(). - std::vector<int> dp_common_string_; - DISALLOW_COPY_AND_ASSIGN(SequenceMatcher); -}; - -#endif // CHROME_COMMON_STRING_MATCHING_SEQUENCE_MATCHER_H_ diff --git a/chromium/chrome/common/string_matching/sequence_matcher_unittest.cc b/chromium/chrome/common/string_matching/sequence_matcher_unittest.cc deleted file mode 100644 index c06fcfc6452..00000000000 --- a/chromium/chrome/common/string_matching/sequence_matcher_unittest.cc +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
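As a concrete reading of the two ratio definitions above (block matching versus Damerau-Levenshtein edit distance), consider "abcd" against "adbc", the pair asserted in the tests below; this is an illustrative sketch, not part of the deleted sources.

// Illustrative only; mirrors the values asserted in the unit tests below.
const base::string16 a = base::UTF8ToUTF16("abcd");
const base::string16 b = base::UTF8ToUTF16("adbc");

// Block matching: the matching blocks are "a" and "bc", i.e. 3 matched
// characters out of a combined length of 8, so Ratio() == 2.0 * 3 / 8 == 0.75.
double block_ratio =
    SequenceMatcher(a, b, /*use_edit_distance=*/false).Ratio();

// Edit distance: "abcd" -> "adbc" takes 2 edits (e.g. insert 'd' after 'a',
// then delete the trailing 'd'), so Ratio() == 1.0 - 2.0 * 2 / 8 == 0.5.
double edit_ratio =
    SequenceMatcher(a, b, /*use_edit_distance=*/true).Ratio();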
- -#include "chrome/common/string_matching/sequence_matcher.h" - -#include "base/macros.h" -#include "base/strings/utf_string_conversions.h" -#include "testing/gtest/include/gtest/gtest.h" - -namespace { -constexpr bool kDefaultUseEditDistance = false; - -using Match = SequenceMatcher::Match; -bool MatchEqual(const Match& match1, const Match& match2) { - return match1.pos_first_string == match2.pos_first_string && - match1.pos_second_string == match2.pos_second_string && - match1.length == match2.length; -} - -class SequenceMatcherTest : public testing::Test {}; - -TEST_F(SequenceMatcherTest, TestEditDistance) { - // Transposition - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"), - base::UTF8ToUTF16("abdc"), kDefaultUseEditDistance) - .EditDistance(), - 1); - - // Deletion - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcde"), - base::UTF8ToUTF16("abcd"), kDefaultUseEditDistance) - .EditDistance(), - 1); - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("12"), base::UTF8ToUTF16(""), - kDefaultUseEditDistance) - .EditDistance(), - 2); - - // Insertion - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abc"), - base::UTF8ToUTF16("abxbc"), kDefaultUseEditDistance) - .EditDistance(), - 2); - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abxbc"), - kDefaultUseEditDistance) - .EditDistance(), - 5); - - // Substitution - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("book"), - base::UTF8ToUTF16("back"), kDefaultUseEditDistance) - .EditDistance(), - 2); - - // Combination - ASSERT_EQ( - SequenceMatcher(base::UTF8ToUTF16("caclulation"), - base::UTF8ToUTF16("calculator"), kDefaultUseEditDistance) - .EditDistance(), - 3); - ASSERT_EQ( - SequenceMatcher(base::UTF8ToUTF16("sunday"), - base::UTF8ToUTF16("saturday"), kDefaultUseEditDistance) - .EditDistance(), - 3); -} - -TEST_F(SequenceMatcherTest, TestFindLongestMatch) { - SequenceMatcher sequence_match(base::UTF8ToUTF16("miscellanious"), - base::UTF8ToUTF16("miscellaneous"), - kDefaultUseEditDistance); - ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(0, 13, 0, 13), - Match(0, 0, 9))); - ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(7, 13, 7, 13), - Match(10, 10, 3))); - - ASSERT_TRUE(MatchEqual( - SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abcd"), - kDefaultUseEditDistance) - .FindLongestMatch(0, 0, 0, 4), - Match(0, 0, 0))); - ASSERT_TRUE(MatchEqual( - SequenceMatcher(base::UTF8ToUTF16("abababbababa"), - base::UTF8ToUTF16("ababbaba"), kDefaultUseEditDistance) - .FindLongestMatch(0, 12, 0, 8), - Match(2, 0, 8))); - ASSERT_TRUE(MatchEqual( - SequenceMatcher(base::UTF8ToUTF16("aaaaaa"), base::UTF8ToUTF16("aaaaa"), - kDefaultUseEditDistance) - .FindLongestMatch(0, 6, 0, 5), - Match(0, 0, 5))); -} - -TEST_F(SequenceMatcherTest, TestGetMatchingBlocks) { - SequenceMatcher sequence_match( - base::UTF8ToUTF16("This is a demo sentence!!!"), - base::UTF8ToUTF16("This demo sentence is good!!!"), - kDefaultUseEditDistance); - const std::vector<Match> true_matches = {Match(0, 0, 4), Match(9, 4, 14), - Match(23, 26, 3), Match(26, 29, 0)}; - const std::vector<Match> matches = sequence_match.GetMatchingBlocks(); - ASSERT_EQ(matches.size(), 4ul); - for (int i = 0; i < 4; i++) { - ASSERT_TRUE(MatchEqual(matches[i], true_matches[i])); - } -} - -TEST_F(SequenceMatcherTest, TestSequenceMatcherRatio) { - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"), - base::UTF8ToUTF16("adbc"), kDefaultUseEditDistance) - .Ratio(), - 0.75); - ASSERT_EQ( - SequenceMatcher(base::UTF8ToUTF16("white cats"), - base::UTF8ToUTF16("cats 
white"), kDefaultUseEditDistance) - .Ratio(), - 0.5); -} - -TEST_F(SequenceMatcherTest, TestEditDistanceRatio) { - ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"), - base::UTF8ToUTF16("adbc"), true) - .Ratio(), - 0.5); - EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("white cats"), - base::UTF8ToUTF16("cats white"), true) - .Ratio(), - 0.2, 0.01); - - // Totally different - EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("dog"), - base::UTF8ToUTF16("elphant"), true) - .Ratio(), - 0.0, 0.01); -} -} // namespace diff --git a/chromium/chrome/common/string_matching/term_break_iterator.cc b/chromium/chrome/common/string_matching/term_break_iterator.cc deleted file mode 100644 index ae2f3b834ff..00000000000 --- a/chromium/chrome/common/string_matching/term_break_iterator.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/common/string_matching/term_break_iterator.h" - -#include "base/check.h" -#include "base/i18n/char_iterator.h" -#include "base/notreached.h" -#include "base/strings/string_util.h" -#include "third_party/icu/source/common/unicode/uchar.h" - -TermBreakIterator::TermBreakIterator(const base::string16& word) - : word_(word), - prev_(npos), - pos_(0), - iter_(new base::i18n::UTF16CharIterator(&word)), - state_(STATE_START) {} - -TermBreakIterator::~TermBreakIterator() = default; - -bool TermBreakIterator::Advance() { - // 2D matrix that defines term boundaries. Each row represents current state. - // Each col represents new state from input char. Cells with true value - // represents a term boundary. - const bool kBoundary[][STATE_LAST] = { - // START NUMBER UPPER LOWER CHAR - {false, false, false, false, false}, // START - {false, false, true, true, true}, // NUMBER - {false, true, false, false, true}, // UPPER - {false, true, true, false, true}, // LOWER - {false, true, true, true, false}, // CHAR - }; - - while (iter_->Advance()) { - const State new_state = GetNewState(word_[iter_->array_pos()]); - const bool is_boundary = kBoundary[state_][new_state]; - state_ = new_state; - if (is_boundary) - break; - } - - prev_ = pos_; - pos_ = iter_->array_pos(); - - return prev_ != pos_ || !iter_->end(); -} - -const base::string16 TermBreakIterator::GetCurrentTerm() const { - DCHECK(prev_ != npos && pos_ != npos); - return word_.substr(prev_, pos_ - prev_); -} - -TermBreakIterator::State TermBreakIterator::GetNewState(base::char16 ch) { - if (base::IsAsciiDigit(ch) || ch == '.' || ch == ',') - return STATE_NUMBER; - - const bool is_upper = !!u_isUUppercase(ch); - const bool is_lower = !!u_isULowercase(ch); - - if (is_upper && is_lower) { - NOTREACHED() << "Invalid state for ch=" << ch; - return STATE_CHAR; - } - - if (is_upper) - return STATE_UPPER; - if (is_lower) - return STATE_LOWER; - - return STATE_CHAR; -} diff --git a/chromium/chrome/common/string_matching/term_break_iterator.h b/chromium/chrome/common/string_matching/term_break_iterator.h deleted file mode 100644 index 80d069acc3c..00000000000 --- a/chromium/chrome/common/string_matching/term_break_iterator.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#ifndef CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_ -#define CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_ - -#include <stddef.h> - -#include <memory> - -#include "base/macros.h" -#include "base/strings/string16.h" - -namespace base { -namespace i18n { -class UTF16CharIterator; -} -} // namespace base - -// TermBreakIterator breaks terms out of a word. Terms are broken on -// camel case boundaries and alpha/number boundaries. Numbers are defined -// as [0-9\.,]+. -// e.g. -// CamelCase -> Camel, Case -// Python2.7 -> Python, 2.7 -class TermBreakIterator { - public: - // Note that |word| must out live this iterator. - explicit TermBreakIterator(const base::string16& word); - ~TermBreakIterator(); - - // Advance to the next term. Returns false if at the end of the word. - bool Advance(); - - // Returns the current term, which is the substr of |word_| in range - // [prev_, pos_). - const base::string16 GetCurrentTerm() const; - - size_t prev() const { return prev_; } - size_t pos() const { return pos_; } - - static const size_t npos = static_cast<size_t>(-1); - - private: - enum State { - STATE_START, // Initial state - STATE_NUMBER, // Current char is a number [0-9\.,]. - STATE_UPPER, // Current char is upper case. - STATE_LOWER, // Current char is lower case. - STATE_CHAR, // Current char has no case, e.g. a cjk char. - STATE_LAST, - }; - - // Returns new state for given |ch|. - State GetNewState(base::char16 ch); - - const base::string16& word_; - size_t prev_; - size_t pos_; - - std::unique_ptr<base::i18n::UTF16CharIterator> iter_; - State state_; - - DISALLOW_COPY_AND_ASSIGN(TermBreakIterator); -}; - -#endif // CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_ diff --git a/chromium/chrome/common/string_matching/term_break_iterator_unittest.cc b/chromium/chrome/common/string_matching/term_break_iterator_unittest.cc deleted file mode 100644 index 8ff0f2f8766..00000000000 --- a/chromium/chrome/common/string_matching/term_break_iterator_unittest.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/common/string_matching/term_break_iterator.h" - -#include "base/strings/utf_string_conversions.h" -#include "testing/gtest/include/gtest/gtest.h" - -using base::UTF8ToUTF16; - -namespace { - -TEST(TermBreakIteratorTest, EmptyWord) { - base::string16 empty; - TermBreakIterator iter(empty); - EXPECT_FALSE(iter.Advance()); -} - -TEST(TermBreakIteratorTest, Simple) { - base::string16 word(UTF8ToUTF16("simple")); - TermBreakIterator iter(word); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("simple"), iter.GetCurrentTerm()); - EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. -} - -TEST(TermBreakIteratorTest, CamelCase) { - base::string16 word(UTF8ToUTF16("CamelCase")); - TermBreakIterator iter(word); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("Camel"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("Case"), iter.GetCurrentTerm()); - EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 
-} - -TEST(TermBreakIteratorTest, LowerToUpper) { - base::string16 word(UTF8ToUTF16("lowerToUpper")); - TermBreakIterator iter(word); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("lower"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("To"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("Upper"), iter.GetCurrentTerm()); - EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. -} - -TEST(TermBreakIteratorTest, AlphaNumber) { - base::string16 word(UTF8ToUTF16("Chromium26.0.0.0")); - TermBreakIterator iter(word); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("Chromium"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("26.0.0.0"), iter.GetCurrentTerm()); - EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. -} - -TEST(TermBreakIteratorTest, StartsWithNumber) { - base::string16 word(UTF8ToUTF16("123startWithNumber")); - TermBreakIterator iter(word); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("123"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("start"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("With"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("Number"), iter.GetCurrentTerm()); - EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. -} - -TEST(TermBreakIteratorTest, CaseAndNoCase) { - // "English" + two Chinese chars U+4E2D U+6587 + "Word" - base::string16 word(UTF8ToUTF16("English\xe4\xb8\xad\xe6\x96\x87Word")); - TermBreakIterator iter(word); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("English"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("\xe4\xb8\xad\xe6\x96\x87"), iter.GetCurrentTerm()); - EXPECT_TRUE(iter.Advance()); - EXPECT_EQ(UTF8ToUTF16("Word"), iter.GetCurrentTerm()); - EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. -} - -} // namespace diff --git a/chromium/chrome/common/string_matching/tokenized_string.cc b/chromium/chrome/common/string_matching/tokenized_string.cc deleted file mode 100644 index 77c0703100a..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
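A minimal sketch of iterating terms with the TermBreakIterator above, assuming the same Chromium build environment; the example string follows the alpha/number rule documented in term_break_iterator.h.

// Illustrative only: "Python2.7" is broken at the alpha/number boundary
// into the terms "Python" and "2.7".
base::string16 word = base::UTF8ToUTF16("Python2.7");
TermBreakIterator iter(word);  // |word| must outlive |iter|.
while (iter.Advance()) {
  // Equivalent to word.substr(iter.prev(), iter.pos() - iter.prev()).
  const base::string16 term = iter.GetCurrentTerm();
}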
- -#include "chrome/common/string_matching/tokenized_string.h" - -#include <stddef.h> - -#include "base/i18n/break_iterator.h" -#include "base/i18n/case_conversion.h" -#include "base/notreached.h" -#include "chrome/common/string_matching/term_break_iterator.h" - -using base::i18n::BreakIterator; - -TokenizedString::TokenizedString(const base::string16& text) : text_(text) { - Tokenize(); -} - -TokenizedString::~TokenizedString() = default; - -void TokenizedString::Tokenize() { - BreakIterator break_iter(text_, BreakIterator::BREAK_WORD); - if (!break_iter.Init()) { - NOTREACHED() << "BreakIterator init failed" - << ", text=\"" << text_ << "\""; - return; - } - - while (break_iter.Advance()) { - if (!break_iter.IsWord()) - continue; - - const base::string16 word(break_iter.GetString()); - const size_t word_start = break_iter.prev(); - TermBreakIterator term_iter(word); - while (term_iter.Advance()) { - tokens_.emplace_back(base::i18n::ToLower(term_iter.GetCurrentTerm())); - mappings_.emplace_back(word_start + term_iter.prev(), - word_start + term_iter.pos()); - } - } -} diff --git a/chromium/chrome/common/string_matching/tokenized_string.h b/chromium/chrome/common/string_matching/tokenized_string.h deleted file mode 100644 index a564741f1da..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_H_ -#define CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_H_ - -#include <vector> - -#include "base/macros.h" -#include "base/strings/string16.h" -#include "ui/gfx/range/range.h" - -// TokenizedString takes a string and breaks it down into token words. It -// first breaks using BreakIterator to get all the words. Then it breaks -// the words again at camel case boundaries and alpha/number boundaries. -class TokenizedString { - public: - typedef std::vector<base::string16> Tokens; - typedef std::vector<gfx::Range> Mappings; - - explicit TokenizedString(const base::string16& text); - ~TokenizedString(); - - const base::string16& text() const { return text_; } - const Tokens& tokens() const { return tokens_; } - const Mappings& mappings() const { return mappings_; } - - private: - void Tokenize(); - - // Input text. - const base::string16 text_; - - // Broken down tokens and the index mapping of tokens in original string. - Tokens tokens_; - Mappings mappings_; - - DISALLOW_COPY_AND_ASSIGN(TokenizedString); -}; - -#endif // CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_H_ diff --git a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc b/chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc deleted file mode 100644 index b558c4d3f28..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#include "chrome/common/string_matching/tokenized_string_char_iterator.h" - -#include "base/check.h" -#include "base/i18n/char_iterator.h" -#include "third_party/icu/source/common/unicode/utf16.h" - -TokenizedStringCharIterator::State::State() : token_index(0u), char_index(0) {} - -TokenizedStringCharIterator::State::State(size_t token_index, int char_index) - : token_index(token_index), char_index(char_index) {} - -TokenizedStringCharIterator::TokenizedStringCharIterator( - const TokenizedString& tokenized) - : tokens_(tokenized.tokens()), - mappings_(tokenized.mappings()), - current_token_(0) { - CreateTokenCharIterator(); -} - -TokenizedStringCharIterator::~TokenizedStringCharIterator() = default; - -bool TokenizedStringCharIterator::NextChar() { - if (current_token_iter_) { - current_token_iter_->Advance(); - if (!current_token_iter_->end()) - return true; - } - - return NextToken(); -} - -bool TokenizedStringCharIterator::NextToken() { - if (current_token_ < tokens_.size()) { - ++current_token_; - CreateTokenCharIterator(); - } - - return !!current_token_iter_; -} - -int32_t TokenizedStringCharIterator::Get() const { - return current_token_iter_ ? current_token_iter_->get() : 0; -} - -int32_t TokenizedStringCharIterator::GetArrayPos() const { - DCHECK(current_token_iter_); - return mappings_[current_token_].start() + current_token_iter_->array_pos(); -} - -size_t TokenizedStringCharIterator::GetCharSize() const { - return current_token_iter_ ? U16_LENGTH(Get()) : 0; -} - -bool TokenizedStringCharIterator::IsFirstCharOfToken() const { - return current_token_iter_ && current_token_iter_->char_offset() == 0; -} - -TokenizedStringCharIterator::State TokenizedStringCharIterator::GetState() - const { - return State(current_token_, - current_token_iter_ ? current_token_iter_->char_offset() : 0); -} - -void TokenizedStringCharIterator::SetState(const State& state) { - current_token_ = state.token_index; - CreateTokenCharIterator(); - if (current_token_iter_) { - while (current_token_iter_->char_offset() < state.char_index) - current_token_iter_->Advance(); - } -} - -void TokenizedStringCharIterator::CreateTokenCharIterator() { - if (current_token_ == tokens_.size()) { - current_token_iter_.reset(); - return; - } - - current_token_iter_.reset( - new base::i18n::UTF16CharIterator(&tokens_[current_token_])); -} diff --git a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.h b/chromium/chrome/common/string_matching/tokenized_string_char_iterator.h deleted file mode 100644 index 76a5be6e19a..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_CHAR_ITERATOR_H_ -#define CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_CHAR_ITERATOR_H_ - -#include <stddef.h> -#include <stdint.h> - -#include <memory> - -#include "base/macros.h" -#include "chrome/common/string_matching/tokenized_string.h" - -namespace base { -namespace i18n { -class UTF16CharIterator; -} -} // namespace base - -// An UTF16 char iterator for a TokenizedString. -class TokenizedStringCharIterator { - public: - struct State { - State(); - State(size_t token_index, int char_index); - - size_t token_index; - int32_t char_index; - }; - - // Requires |tokenized| out-lives this iterator. 
- explicit TokenizedStringCharIterator(const TokenizedString& tokenized); - ~TokenizedStringCharIterator(); - - // Advances to the next char. Returns false if there is no next char. - bool NextChar(); - - // Advances to the first char of the next token. Returns false if there is - // no next token. - bool NextToken(); - - // Returns the current char if there is one. Otherwise, returns 0. - int32_t Get() const; - - // Returns the array index in original text of the tokenized string that is - // passed in constructor. - int32_t GetArrayPos() const; - - // Returns the number of UTF16 code units for the current char. - size_t GetCharSize() const; - - // Returns true if the current char is the first char of the current token. - bool IsFirstCharOfToken() const; - - // Helpers to get and restore the iterator's state. - State GetState() const; - void SetState(const State& state); - - // Returns true if the iterator is at the end. - bool end() const { return !current_token_iter_; } - - private: - void CreateTokenCharIterator(); - - const TokenizedString::Tokens& tokens_; - const TokenizedString::Mappings& mappings_; - - size_t current_token_; - std::unique_ptr<base::i18n::UTF16CharIterator> current_token_iter_; - - DISALLOW_COPY_AND_ASSIGN(TokenizedStringCharIterator); -}; - -#endif // CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_CHAR_ITERATOR_H_ diff --git a/chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc b/chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc deleted file mode 100644 index 61b5c4cfaef..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/common/string_matching/tokenized_string_char_iterator.h" - -#include <string> -#include <vector> - -#include "base/strings/string_util.h" -#include "base/strings/stringprintf.h" -#include "base/strings/utf_string_conversions.h" -#include "testing/gtest/include/gtest/gtest.h" - -namespace { - -// Returns a string represents the current state of |iter|. The state string -// has three fields. The first is the current char. The second is the offset of -// the current char in terms of the original text of the TokenizedString. The -// last one is optional and only shows up when IsFirstCharOfToken returns true. -std::string GetIterateState(const TokenizedStringCharIterator& iter) { - return base::StringPrintf( - "%s%d%s", base::UTF16ToUTF8(base::string16(1, iter.Get())).c_str(), - iter.GetArrayPos(), iter.IsFirstCharOfToken() ? "!" : ""); -} - -void TestBeyondTheEnd(TokenizedStringCharIterator* iter) { - ASSERT_TRUE(iter->end()); - ASSERT_FALSE(iter->NextChar()); - ASSERT_FALSE(iter->NextToken()); - - // Don't care what it returns, but this shouldn't crash. 
- iter->Get(); -} - -void TestEveryChar(const std::string& text, const std::string& expects) { - TokenizedString tokens(base::UTF8ToUTF16(text)); - TokenizedStringCharIterator iter(tokens); - - std::vector<std::string> results; - while (!iter.end()) { - results.push_back(GetIterateState(iter)); - iter.NextChar(); - } - - EXPECT_EQ(expects, base::JoinString(results, " ")); - TestBeyondTheEnd(&iter); -} - -void TestNextToken(const std::string& text, const std::string& expects) { - TokenizedString tokens(base::UTF8ToUTF16(text)); - TokenizedStringCharIterator iter(tokens); - - std::vector<std::string> results; - while (!iter.end()) { - results.push_back(GetIterateState(iter)); - iter.NextToken(); - } - - EXPECT_EQ(expects, base::JoinString(results, " ")); - TestBeyondTheEnd(&iter); -} - -void TestFirstTwoCharInEveryToken(const std::string& text, - const std::string& expects) { - TokenizedString tokens(base::UTF8ToUTF16(text)); - TokenizedStringCharIterator iter(tokens); - - std::vector<std::string> results; - while (!iter.end()) { - results.push_back(GetIterateState(iter)); - if (iter.NextChar()) - results.push_back(GetIterateState(iter)); - - iter.NextToken(); - } - - EXPECT_EQ(expects, base::JoinString(results, " ")); - TestBeyondTheEnd(&iter); -} - -TEST(TokenizedStringCharIteratorTest, NoTerms) { - const char* text; - - text = ""; - TestEveryChar(text, ""); - TestNextToken(text, ""); - TestFirstTwoCharInEveryToken(text, ""); - - text = "!@#$%^&*()<<<**>>>"; - TestEveryChar(text, ""); - TestNextToken(text, ""); - TestFirstTwoCharInEveryToken(text, ""); -} - -TEST(TokenizedStringCharIteratorTest, Basic) { - const char* text; - - text = "c"; - TestEveryChar(text, "c0!"); - TestNextToken(text, "c0!"); - TestFirstTwoCharInEveryToken(text, "c0!"); - - text = "Simple"; - TestEveryChar(text, "s0! i1 m2 p3 l4 e5"); - TestNextToken(text, "s0!"); - TestFirstTwoCharInEveryToken(text, "s0! i1"); - - text = "ScratchPad"; - TestEveryChar(text, "s0! c1 r2 a3 t4 c5 h6 p7! a8 d9"); - TestNextToken(text, "s0! p7!"); - TestFirstTwoCharInEveryToken(text, "s0! c1 p7! a8"); - - text = "Chess2.0"; - TestEveryChar(text, "c0! h1 e2 s3 s4 25! .6 07"); - TestNextToken(text, "c0! 25!"); - TestFirstTwoCharInEveryToken(text, "c0! h1 25! .6"); - - text = "Cut the rope"; - TestEveryChar(text, "c0! u1 t2 t4! h5 e6 r8! o9 p10 e11"); - TestNextToken(text, "c0! t4! r8!"); - TestFirstTwoCharInEveryToken(text, "c0! u1 t4! h5 r8! o9"); - - text = "AutoCAD WS"; - TestEveryChar(text, "a0! u1 t2 o3 c4! a5 d6 w8! s9"); - TestNextToken(text, "a0! c4! w8!"); - TestFirstTwoCharInEveryToken(text, "a0! u1 c4! a5 w8! s9"); - - text = "Great TweetDeck"; - TestEveryChar(text, "g0! r1 e2 a3 t4 t6! w7 e8 e9 t10 d11! e12 c13 k14"); - TestNextToken(text, "g0! t6! d11!"); - TestFirstTwoCharInEveryToken(text, "g0! r1 t6! w7 d11! e12"); - - text = "Draw-It!"; - TestEveryChar(text, "d0! r1 a2 w3 i5! t6"); - TestNextToken(text, "d0! i5!"); - TestFirstTwoCharInEveryToken(text, "d0! r1 i5! t6"); - - text = "Faxing & Signing"; - TestEveryChar(text, "f0! a1 x2 i3 n4 g5 s9! i10 g11 n12 i13 n14 g15"); - TestNextToken(text, "f0! s9!"); - TestFirstTwoCharInEveryToken(text, "f0! a1 s9! i10"); -} - -} // namespace diff --git a/chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc b/chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc deleted file mode 100644 index 0e40150fda0..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2019 The Chromium Authors. 
All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "base/strings/string16.h"
-#include "chrome/common/string_matching/tokenized_string.h"
-
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  if (size < 1 || size % 2 != 0)
-    return 0;
-
-  // Test for base::string16 if size is even.
-  base::string16 string_input16(reinterpret_cast<const base::char16*>(data),
-                                size / 2);
-  TokenizedString tokenized_string_from_string16(string_input16);
-  return 0;
-}
diff --git a/chromium/chrome/common/string_matching/tokenized_string_match.cc b/chromium/chrome/common/string_matching/tokenized_string_match.cc
deleted file mode 100644
index 0545cb1b2af..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_match.cc
+++ /dev/null
@@ -1,248 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string_match.h"
-
-#include <stddef.h>
-
-#include <cmath>
-
-#include "base/check.h"
-#include "base/i18n/string_search.h"
-#include "base/macros.h"
-#include "base/strings/string_util.h"
-#include "chrome/common/string_matching/tokenized_string_char_iterator.h"
-
-namespace {
-
-// The factors below are applied when the current char of the query matches
-// the current char of the text to be matched. Different factors are chosen
-// based on where the match happens. kIsPrefixMultiplier is used when the
-// matched portion is a prefix of both the query and the text, which implies
-// that the matched chars are at the same position in query and text. This is
-// the most preferred case, thus it has the highest score. When the current
-// char of the query does not match the current char of the text, the
-// algorithm moves to the next token in the text and tries to match from
-// there. kIsFrontOfWordMultipler will be used if the first char of that token
-// matches the current char of the query. Otherwise, the match is considered
-// weak and kIsWeakHitMultiplier is used.
-// Examples:
-// Suppose the text to be matched is 'Google Chrome'.
-// Query 'go' would yield kIsPrefixMultiplier for each char.
-// Query 'gc' would use kIsPrefixMultiplier for 'g' and
-// kIsFrontOfWordMultipler for 'c'.
-// Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
-// kIsWeakHitMultiplier for 'h'.
-// Query 'oo' does not match any prefix and would use the substring match
-// fallback, thus kIsSubstringMultiplier is used for each char.
-const double kIsPrefixMultiplier = 1.0;
-const double kIsFrontOfWordMultipler = 0.8;
-const double kIsWeakHitMultiplier = 0.6;
-const double kIsSubstringMultiplier = 0.4;
-
-// A relevance score that represents no match.
-const double kNoMatchScore = 0.0;
-
-// PrefixMatcher matches the chars of a given query as a prefix of tokens in
-// a given text, or as a prefix of the acronyms of those text tokens.
-class PrefixMatcher {
- public:
-  PrefixMatcher(const TokenizedString& query, const TokenizedString& text)
-      : query_iter_(query),
-        text_iter_(text),
-        current_match_(gfx::Range::InvalidRange()),
-        current_relevance_(kNoMatchScore) {}
-
-  // Invokes RunMatch to perform a prefix match. Uses |states_| as a stack to
-  // perform DFS (depth first search) so that all possible matches are
-  // attempted. Stops on the first full match and returns true. Otherwise,
-  // returns false to indicate no match.
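  // Worked example (illustrative only, using the constants above): for text
  // "Google Chrome" and query "gc", RunMatch() credits 'g' with
  // kIsPrefixMultiplier (1.0) and 'c' with kIsFrontOfWordMultipler (0.8), so
  // Match() succeeds with a relevance() of 1.8 and hits of {0,1} and {7,8};
  // TokenizedStringMatch::Calculate() later damps this to 1 - 0.5^1.8, which
  // is about 0.71.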
-  bool Match() {
-    while (!RunMatch()) {
-      // No match found and no more states to try. Bail out.
-      if (states_.empty()) {
-        current_relevance_ = kNoMatchScore;
-        current_hits_.clear();
-        return false;
-      }
-
-      PopState();
-
-      // Skip the restored match to try other possibilities.
-      AdvanceToNextTextToken();
-    }
-
-    if (current_match_.IsValid())
-      current_hits_.push_back(current_match_);
-
-    return true;
-  }
-
-  double relevance() const { return current_relevance_; }
-  const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
-
- private:
-  // Context record of a match.
-  struct State {
-    State() : relevance(kNoMatchScore) {}
-    State(double relevance,
-          const gfx::Range& current_match,
-          const TokenizedStringMatch::Hits& hits,
-          const TokenizedStringCharIterator& query_iter,
-          const TokenizedStringCharIterator& text_iter)
-        : relevance(relevance),
-          current_match(current_match),
-          hits(hits.begin(), hits.end()),
-          query_iter_state(query_iter.GetState()),
-          text_iter_state(text_iter.GetState()) {}
-
-    // The current score of the processed query chars.
-    double relevance;
-
-    // Current matching range.
-    gfx::Range current_match;
-
-    // Completed matching ranges of the processed query chars.
-    TokenizedStringMatch::Hits hits;
-
-    // States of the processed query and text chars.
-    TokenizedStringCharIterator::State query_iter_state;
-    TokenizedStringCharIterator::State text_iter_state;
-  };
-  typedef std::vector<State> States;
-
-  // Matches chars from the query and text one by one. For each matching
-  // char, calculates relevance and matching ranges, and records the current
-  // state so that the match can be skipped later to try other possibilities.
-  // Repeats until either iterator runs out. Returns true if the query
-  // iterator runs out, i.e. all chars in the query are matched.
-  bool RunMatch() {
-    while (!query_iter_.end() && !text_iter_.end()) {
-      if (query_iter_.Get() == text_iter_.Get()) {
-        PushState();
-
-        if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
-          current_relevance_ += kIsPrefixMultiplier;
-        else if (text_iter_.IsFirstCharOfToken())
-          current_relevance_ += kIsFrontOfWordMultipler;
-        else
-          current_relevance_ += kIsWeakHitMultiplier;
-
-        if (!current_match_.IsValid())
-          current_match_.set_start(text_iter_.GetArrayPos());
-        current_match_.set_end(text_iter_.GetArrayPos() +
-                               text_iter_.GetCharSize());
-
-        query_iter_.NextChar();
-        text_iter_.NextChar();
-      } else {
-        AdvanceToNextTextToken();
-      }
-    }
-
-    return query_iter_.end();
-  }
-
-  // Skip to the next text token and close the current match. Invoked when a
-  // mismatch happens or to skip a restored match.
- void AdvanceToNextTextToken() { - if (current_match_.IsValid()) { - current_hits_.push_back(current_match_); - current_match_ = gfx::Range::InvalidRange(); - } - - text_iter_.NextToken(); - } - - void PushState() { - states_.push_back(State(current_relevance_, current_match_, current_hits_, - query_iter_, text_iter_)); - } - - void PopState() { - DCHECK(!states_.empty()); - - State& last_match = states_.back(); - current_relevance_ = last_match.relevance; - current_match_ = last_match.current_match; - current_hits_.swap(last_match.hits); - query_iter_.SetState(last_match.query_iter_state); - text_iter_.SetState(last_match.text_iter_state); - - states_.pop_back(); - } - - TokenizedStringCharIterator query_iter_; - TokenizedStringCharIterator text_iter_; - - States states_; - gfx::Range current_match_; - - double current_relevance_; - TokenizedStringMatch::Hits current_hits_; - - DISALLOW_COPY_AND_ASSIGN(PrefixMatcher); -}; - -} // namespace - -TokenizedStringMatch::TokenizedStringMatch() : relevance_(kNoMatchScore) {} - -TokenizedStringMatch::~TokenizedStringMatch() = default; - -bool TokenizedStringMatch::Calculate(const TokenizedString& query, - const TokenizedString& text) { - relevance_ = kNoMatchScore; - hits_.clear(); - - // If there is an exact match, relevance will be 1.0 and there is only 1 hit - // that is the entire text/query. - const auto& query_text = query.text(); - const auto& text_text = text.text(); - const auto query_size = query_text.size(); - const auto text_size = text_text.size(); - if (query_size > 0 && query_size == text_size && - base::EqualsCaseInsensitiveASCII(query_text, text_text)) { - hits_.push_back(gfx::Range(0, query_size)); - relevance_ = 1.0; - return true; - } - - PrefixMatcher matcher(query, text); - if (matcher.Match()) { - relevance_ = matcher.relevance(); - hits_.assign(matcher.hits().begin(), matcher.hits().end()); - } - - // Substring match as a fallback. - if (relevance_ == kNoMatchScore) { - size_t substr_match_start = 0; - size_t substr_match_length = 0; - if (base::i18n::StringSearchIgnoringCaseAndAccents( - query.text(), text.text(), &substr_match_start, - &substr_match_length)) { - relevance_ = kIsSubstringMultiplier * substr_match_length; - hits_.push_back(gfx::Range(substr_match_start, - substr_match_start + substr_match_length)); - } - } - - // Temper the relevance score with an exponential curve. Each point of - // relevance (roughly, each keystroke) is worth less than the last. This means - // that typing a few characters of a word is enough to promote matches very - // high, with any subsequent characters being worth comparatively less. - // TODO(mgiuca): This doesn't really play well with Omnibox results, since as - // you type more characters, the app/omnibox results tend to jump over each - // other. - relevance_ = 1.0 - std::pow(0.5, relevance_); - - return relevance_ > kNoMatchScore; -} - -bool TokenizedStringMatch::Calculate(const base::string16& query, - const base::string16& text) { - const TokenizedString tokenized_query(query); - const TokenizedString tokenized_text(text); - return Calculate(tokenized_query, tokenized_text); -} diff --git a/chromium/chrome/common/string_matching/tokenized_string_match.h b/chromium/chrome/common/string_matching/tokenized_string_match.h deleted file mode 100644 index 0112ac9ab10..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string_match.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_MATCH_H_
-#define CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_MATCH_H_
-
-#include <vector>
-
-#include "base/macros.h"
-#include "base/strings/string16.h"
-#include "ui/gfx/range/range.h"
-
-class TokenizedString;
-
-// TokenizedStringMatch takes two tokenized strings: one as the text and
-// the other one as the query. It matches the query against the text,
-// calculates a relevance score in [0, 1] and marks the matched portions
-// of the text. A relevance of zero means the two are completely different
-// from each other. The higher the relevance score, the better the two
-// strings are matched. Matched portions of the text are stored as index
-// ranges.
-class TokenizedStringMatch {
- public:
-  typedef std::vector<gfx::Range> Hits;
-
-  TokenizedStringMatch();
-  ~TokenizedStringMatch();
-
-  // Calculates the relevance and hits. Returns true if the two strings are
-  // somewhat matched, i.e. the relevance score is not zero.
-  bool Calculate(const TokenizedString& query, const TokenizedString& text);
-
-  // Convenience wrapper to calculate the match from raw string input.
-  bool Calculate(const base::string16& query, const base::string16& text);
-
-  double relevance() const { return relevance_; }
-  const Hits& hits() const { return hits_; }
-
- private:
-  // Score in the range [0, 1] representing how well the query matches the
-  // text.
-  double relevance_;
-
-  // Char index ranges in |text| where matches are found.
-  Hits hits_;
-
-  DISALLOW_COPY_AND_ASSIGN(TokenizedStringMatch);
-};
-
-#endif  // CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_MATCH_H_
diff --git a/chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc b/chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc
deleted file mode 100644
index 01a23d3be1d..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string_match.h"
-
-#include <stddef.h>
-
-#include <string>
-
-#include "base/stl_util.h"
-#include "base/strings/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace {
-
-// Returns a string of |text| with the hits in |match| marked using square
-// brackets, e.g. text = "Text", match.hits = [{0,1}], returns "[T]ext".
-std::string MatchHit(const base::string16& text, - const TokenizedStringMatch& match) { - base::string16 marked = text; - - const TokenizedStringMatch::Hits& hits = match.hits(); - for (TokenizedStringMatch::Hits::const_reverse_iterator it = hits.rbegin(); - it != hits.rend(); ++it) { - const gfx::Range& hit = *it; - marked.insert(hit.end(), 1, ']'); - marked.insert(hit.start(), 1, '['); - } - - return base::UTF16ToUTF8(marked); -} - -TEST(TokenizedStringMatchTest, NotMatch) { - struct { - const char* text; - const char* query; - } kTestCases[] = { - {"", ""}, {"", "query"}, - {"text", ""}, {"!", "!@#$%^&*()<<<**>>>"}, - {"abd", "abcd"}, {"cd", "abcd"}, - }; - - TokenizedStringMatch match; - for (size_t i = 0; i < base::size(kTestCases); ++i) { - const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text)); - EXPECT_FALSE(match.Calculate(base::UTF8ToUTF16(kTestCases[i].query), text)) - << "Test case " << i << " : text=" << kTestCases[i].text - << ", query=" << kTestCases[i].query; - } -} - -TEST(TokenizedStringMatchTest, Match) { - struct { - const char* text; - const char* query; - const char* expect; - } kTestCases[] = { - {"ScratchPad", "pad", "Scratch[Pad]"}, - {"ScratchPad", "sp", "[S]cratch[P]ad"}, - {"Chess2", "che", "[Che]ss2"}, - {"Chess2", "c2", "[C]hess[2]"}, - {"Cut the rope", "cut ro", "[Cut] the [ro]pe"}, - {"Cut the rope", "cr", "[C]ut the [r]ope"}, - {"John Doe", "jdoe", "[J]ohn [Doe]"}, - {"John Doe", "johnd", "[John D]oe"}, - {"Secure Shell", "she", "Secure [She]ll"}, - {"Simple Secure Shell", "sish", "[Si]mple Secure [Sh]ell"}, - {"Netflix", "flix", "Net[flix]"}, - }; - - TokenizedStringMatch match; - for (size_t i = 0; i < base::size(kTestCases); ++i) { - const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text)); - EXPECT_TRUE(match.Calculate(base::UTF8ToUTF16(kTestCases[i].query), text)); - EXPECT_EQ(kTestCases[i].expect, MatchHit(text, match)); - } -} - -TEST(TokenizedStringMatchTest, Relevance) { - struct { - const char* text; - const char* query_low; - const char* query_high; - } kTestCases[] = { - // More matched chars are better. - {"Google Chrome", "g", "go"}, - {"Google Chrome", "go", "goo"}, - {"Google Chrome", "goo", "goog"}, - {"Google Chrome", "c", "ch"}, - {"Google Chrome", "ch", "chr"}, - // Acronym match is better than something in the middle. - {"Google Chrome", "ch", "gc"}, - // Prefix match is better than middle match and acronym match. - {"Google Chrome", "ch", "go"}, - {"Google Chrome", "gc", "go"}, - // Substring match has the lowest score. - {"Google Chrome", "oo", "gc"}, - {"Google Chrome", "oo", "go"}, - {"Google Chrome", "oo", "ch"}, - }; - - TokenizedStringMatch match_low; - TokenizedStringMatch match_high; - for (size_t i = 0; i < base::size(kTestCases); ++i) { - const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text)); - EXPECT_TRUE( - match_low.Calculate(base::UTF8ToUTF16(kTestCases[i].query_low), text)); - EXPECT_TRUE(match_high.Calculate( - base::UTF8ToUTF16(kTestCases[i].query_high), text)); - EXPECT_LT(match_low.relevance(), match_high.relevance()) - << "Test case " << i << " : text=" << kTestCases[i].text - << ", query_low=" << kTestCases[i].query_low - << ", query_high=" << kTestCases[i].query_high; - } -} - -// More specialized tests of the absolute relevance scores. (These tests are -// minimal, because they are so brittle. Changing the scoring algorithm will -// require updating this test.) 
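// Hedged aside (not part of the original test file): the expected scores
// below follow from the damping curve relevance = 1 - 0.5^raw applied in
// TokenizedStringMatch::Calculate(), where raw grows by kIsPrefixMultiplier
// (1.0) per prefix-matched character:
//   raw 1.0 -> 0.5, raw 2.0 -> 0.75, raw 3.0 -> 0.875, raw 4.0 -> 0.9375,
// which the epsilon below rounds to the 0.88 and 0.94 expectations.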
-TEST(TokenizedStringMatchTest, AbsoluteRelevance) { - const double kEpsilon = 0.006; - struct { - const char* text; - const char* query; - double expected_score; - } kTestCases[] = { - // The first few chars should increase the score extremely high. After - // that, they should count less. - // NOTE: 0.87 is a magic number, as it is the Omnibox score for a "pretty - // good" match. We want a 3-letter prefix match to be slightly above 0.87. - {"Google Chrome", "g", 0.5}, - {"Google Chrome", "go", 0.75}, - {"Google Chrome", "goo", 0.88}, - {"Google Chrome", "goog", 0.94}, - }; - - TokenizedStringMatch match; - for (size_t i = 0; i < base::size(kTestCases); ++i) { - const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text)); - EXPECT_TRUE(match.Calculate(base::UTF8ToUTF16(kTestCases[i].query), text)); - EXPECT_NEAR(match.relevance(), kTestCases[i].expected_score, kEpsilon) - << "Test case " << i << " : text=" << kTestCases[i].text - << ", query=" << kTestCases[i].query - << ", expected_score=" << kTestCases[i].expected_score; - } -} - -} // namespace diff --git a/chromium/chrome/common/string_matching/tokenized_string_unittest.cc b/chromium/chrome/common/string_matching/tokenized_string_unittest.cc deleted file mode 100644 index 51c98b2f295..00000000000 --- a/chromium/chrome/common/string_matching/tokenized_string_unittest.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2019 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/common/string_matching/tokenized_string.h" - -#include <stddef.h> - -#include "base/strings/utf_string_conversions.h" -#include "testing/gtest/include/gtest/gtest.h" - -namespace { - -base::string16 GetContent(const TokenizedString& tokenized) { - const TokenizedString::Tokens& tokens = tokenized.tokens(); - const TokenizedString::Mappings& mappings = tokenized.mappings(); - - base::string16 str; - for (size_t i = 0; i < tokens.size(); ++i) { - if (i > 0) - str += ' '; - str += tokens[i]; - str += base::UTF8ToUTF16(mappings[i].ToString()); - } - return str; -} - -TEST(TokenizedStringTest, Empty) { - base::string16 empty; - TokenizedString tokens(empty); - EXPECT_EQ(base::string16(), GetContent(tokens)); -} - -TEST(TokenizedStringTest, Basic) { - { - base::string16 text(base::UTF8ToUTF16("ScratchPad")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("scratch{0,7} pad{7,10}"), GetContent(tokens)); - } - { - base::string16 text(base::UTF8ToUTF16("Chess2.0")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("chess{0,5} 2.0{5,8}"), GetContent(tokens)); - } - { - base::string16 text(base::UTF8ToUTF16("Cut the rope")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"), - GetContent(tokens)); - } - { - base::string16 text(base::UTF8ToUTF16("AutoCAD WS")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("auto{0,4} cad{4,7} ws{8,10}"), - GetContent(tokens)); - } - { - base::string16 text(base::UTF8ToUTF16("Great TweetDeck")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweet{6,11} deck{11,15}"), - GetContent(tokens)); - } - { - base::string16 text(base::UTF8ToUTF16("Draw-It!")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("draw{0,4} it{5,7}"), GetContent(tokens)); - } - { - base::string16 text(base::UTF8ToUTF16("Faxing & Signing")); - TokenizedString tokens(text); - EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"), - 
GetContent(tokens));
-  }
-  {
-    base::string16 text(base::UTF8ToUTF16("!@#$%^&*()<<<**>>>"));
-    TokenizedString tokens(text);
-    EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(tokens));
-  }
-}
-
-} // namespace
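For reference, a minimal usage sketch of the API deleted above, assuming a checkout at the parent revision where these headers still exist. ScoreQuery is a hypothetical helper, not part of Chromium, and the 0.88 figure is taken from the AbsoluteRelevance test above.

#include <string>

#include "base/strings/utf_string_conversions.h"
#include "chrome/common/string_matching/tokenized_string_match.h"

// Sketch only: scores a query against a text using the convenience overload.
double ScoreQuery(const std::string& query, const std::string& text) {
  TokenizedStringMatch match;
  if (!match.Calculate(base::UTF8ToUTF16(query), base::UTF8ToUTF16(text)))
    return 0.0;  // No match at all.
  // For ("goo", "Google Chrome") this is about 0.88, per the unit test above;
  // match.hits() holds the matched character ranges for highlighting.
  return match.relevance();
}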