summaryrefslogtreecommitdiff
path: root/chromium/chrome/common/string_matching
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/chrome/common/string_matching')
-rw-r--r--chromium/chrome/common/string_matching/BUILD.gn34
-rw-r--r--chromium/chrome/common/string_matching/OWNERS2
-rw-r--r--chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc331
-rw-r--r--chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h100
-rw-r--r--chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc296
-rw-r--r--chromium/chrome/common/string_matching/sequence_matcher.cc198
-rw-r--r--chromium/chrome/common/string_matching/sequence_matcher.h78
-rw-r--r--chromium/chrome/common/string_matching/sequence_matcher_unittest.cc137
-rw-r--r--chromium/chrome/common/string_matching/term_break_iterator.cc72
-rw-r--r--chromium/chrome/common/string_matching/term_break_iterator.h68
-rw-r--r--chromium/chrome/common/string_matching/term_break_iterator_unittest.cc87
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string.cc43
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string.h42
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc85
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_char_iterator.h76
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc145
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc17
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_match.cc248
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_match.h49
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc149
-rw-r--r--chromium/chrome/common/string_matching/tokenized_string_unittest.cc81
21 files changed, 0 insertions, 2338 deletions
diff --git a/chromium/chrome/common/string_matching/BUILD.gn b/chromium/chrome/common/string_matching/BUILD.gn
deleted file mode 100644
index 183a9ede09b..00000000000
--- a/chromium/chrome/common/string_matching/BUILD.gn
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-import("//build/config/ui.gni")
-import("//chrome/common/features.gni")
-
-source_set("string_matching") {
- sources = [
- "fuzzy_tokenized_string_match.cc",
- "fuzzy_tokenized_string_match.h",
- "sequence_matcher.cc",
- "sequence_matcher.h",
- "term_break_iterator.cc",
- "term_break_iterator.h",
- "tokenized_string.cc",
- "tokenized_string.h",
- "tokenized_string_char_iterator.cc",
- "tokenized_string_char_iterator.h",
- "tokenized_string_match.cc",
- "tokenized_string_match.h",
- ]
-
- deps = [
- "//base",
- "//base:i18n",
- "//cc",
- ]
-
- public_deps = [
- "//base",
- "//ui/gfx",
- ]
-}
diff --git a/chromium/chrome/common/string_matching/OWNERS b/chromium/chrome/common/string_matching/OWNERS
deleted file mode 100644
index cc83b874e21..00000000000
--- a/chromium/chrome/common/string_matching/OWNERS
+++ /dev/null
@@ -1,2 +0,0 @@
-jiameng@chromium.org
-thanhdng@chromium.org
diff --git a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc b/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc
deleted file mode 100644
index f26a37650cf..00000000000
--- a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.cc
+++ /dev/null
@@ -1,331 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/fuzzy_tokenized_string_match.h"
-
-#include <algorithm>
-#include <cmath>
-#include <iterator>
-
-#include "base/i18n/case_conversion.h"
-#include "base/metrics/field_trial_params.h"
-#include "base/strings/strcat.h"
-#include "base/strings/string_util.h"
-#include "base/strings/utf_string_conversions.h"
-#include "chrome/common/string_matching/sequence_matcher.h"
-
-namespace {
-constexpr double kMinScore = 0.0;
-constexpr double kMaxScore = 1.0;
-constexpr double kFirstCharacterMatchPenalty = 0.2;
-constexpr double kPrefixMatchPenalty = 0.1;
-
-// Returns sorted tokens from a TokenizedString.
-std::vector<base::string16> ProcessAndSort(const TokenizedString& text) {
- std::vector<base::string16> result;
- for (const auto& token : text.tokens()) {
- result.emplace_back(token);
- }
- std::sort(result.begin(), result.end());
- return result;
-}
-} // namespace
-
-FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() {}
-FuzzyTokenizedStringMatch::FuzzyTokenizedStringMatch() {}
-
-double FuzzyTokenizedStringMatch::FirstCharacterMatch(
- const TokenizedString& query,
- const TokenizedString& text) {
- const base::string16 query_lower = base::i18n::ToLower(query.text());
- size_t query_index = 0;
- for (size_t text_index = 0; text_index < text.tokens().size(); text_index++) {
- if (query_index < query_lower.size() &&
- text.tokens()[text_index][0] == query_lower[query_index]) {
- query_index++;
- if (query_index == query_lower.size()) {
- // Penalizes the score using the number of text's tokens that are
- // needed.
- return std::max(kMinScore,
- kMaxScore - kFirstCharacterMatchPenalty *
- (text_index + 1 - query_lower.size()));
- }
- }
- }
- return kMinScore;
-}
-
-double FuzzyTokenizedStringMatch::PrefixMatch(const TokenizedString& query,
- const TokenizedString& text) {
- const std::vector<base::string16> query_tokens(query.tokens());
- const std::vector<base::string16> text_tokens(text.tokens());
- double match_score = kMaxScore;
- int previous_matched_index = -1;
- // For every query token, check if it is a prefix of a text token. The newly
- // matching text token must have higher index than the previous matched token.
- for (const auto& query_token : query_tokens) {
- bool matched = false;
- for (size_t text_index = previous_matched_index + 1;
- text_index < text_tokens.size(); text_index++) {
- if (query_token.size() <= text_tokens[text_index].size() &&
- query_token ==
- text_tokens[text_index].substr(0, query_token.size())) {
- matched = true;
- // Penalizes the score based on the number of skipped tokens.
- match_score -=
- kPrefixMatchPenalty * (text_index - previous_matched_index - 1);
- previous_matched_index = text_index;
- break;
- }
- }
- if (!matched) {
- return kMinScore;
- }
- }
- return std::max(kMinScore, match_score);
-}
-
-double FuzzyTokenizedStringMatch::TokenSetRatio(
- const TokenizedString& query,
- const TokenizedString& text,
- bool partial,
- double partial_match_penalty_rate,
- bool use_edit_distance) {
- std::set<base::string16> query_token(query.tokens().begin(),
- query.tokens().end());
- std::set<base::string16> text_token(text.tokens().begin(),
- text.tokens().end());
-
- std::vector<base::string16> intersection;
- std::vector<base::string16> query_diff_text;
- std::vector<base::string16> text_diff_query;
-
- // Find the intersection and the differences between two set of tokens.
- std::set_intersection(query_token.begin(), query_token.end(),
- text_token.begin(), text_token.end(),
- std::back_inserter(intersection));
- std::set_difference(query_token.begin(), query_token.end(),
- text_token.begin(), text_token.end(),
- std::back_inserter(query_diff_text));
- std::set_difference(text_token.begin(), text_token.end(), query_token.begin(),
- query_token.end(), std::back_inserter(text_diff_query));
-
- const base::string16 intersection_string =
- base::JoinString(intersection, base::UTF8ToUTF16(" "));
- const base::string16 query_rewritten =
- intersection.empty()
- ? base::JoinString(query_diff_text, base::UTF8ToUTF16(" "))
- : base::StrCat(
- {intersection_string, base::UTF8ToUTF16(" "),
- base::JoinString(query_diff_text, base::UTF8ToUTF16(" "))});
- const base::string16 text_rewritten =
- intersection.empty()
- ? base::JoinString(text_diff_query, base::UTF8ToUTF16(" "))
- : base::StrCat(
- {intersection_string, base::UTF8ToUTF16(" "),
- base::JoinString(text_diff_query, base::UTF8ToUTF16(" "))});
-
- if (partial) {
- return std::max(
- {PartialRatio(intersection_string, query_rewritten,
- partial_match_penalty_rate, use_edit_distance),
- PartialRatio(intersection_string, text_rewritten,
- partial_match_penalty_rate, use_edit_distance),
- PartialRatio(query_rewritten, text_rewritten,
- partial_match_penalty_rate, use_edit_distance)});
- }
-
- return std::max(
- {SequenceMatcher(intersection_string, query_rewritten, use_edit_distance)
- .Ratio(),
- SequenceMatcher(intersection_string, text_rewritten, use_edit_distance)
- .Ratio(),
- SequenceMatcher(query_rewritten, text_rewritten, use_edit_distance)
- .Ratio()});
-}
-
-double FuzzyTokenizedStringMatch::TokenSortRatio(
- const TokenizedString& query,
- const TokenizedString& text,
- bool partial,
- double partial_match_penalty_rate,
- bool use_edit_distance) {
- const base::string16 query_sorted =
- base::JoinString(ProcessAndSort(query), base::UTF8ToUTF16(" "));
- const base::string16 text_sorted =
- base::JoinString(ProcessAndSort(text), base::UTF8ToUTF16(" "));
-
- if (partial) {
- return PartialRatio(query_sorted, text_sorted, partial_match_penalty_rate,
- use_edit_distance);
- }
- return SequenceMatcher(query_sorted, text_sorted, use_edit_distance).Ratio();
-}
-
-double FuzzyTokenizedStringMatch::PartialRatio(
- const base::string16& query,
- const base::string16& text,
- double partial_match_penalty_rate,
- bool use_edit_distance) {
- if (query.empty() || text.empty()) {
- return kMinScore;
- }
- base::string16 shorter = query;
- base::string16 longer = text;
-
- if (shorter.size() > longer.size()) {
- shorter = text;
- longer = query;
- }
-
- const auto matching_blocks =
- SequenceMatcher(shorter, longer, use_edit_distance).GetMatchingBlocks();
- double partial_ratio = 0;
-
- for (const auto& block : matching_blocks) {
- const int long_start =
- block.pos_second_string > block.pos_first_string
- ? block.pos_second_string - block.pos_first_string
- : 0;
-
- // Penalizes the match if it is not close to the beginning of a token.
- int current = long_start - 1;
- while (current >= 0 &&
- !base::EqualsCaseInsensitiveASCII(longer.substr(current, 1),
- base::UTF8ToUTF16(" "))) {
- current--;
- }
- const double penalty =
- std::pow(partial_match_penalty_rate, long_start - current - 1);
- // TODO(crbug/990684): currently this part re-calculate the ratio for every
- // pair. Improve this to reduce latency.
- partial_ratio = std::max(
- partial_ratio,
- SequenceMatcher(shorter, longer.substr(long_start, shorter.size()),
- use_edit_distance)
- .Ratio() *
- penalty);
-
- if (partial_ratio > 0.995) {
- return kMaxScore;
- }
- }
- return partial_ratio;
-}
-
-double FuzzyTokenizedStringMatch::WeightedRatio(
- const TokenizedString& query,
- const TokenizedString& text,
- double partial_match_penalty_rate,
- bool use_edit_distance) {
- const double unbase_scale = 0.95;
- // Since query.text() and text.text() is not normalized, we use query.tokens()
- // and text.tokens() instead.
- const base::string16 query_normalized(
- base::JoinString(query.tokens(), base::UTF8ToUTF16(" ")));
- const base::string16 text_normalized(
- base::JoinString(text.tokens(), base::UTF8ToUTF16(" ")));
- double weighted_ratio =
- SequenceMatcher(query_normalized, text_normalized, use_edit_distance)
- .Ratio();
- const double length_ratio =
- static_cast<double>(
- std::max(query_normalized.size(), text_normalized.size())) /
- std::min(query_normalized.size(), text_normalized.size());
-
- // Use partial if two strings are quite different in sizes.
- const bool use_partial = length_ratio >= 1.5;
- double partial_scale = 1;
-
- if (use_partial) {
- // If one string is much much shorter than the other, set |partial_scale| to
- // be 0.6, otherwise set it to be 0.9.
- partial_scale = length_ratio > 8 ? 0.6 : 0.9;
- weighted_ratio =
- std::max(weighted_ratio,
- PartialRatio(query_normalized, text_normalized,
- partial_match_penalty_rate, use_edit_distance) *
- partial_scale);
- }
- weighted_ratio =
- std::max(weighted_ratio,
- TokenSortRatio(query, text, use_partial /*partial*/,
- partial_match_penalty_rate, use_edit_distance) *
- unbase_scale * partial_scale);
- weighted_ratio =
- std::max(weighted_ratio,
- TokenSetRatio(query, text, use_partial /*partial*/,
- partial_match_penalty_rate, use_edit_distance) *
- unbase_scale * partial_scale);
- return weighted_ratio;
-}
-
-double FuzzyTokenizedStringMatch::PrefixMatcher(const TokenizedString& query,
- const TokenizedString& text) {
- return std::max(PrefixMatch(query, text), FirstCharacterMatch(query, text));
-}
-
-bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,
- const TokenizedString& text,
- double relevance_threshold,
- bool use_prefix_only,
- bool use_weighted_ratio,
- bool use_edit_distance,
- double partial_match_penalty_rate) {
- // If there is an exact match, relevance will be 1.0 and there is only 1 hit
- // that is the entire text/query.
- const auto& query_text = query.text();
- const auto& text_text = text.text();
- const auto query_size = query_text.size();
- const auto text_size = text_text.size();
- if (query_size > 0 && query_size == text_size &&
- base::EqualsCaseInsensitiveASCII(query_text, text_text)) {
- hits_.push_back(gfx::Range(0, query_size));
- relevance_ = 1.0;
- return true;
- }
-
- // Find |hits_| using SequenceMatcher on original query and text.
- for (const auto& match :
- SequenceMatcher(query_text, text_text, use_edit_distance)
- .GetMatchingBlocks()) {
- if (match.length > 0) {
- hits_.push_back(gfx::Range(match.pos_second_string,
- match.pos_second_string + match.length));
- }
- }
-
- // If the query is much longer than the text then it's often not a match.
- if (query_size >= text_size * 2) {
- return false;
- }
-
- const double prefix_score = PrefixMatcher(query, text);
-
- if (use_prefix_only && prefix_score >= relevance_threshold) {
- // If the prefix score is already higher than |relevance_threshold|, use
- // prefix score as final score.
- relevance_ = prefix_score;
- return true;
- }
-
- if (use_weighted_ratio) {
- // If WeightedRatio is used, |relevance_| is the average of WeightedRatio
- // and PrefixMatcher scores.
- relevance_ = (WeightedRatio(query, text, partial_match_penalty_rate,
- use_edit_distance) +
- prefix_score) /
- 2;
- } else {
- // Use simple algorithm to calculate match ratio.
- relevance_ =
- (SequenceMatcher(base::i18n::ToLower(query_text),
- base::i18n::ToLower(text_text), use_edit_distance)
- .Ratio() +
- prefix_score) /
- 2;
- }
-
- return relevance_ >= relevance_threshold;
-}
diff --git a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h b/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h
deleted file mode 100644
index 01bfbde6078..00000000000
--- a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match.h
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_
-#define CHROME_COMMON_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_
-
-#include "base/gtest_prod_util.h"
-#include "base/macros.h"
-#include "chrome/common/string_matching/tokenized_string.h"
-#include "ui/gfx/range/range.h"
-
-// FuzzyTokenizedStringMatch takes two tokenized strings: one as the text and
-// the other one as the query. It matches the query against the text,
-// calculates a relevance score between [0, 1] and marks the matched portions
-// of text. A relevance of zero means the two are completely different to each
-// other. The higher the relevance score, the better the two strings are
-// matched. Matched portions of text are stored as index ranges.
-// TODO(crbug.com/1018613): each of these functions have too many input params,
-// we should revise the structure and remove unnecessary ones.
-class FuzzyTokenizedStringMatch {
- public:
- typedef std::vector<gfx::Range> Hits;
-
- FuzzyTokenizedStringMatch();
- ~FuzzyTokenizedStringMatch();
-
- // Check if the query only contains first characters of the text,
- // e.g. "coc" is a match of "Clash of Clan". Range of the score is [0, 1].
- static double FirstCharacterMatch(const TokenizedString& query,
- const TokenizedString& text);
-
- // Check if tokens of query are prefixes of text's tokens. Range of score is
- // [0, 1].
- static double PrefixMatch(const TokenizedString& query,
- const TokenizedString& text);
-
- // TokenSetRatio takes two sets of tokens, finds their intersection and
- // differences. From the intersection and differences, it rewrites the |query|
- // and |text| and find the similarity ratio between them. This function
- // assumes that TokenizedString is already normalized (converted to lower
- // case). Duplicates tokens will be removed for ratio computation.
- static double TokenSetRatio(const TokenizedString& query,
- const TokenizedString& text,
- bool partial,
- double partial_match_penalty_rate,
- bool use_edit_distance);
-
- // TokenSortRatio takes two set of tokens, sorts them and find the similarity
- // between two sorted strings. This function assumes that TokenizedString is
- // already normalized (converted to lower case)
- static double TokenSortRatio(const TokenizedString& query,
- const TokenizedString& text,
- bool partial,
- double partial_match_penalty_rate,
- bool use_edit_distance);
-
- // Finds the best ratio of shorter text with a part of longer text.
- // This function assumes that TokenizedString is already normalized (converted
- // to lower case). The return score is in range of [0, 1].
- static double PartialRatio(const base::string16& query,
- const base::string16& text,
- double partial_match_penalty_rate,
- bool use_edit_distance);
-
- // Combines scores from different ratio functions. This function assumes that
- // TokenizedString is already normalized (converted to lower cases).
- // The return score is in range of [0, 1].
- static double WeightedRatio(const TokenizedString& query,
- const TokenizedString& text,
- double partial_match_penalty_rate,
- bool use_edit_distance);
- // Since prefix match should always be favored over other matches, this
- // function is dedicated to calculate a prefix match score in range of [0, 1].
- // This score has two components: first character match and whole prefix
- // match.
- static double PrefixMatcher(const TokenizedString& query,
- const TokenizedString& text);
-
- // Calculates the relevance of two strings. Returns true if two strings are
- // somewhat matched, i.e. relevance score is greater than a threshold.
- bool IsRelevant(const TokenizedString& query,
- const TokenizedString& text,
- double relevance_threshold,
- bool use_prefix_only,
- bool use_weighted_ratio,
- bool use_edit_distance,
- double partial_match_penalty_rate);
- double relevance() const { return relevance_; }
- const Hits& hits() const { return hits_; }
-
- private:
- // Score in range of [0,1] representing how well the query matches the text.
- double relevance_ = 0;
- Hits hits_;
-
- DISALLOW_COPY_AND_ASSIGN(FuzzyTokenizedStringMatch);
-};
-
-#endif // CHROME_COMMON_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_
diff --git a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc b/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc
deleted file mode 100644
index 9413a36cb9c..00000000000
--- a/chromium/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/fuzzy_tokenized_string_match.h"
-
-#include "base/macros.h"
-#include "base/strings/utf_string_conversions.h"
-#include "chrome/common/string_matching/sequence_matcher.h"
-#include "chrome/common/string_matching/tokenized_string.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace {
-constexpr double kPartialMatchPenaltyRate = 0.9;
-
-class FuzzyTokenizedStringMatchTest : public testing::Test {};
-
-// TODO(crbug.com/1018613): update the tests once params are consolidated.
-TEST_F(FuzzyTokenizedStringMatchTest, PartialRatioTest) {
- FuzzyTokenizedStringMatch match;
- EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("abcde"),
- base::UTF8ToUTF16("ababcXXXbcdeY"),
- kPartialMatchPenaltyRate, false),
- 0.6, 0.01);
- EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("big string"),
- base::UTF8ToUTF16("strength"),
- kPartialMatchPenaltyRate, false),
- 0.71, 0.01);
- EXPECT_EQ(match.PartialRatio(base::UTF8ToUTF16("abc"), base::UTF8ToUTF16(""),
- kPartialMatchPenaltyRate, false),
- 0);
- EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("different in order"),
- base::UTF8ToUTF16("order text"),
- kPartialMatchPenaltyRate, false),
- 0.67, 0.01);
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, TokenSetRatioTest) {
- FuzzyTokenizedStringMatch match;
- {
- base::string16 query(base::UTF8ToUTF16("order different in"));
- base::string16 text(base::UTF8ToUTF16("text order"));
- EXPECT_EQ(match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- true, kPartialMatchPenaltyRate, false),
- 1);
- EXPECT_NEAR(
- match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.67, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("short text"));
- base::string16 text(
- base::UTF8ToUTF16("this text is really really really long"));
- EXPECT_EQ(match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- true, kPartialMatchPenaltyRate, false),
- 1);
- EXPECT_NEAR(
- match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.57, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("common string"));
- base::string16 text(base::UTF8ToUTF16("nothing is shared"));
- EXPECT_NEAR(
- match.TokenSetRatio(TokenizedString(query), TokenizedString(text), true,
- kPartialMatchPenaltyRate, false),
- 0.38, 0.01);
- EXPECT_NEAR(
- match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.33, 0.01);
- }
- {
- base::string16 query(
- base::UTF8ToUTF16("token shared token same shared same"));
- base::string16 text(base::UTF8ToUTF16("token shared token text text long"));
- EXPECT_EQ(match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- true, kPartialMatchPenaltyRate, false),
- 1);
- EXPECT_NEAR(
- match.TokenSetRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.83, 0.01);
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, TokenSortRatioTest) {
- FuzzyTokenizedStringMatch match;
- {
- base::string16 query(base::UTF8ToUTF16("order different in"));
- base::string16 text(base::UTF8ToUTF16("text order"));
- EXPECT_NEAR(
- match.TokenSortRatio(TokenizedString(query), TokenizedString(text),
- true, kPartialMatchPenaltyRate, false),
- 0.67, 0.01);
- EXPECT_NEAR(
- match.TokenSortRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.36, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("short text"));
- base::string16 text(
- base::UTF8ToUTF16("this text is really really really long"));
- EXPECT_EQ(
- match.TokenSortRatio(TokenizedString(query), TokenizedString(text),
- true, kPartialMatchPenaltyRate, false),
- 0.5 * std::pow(0.9, 1));
- EXPECT_NEAR(
- match.TokenSortRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.33, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("common string"));
- base::string16 text(base::UTF8ToUTF16("nothing is shared"));
- EXPECT_NEAR(
- match.TokenSortRatio(TokenizedString(query), TokenizedString(text),
- true, kPartialMatchPenaltyRate, false),
- 0.38, 0.01);
- EXPECT_NEAR(
- match.TokenSortRatio(TokenizedString(query), TokenizedString(text),
- false, kPartialMatchPenaltyRate, false),
- 0.33, 0.01);
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, WeightedRatio) {
- FuzzyTokenizedStringMatch match;
- {
- base::string16 query(base::UTF8ToUTF16("anonymous"));
- base::string16 text(base::UTF8ToUTF16("famous"));
- EXPECT_NEAR(
- match.WeightedRatio(TokenizedString(query), TokenizedString(text),
- kPartialMatchPenaltyRate, false),
- 0.67, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("Clash.of.clan"));
- base::string16 text(base::UTF8ToUTF16("ClashOfTitan"));
- EXPECT_NEAR(
- match.WeightedRatio(TokenizedString(query), TokenizedString(text),
- kPartialMatchPenaltyRate, false),
- 0.81, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("final fantasy"));
- base::string16 text(base::UTF8ToUTF16("finalfantasy"));
- EXPECT_NEAR(
- match.WeightedRatio(TokenizedString(query), TokenizedString(text),
- kPartialMatchPenaltyRate, false),
- 0.96, 0.01);
- }
- {
- base::string16 query(base::UTF8ToUTF16("short text!!!"));
- base::string16 text(
- base::UTF8ToUTF16("this sentence is much much much much much longer "
- "than the text before"));
- EXPECT_NEAR(
- match.WeightedRatio(TokenizedString(query), TokenizedString(text),
- kPartialMatchPenaltyRate, false),
- 0.85, 0.01);
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, FirstCharacterMatchTest) {
- {
- base::string16 query(base::UTF8ToUTF16("COC"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
- TokenizedString(query), TokenizedString(text)),
- 1.0);
- }
- {
- base::string16 query(base::UTF8ToUTF16("CC"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
- TokenizedString(query), TokenizedString(text)),
- 0.8);
- }
- {
- base::string16 query(base::UTF8ToUTF16("C o C"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
- TokenizedString(query), TokenizedString(text)),
- 0.0);
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, PrefixMatchTest) {
- {
- base::string16 query(base::UTF8ToUTF16("clas"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
- TokenizedString(text)),
- 1.0);
- }
- {
- base::string16 query(base::UTF8ToUTF16("clash clan"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
- TokenizedString(text)),
- 0.9);
- }
- {
- base::string16 query(base::UTF8ToUTF16("c o c"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
- TokenizedString(text)),
- 1.0);
- }
- {
- base::string16 query(base::UTF8ToUTF16("clam"));
- base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
- EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
- TokenizedString(text)),
- 0.0);
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, ParamThresholdTest1) {
- FuzzyTokenizedStringMatch match;
- {
- base::string16 query(base::UTF8ToUTF16("anonymous"));
- base::string16 text(base::UTF8ToUTF16("famous"));
- EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.4, false, true, false,
- kPartialMatchPenaltyRate));
- }
- {
- base::string16 query(base::UTF8ToUTF16("CC"));
- base::string16 text(base::UTF8ToUTF16("Clash Of Clan"));
- EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.4, false, true, false,
- kPartialMatchPenaltyRate));
- }
- {
- base::string16 query(base::UTF8ToUTF16("Clash.of.clan"));
- base::string16 text(base::UTF8ToUTF16("ClashOfTitan"));
- EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.4, false, true, false,
- kPartialMatchPenaltyRate));
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, ParamThresholdTest2) {
- FuzzyTokenizedStringMatch match;
- {
- base::string16 query(base::UTF8ToUTF16("anonymous"));
- base::string16 text(base::UTF8ToUTF16("famous"));
- EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.5, false, true, false,
- kPartialMatchPenaltyRate));
- }
- {
- base::string16 query(base::UTF8ToUTF16("CC"));
- base::string16 text(base::UTF8ToUTF16("Clash Of Clan"));
- EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.5, false, true, false,
- kPartialMatchPenaltyRate));
- }
- {
- base::string16 query(base::UTF8ToUTF16("Clash.of.clan"));
- base::string16 text(base::UTF8ToUTF16("ClashOfTitan"));
- EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.5, false, true, false,
- kPartialMatchPenaltyRate));
- }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, OtherParamTest) {
- FuzzyTokenizedStringMatch match;
- base::string16 query(base::UTF8ToUTF16("anonymous"));
- base::string16 text(base::UTF8ToUTF16("famous"));
- EXPECT_FALSE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.35, false, false, true,
- kPartialMatchPenaltyRate));
- EXPECT_NEAR(match.relevance(), 0.33 / 2, 0.01);
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, ExactTextMatchTest) {
- FuzzyTokenizedStringMatch match;
- base::string16 query(base::UTF8ToUTF16("yat"));
- base::string16 text(base::UTF8ToUTF16("YaT"));
- EXPECT_TRUE(match.IsRelevant(TokenizedString(query), TokenizedString(text),
- 0.35, false, false, true,
- kPartialMatchPenaltyRate));
- EXPECT_DOUBLE_EQ(match.relevance(), 1.0);
- EXPECT_EQ(match.hits().size(), 1u);
- EXPECT_EQ(match.hits()[0].start(), 0u);
- EXPECT_EQ(match.hits()[0].end(), 3u);
-}
-
-} // namespace
diff --git a/chromium/chrome/common/string_matching/sequence_matcher.cc b/chromium/chrome/common/string_matching/sequence_matcher.cc
deleted file mode 100644
index 0d42146f087..00000000000
--- a/chromium/chrome/common/string_matching/sequence_matcher.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/sequence_matcher.h"
-
-#include <algorithm>
-#include <queue>
-
-#include "base/check_op.h"
-
-namespace {
-using Match = SequenceMatcher::Match;
-using Matches = std::vector<Match>;
-
-bool CompareMatches(const Match& m1, const Match& m2) {
- return m1.pos_first_string < m2.pos_first_string;
-}
-} // namespace
-
-SequenceMatcher::Match::Match() = default;
-SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
- : pos_first_string(pos_first), pos_second_string(pos_second), length(len) {
- DCHECK_GE(pos_first_string, 0);
- DCHECK_GE(pos_second_string, 0);
- DCHECK_GE(length, 0);
-}
-
-SequenceMatcher::SequenceMatcher(const base::string16& first_string,
- const base::string16& second_string,
- bool use_edit_distance)
- : first_string_(first_string),
- second_string_(second_string),
- dp_common_string_(second_string.size() + 1, 0) {
- DCHECK(!first_string_.empty() || !second_string_.empty());
-
- for (size_t i = 0; i < second_string_.size(); i++) {
- char_to_positions_[second_string_[i]].emplace_back(i);
- }
- use_edit_distance_ = use_edit_distance;
-}
-
-Match SequenceMatcher::FindLongestMatch(int first_start,
- int first_end,
- int second_start,
- int second_end) {
- Match match(first_start, second_start, 0);
-
- // These two vectors are used to do "fast update".
- // |dp_values_to_erase| contains the values should be erased from
- // |dp_common_string_|.
- // |dp_values_to_affect| contains the values should be updated from
- // |dp_common_string_|.
- std::vector<std::pair<int, int>> dp_values_to_erase;
- std::vector<std::pair<int, int>> dp_values_to_affect;
-
- for (int i = first_start; i < first_end; i++) {
- dp_values_to_affect.clear();
- for (auto j : char_to_positions_[first_string_[i]]) {
- if (j < second_start) {
- continue;
- }
- if (j >= second_end) {
- break;
- }
- // dp_commong_string_[j + 1] is length of longest common substring
- // ends at first_string_[i] and second_string_[j + 1]
- const int length = dp_common_string_[j] + 1;
- dp_values_to_affect.emplace_back(j + 1, length);
- if (length > match.length) {
- match.pos_first_string = i - length + 1;
- match.pos_second_string = j - length + 1;
- match.length = length;
- }
- }
- // Updates dp_common_string_
- for (auto const& element : dp_values_to_erase) {
- dp_common_string_[element.first] = 0;
- }
- for (auto const& element : dp_values_to_affect) {
- dp_common_string_[element.first] = element.second;
- }
- std::swap(dp_values_to_erase, dp_values_to_affect);
- }
- // Erases all updated value for the next call.
- std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0);
-
- return match;
-}
-
-Matches SequenceMatcher::GetMatchingBlocks() {
- if (!matching_blocks_.empty()) {
- return matching_blocks_;
- }
-
- // This queue contains a tuple of 4 integers that represent 2 substrings to
- // find the longest match in the following order: first_start, first_end,
- // second_start, second_end.
- std::queue<std::tuple<int, int, int, int>> queue_block;
- queue_block.emplace(0, first_string_.size(), 0, second_string_.size());
-
- // Find all matching blocks recursively.
- while (!queue_block.empty()) {
- int first_start, first_end, second_start, second_end;
- std::tie(first_start, first_end, second_start, second_end) =
- queue_block.front();
- queue_block.pop();
- const Match match =
- FindLongestMatch(first_start, first_end, second_start, second_end);
- if (match.length > 0) {
- if (first_start < match.pos_first_string &&
- second_start < match.pos_second_string) {
- queue_block.emplace(first_start, match.pos_first_string, second_start,
- match.pos_second_string);
- }
- if (match.pos_first_string + match.length < first_end &&
- match.pos_second_string + match.length < second_end) {
- queue_block.emplace(match.pos_first_string + match.length, first_end,
- match.pos_second_string + match.length, second_end);
- }
- matching_blocks_.push_back(match);
- }
- }
-
- matching_blocks_.push_back(
- Match(first_string_.size(), second_string_.size(), 0));
- sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches);
- return matching_blocks_;
-}
-
-int SequenceMatcher::EditDistance() {
- // If edit distance is already calculated
- if (edit_distance_ >= 0) {
- return edit_distance_;
- }
-
- const int len_first = first_string_.size();
- const int len_second = second_string_.size();
- if (len_first == 0 || len_second == 0) {
- edit_distance_ = std::max(len_first, len_second);
- return edit_distance_;
- }
-
- // Memory for the dynamic programming:
- // dp[i + 1][j + 1] is the edit distane of first |i| characters of
- // |first_string_| and first |j| characters of |second_string_|
- int dp[len_first + 1][len_second + 1];
-
- // Initialize memory
- for (int i = 0; i < len_first + 1; i++) {
- dp[i][0] = i;
- }
- for (int j = 0; j < len_second + 1; j++) {
- dp[0][j] = j;
- }
-
- // Calculate the edit distance
- for (int i = 1; i < len_first + 1; i++) {
- for (int j = 1; j < len_second + 1; j++) {
- const int cost = first_string_[i - 1] == second_string_[j - 1] ? 0 : 1;
- // Insertion and deletion
- dp[i][j] = std::min(dp[i - 1][j], dp[i][j - 1]) + 1;
- // Substitution
- dp[i][j] = std::min(dp[i][j], dp[i - 1][j - 1] + cost);
- // Transposition
- if (i > 1 && j > 1 && first_string_[i - 2] == second_string_[j - 1] &&
- first_string_[i - 1] == second_string_[j - 2]) {
- dp[i][j] = std::min(dp[i][j], dp[i - 2][j - 2] + cost);
- }
- }
- }
- edit_distance_ = dp[len_first][len_second];
- return edit_distance_;
-}
-
-double SequenceMatcher::Ratio() {
- if (use_edit_distance_) {
- if (edit_distance_ratio_ < 0) {
- const int edit_distance = EditDistance();
- edit_distance_ratio_ = std::max(
- 0.0, 1.0 - static_cast<double>(edit_distance) * 2 /
- (first_string_.size() + second_string_.size()));
- }
- return edit_distance_ratio_;
- }
-
- // Uses block matching to calculate ratio.
- if (block_matching_ratio_ < 0) {
- int sum_match = 0;
- const int sum_length = first_string_.size() + second_string_.size();
- DCHECK_NE(sum_length, 0);
- for (const auto& match : GetMatchingBlocks()) {
- sum_match += match.length;
- }
- block_matching_ratio_ = 2.0 * sum_match / sum_length;
- }
- return block_matching_ratio_;
-}
diff --git a/chromium/chrome/common/string_matching/sequence_matcher.h b/chromium/chrome/common/string_matching/sequence_matcher.h
deleted file mode 100644
index 71bc10b6892..00000000000
--- a/chromium/chrome/common/string_matching/sequence_matcher.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_SEQUENCE_MATCHER_H_
-#define CHROME_COMMON_STRING_MATCHING_SEQUENCE_MATCHER_H_
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "base/logging.h"
-#include "base/macros.h"
-#include "base/strings/string16.h"
-
-// Performs the calculation of similarity level between 2 strings. This class's
-// functionality is inspired by python's difflib.SequenceMatcher library.
-// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
-class SequenceMatcher {
- public:
- // Representing a common substring between |first_string_| and
- // |second_string_|.
- struct Match {
- Match();
- Match(int pos_first, int pos_second, int len);
- // Starting position of the common substring in |first_string_|.
- int pos_first_string;
- // Starting position of the common substring in |second_string_|.
- int pos_second_string;
- // Length of the common substring.
- int length;
- };
- SequenceMatcher(const base::string16& first_string,
- const base::string16& second_string,
- bool use_edit_distance);
-
- ~SequenceMatcher() = default;
-
- // Calculates similarity ratio of |first_string_| and |second_string_|.
- double Ratio();
- // Calculates the Damerau–Levenshtein distance between |first_string_| and
- // |second_string_|.
- // See https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance for more
- // details.
- int EditDistance();
- // Finds the longest common substring between
- // |first_string_[first_start:first_end]| and
- // |second_string_[second_start:second_end]|.
- Match FindLongestMatch(int first_start,
- int first_end,
- int second_start,
- int second_end);
- // Get all matching blocks of |first_string_| and |second_string_|.
- // All blocks will be sorted by starting position of them in the
- // |first_string_|. The last matching block will always be
- // Match(first_string_.size(), second_string_.size(), 0).
- std::vector<Match> GetMatchingBlocks();
-
- private:
- base::string16 first_string_;
- base::string16 second_string_;
- double edit_distance_ratio_ = -1.0;
- double block_matching_ratio_ = -1.0;
- std::vector<Match> matching_blocks_;
-
- // Controls whether to use edit distance to calculate ratio.
- bool use_edit_distance_;
- int edit_distance_ = -1;
- // For each character |c| in |second_string_|, this variable
- // |char_to_positions_| stores all positions where |c| occurs in
- // |second_string_|.
- std::unordered_map<char, std::vector<int>> char_to_positions_;
- // Memory for dynamic programming algorithm used in FindLongestMatch().
- std::vector<int> dp_common_string_;
- DISALLOW_COPY_AND_ASSIGN(SequenceMatcher);
-};
-
-#endif // CHROME_COMMON_STRING_MATCHING_SEQUENCE_MATCHER_H_
diff --git a/chromium/chrome/common/string_matching/sequence_matcher_unittest.cc b/chromium/chrome/common/string_matching/sequence_matcher_unittest.cc
deleted file mode 100644
index c06fcfc6452..00000000000
--- a/chromium/chrome/common/string_matching/sequence_matcher_unittest.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/sequence_matcher.h"
-
-#include "base/macros.h"
-#include "base/strings/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace {
-constexpr bool kDefaultUseEditDistance = false;
-
-using Match = SequenceMatcher::Match;
-bool MatchEqual(const Match& match1, const Match& match2) {
- return match1.pos_first_string == match2.pos_first_string &&
- match1.pos_second_string == match2.pos_second_string &&
- match1.length == match2.length;
-}
-
-class SequenceMatcherTest : public testing::Test {};
-
-TEST_F(SequenceMatcherTest, TestEditDistance) {
- // Transposition
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"),
- base::UTF8ToUTF16("abdc"), kDefaultUseEditDistance)
- .EditDistance(),
- 1);
-
- // Deletion
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcde"),
- base::UTF8ToUTF16("abcd"), kDefaultUseEditDistance)
- .EditDistance(),
- 1);
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("12"), base::UTF8ToUTF16(""),
- kDefaultUseEditDistance)
- .EditDistance(),
- 2);
-
- // Insertion
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abc"),
- base::UTF8ToUTF16("abxbc"), kDefaultUseEditDistance)
- .EditDistance(),
- 2);
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abxbc"),
- kDefaultUseEditDistance)
- .EditDistance(),
- 5);
-
- // Substitution
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("book"),
- base::UTF8ToUTF16("back"), kDefaultUseEditDistance)
- .EditDistance(),
- 2);
-
- // Combination
- ASSERT_EQ(
- SequenceMatcher(base::UTF8ToUTF16("caclulation"),
- base::UTF8ToUTF16("calculator"), kDefaultUseEditDistance)
- .EditDistance(),
- 3);
- ASSERT_EQ(
- SequenceMatcher(base::UTF8ToUTF16("sunday"),
- base::UTF8ToUTF16("saturday"), kDefaultUseEditDistance)
- .EditDistance(),
- 3);
-}
-
-TEST_F(SequenceMatcherTest, TestFindLongestMatch) {
- SequenceMatcher sequence_match(base::UTF8ToUTF16("miscellanious"),
- base::UTF8ToUTF16("miscellaneous"),
- kDefaultUseEditDistance);
- ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(0, 13, 0, 13),
- Match(0, 0, 9)));
- ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(7, 13, 7, 13),
- Match(10, 10, 3)));
-
- ASSERT_TRUE(MatchEqual(
- SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abcd"),
- kDefaultUseEditDistance)
- .FindLongestMatch(0, 0, 0, 4),
- Match(0, 0, 0)));
- ASSERT_TRUE(MatchEqual(
- SequenceMatcher(base::UTF8ToUTF16("abababbababa"),
- base::UTF8ToUTF16("ababbaba"), kDefaultUseEditDistance)
- .FindLongestMatch(0, 12, 0, 8),
- Match(2, 0, 8)));
- ASSERT_TRUE(MatchEqual(
- SequenceMatcher(base::UTF8ToUTF16("aaaaaa"), base::UTF8ToUTF16("aaaaa"),
- kDefaultUseEditDistance)
- .FindLongestMatch(0, 6, 0, 5),
- Match(0, 0, 5)));
-}
-
-TEST_F(SequenceMatcherTest, TestGetMatchingBlocks) {
- SequenceMatcher sequence_match(
- base::UTF8ToUTF16("This is a demo sentence!!!"),
- base::UTF8ToUTF16("This demo sentence is good!!!"),
- kDefaultUseEditDistance);
- const std::vector<Match> true_matches = {Match(0, 0, 4), Match(9, 4, 14),
- Match(23, 26, 3), Match(26, 29, 0)};
- const std::vector<Match> matches = sequence_match.GetMatchingBlocks();
- ASSERT_EQ(matches.size(), 4ul);
- for (int i = 0; i < 4; i++) {
- ASSERT_TRUE(MatchEqual(matches[i], true_matches[i]));
- }
-}
-
-TEST_F(SequenceMatcherTest, TestSequenceMatcherRatio) {
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"),
- base::UTF8ToUTF16("adbc"), kDefaultUseEditDistance)
- .Ratio(),
- 0.75);
- ASSERT_EQ(
- SequenceMatcher(base::UTF8ToUTF16("white cats"),
- base::UTF8ToUTF16("cats white"), kDefaultUseEditDistance)
- .Ratio(),
- 0.5);
-}
-
-TEST_F(SequenceMatcherTest, TestEditDistanceRatio) {
- ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"),
- base::UTF8ToUTF16("adbc"), true)
- .Ratio(),
- 0.5);
- EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("white cats"),
- base::UTF8ToUTF16("cats white"), true)
- .Ratio(),
- 0.2, 0.01);
-
- // Totally different
- EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("dog"),
- base::UTF8ToUTF16("elphant"), true)
- .Ratio(),
- 0.0, 0.01);
-}
-} // namespace
diff --git a/chromium/chrome/common/string_matching/term_break_iterator.cc b/chromium/chrome/common/string_matching/term_break_iterator.cc
deleted file mode 100644
index ae2f3b834ff..00000000000
--- a/chromium/chrome/common/string_matching/term_break_iterator.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/term_break_iterator.h"
-
-#include "base/check.h"
-#include "base/i18n/char_iterator.h"
-#include "base/notreached.h"
-#include "base/strings/string_util.h"
-#include "third_party/icu/source/common/unicode/uchar.h"
-
-TermBreakIterator::TermBreakIterator(const base::string16& word)
- : word_(word),
- prev_(npos),
- pos_(0),
- iter_(new base::i18n::UTF16CharIterator(&word)),
- state_(STATE_START) {}
-
-TermBreakIterator::~TermBreakIterator() = default;
-
-bool TermBreakIterator::Advance() {
- // 2D matrix that defines term boundaries. Each row represents current state.
- // Each col represents new state from input char. Cells with true value
- // represents a term boundary.
- const bool kBoundary[][STATE_LAST] = {
- // START NUMBER UPPER LOWER CHAR
- {false, false, false, false, false}, // START
- {false, false, true, true, true}, // NUMBER
- {false, true, false, false, true}, // UPPER
- {false, true, true, false, true}, // LOWER
- {false, true, true, true, false}, // CHAR
- };
-
- while (iter_->Advance()) {
- const State new_state = GetNewState(word_[iter_->array_pos()]);
- const bool is_boundary = kBoundary[state_][new_state];
- state_ = new_state;
- if (is_boundary)
- break;
- }
-
- prev_ = pos_;
- pos_ = iter_->array_pos();
-
- return prev_ != pos_ || !iter_->end();
-}
-
-const base::string16 TermBreakIterator::GetCurrentTerm() const {
- DCHECK(prev_ != npos && pos_ != npos);
- return word_.substr(prev_, pos_ - prev_);
-}
-
-TermBreakIterator::State TermBreakIterator::GetNewState(base::char16 ch) {
- if (base::IsAsciiDigit(ch) || ch == '.' || ch == ',')
- return STATE_NUMBER;
-
- const bool is_upper = !!u_isUUppercase(ch);
- const bool is_lower = !!u_isULowercase(ch);
-
- if (is_upper && is_lower) {
- NOTREACHED() << "Invalid state for ch=" << ch;
- return STATE_CHAR;
- }
-
- if (is_upper)
- return STATE_UPPER;
- if (is_lower)
- return STATE_LOWER;
-
- return STATE_CHAR;
-}
diff --git a/chromium/chrome/common/string_matching/term_break_iterator.h b/chromium/chrome/common/string_matching/term_break_iterator.h
deleted file mode 100644
index 80d069acc3c..00000000000
--- a/chromium/chrome/common/string_matching/term_break_iterator.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
-#define CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
-
-#include <stddef.h>
-
-#include <memory>
-
-#include "base/macros.h"
-#include "base/strings/string16.h"
-
-namespace base {
-namespace i18n {
-class UTF16CharIterator;
-}
-} // namespace base
-
-// TermBreakIterator breaks terms out of a word. Terms are broken on
-// camel case boundaries and alpha/number boundaries. Numbers are defined
-// as [0-9\.,]+.
-// e.g.
-// CamelCase -> Camel, Case
-// Python2.7 -> Python, 2.7
-class TermBreakIterator {
- public:
- // Note that |word| must out live this iterator.
- explicit TermBreakIterator(const base::string16& word);
- ~TermBreakIterator();
-
- // Advance to the next term. Returns false if at the end of the word.
- bool Advance();
-
- // Returns the current term, which is the substr of |word_| in range
- // [prev_, pos_).
- const base::string16 GetCurrentTerm() const;
-
- size_t prev() const { return prev_; }
- size_t pos() const { return pos_; }
-
- static const size_t npos = static_cast<size_t>(-1);
-
- private:
- enum State {
- STATE_START, // Initial state
- STATE_NUMBER, // Current char is a number [0-9\.,].
- STATE_UPPER, // Current char is upper case.
- STATE_LOWER, // Current char is lower case.
- STATE_CHAR, // Current char has no case, e.g. a cjk char.
- STATE_LAST,
- };
-
- // Returns new state for given |ch|.
- State GetNewState(base::char16 ch);
-
- const base::string16& word_;
- size_t prev_;
- size_t pos_;
-
- std::unique_ptr<base::i18n::UTF16CharIterator> iter_;
- State state_;
-
- DISALLOW_COPY_AND_ASSIGN(TermBreakIterator);
-};
-
-#endif // CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
diff --git a/chromium/chrome/common/string_matching/term_break_iterator_unittest.cc b/chromium/chrome/common/string_matching/term_break_iterator_unittest.cc
deleted file mode 100644
index 8ff0f2f8766..00000000000
--- a/chromium/chrome/common/string_matching/term_break_iterator_unittest.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/term_break_iterator.h"
-
-#include "base/strings/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-using base::UTF8ToUTF16;
-
-namespace {
-
-TEST(TermBreakIteratorTest, EmptyWord) {
- base::string16 empty;
- TermBreakIterator iter(empty);
- EXPECT_FALSE(iter.Advance());
-}
-
-TEST(TermBreakIteratorTest, Simple) {
- base::string16 word(UTF8ToUTF16("simple"));
- TermBreakIterator iter(word);
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("simple"), iter.GetCurrentTerm());
- EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
-}
-
-TEST(TermBreakIteratorTest, CamelCase) {
- base::string16 word(UTF8ToUTF16("CamelCase"));
- TermBreakIterator iter(word);
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("Camel"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("Case"), iter.GetCurrentTerm());
- EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
-}
-
-TEST(TermBreakIteratorTest, LowerToUpper) {
- base::string16 word(UTF8ToUTF16("lowerToUpper"));
- TermBreakIterator iter(word);
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("lower"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("To"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("Upper"), iter.GetCurrentTerm());
- EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
-}
-
-TEST(TermBreakIteratorTest, AlphaNumber) {
- base::string16 word(UTF8ToUTF16("Chromium26.0.0.0"));
- TermBreakIterator iter(word);
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("Chromium"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("26.0.0.0"), iter.GetCurrentTerm());
- EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
-}
-
-TEST(TermBreakIteratorTest, StartsWithNumber) {
- base::string16 word(UTF8ToUTF16("123startWithNumber"));
- TermBreakIterator iter(word);
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("123"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("start"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("With"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("Number"), iter.GetCurrentTerm());
- EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
-}
-
-TEST(TermBreakIteratorTest, CaseAndNoCase) {
- // "English" + two Chinese chars U+4E2D U+6587 + "Word"
- base::string16 word(UTF8ToUTF16("English\xe4\xb8\xad\xe6\x96\x87Word"));
- TermBreakIterator iter(word);
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("English"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("\xe4\xb8\xad\xe6\x96\x87"), iter.GetCurrentTerm());
- EXPECT_TRUE(iter.Advance());
- EXPECT_EQ(UTF8ToUTF16("Word"), iter.GetCurrentTerm());
- EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
-}
-
-} // namespace
diff --git a/chromium/chrome/common/string_matching/tokenized_string.cc b/chromium/chrome/common/string_matching/tokenized_string.cc
deleted file mode 100644
index 77c0703100a..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string.h"
-
-#include <stddef.h>
-
-#include "base/i18n/break_iterator.h"
-#include "base/i18n/case_conversion.h"
-#include "base/notreached.h"
-#include "chrome/common/string_matching/term_break_iterator.h"
-
-using base::i18n::BreakIterator;
-
-TokenizedString::TokenizedString(const base::string16& text) : text_(text) {
- Tokenize();
-}
-
-TokenizedString::~TokenizedString() = default;
-
-void TokenizedString::Tokenize() {
- BreakIterator break_iter(text_, BreakIterator::BREAK_WORD);
- if (!break_iter.Init()) {
- NOTREACHED() << "BreakIterator init failed"
- << ", text=\"" << text_ << "\"";
- return;
- }
-
- while (break_iter.Advance()) {
- if (!break_iter.IsWord())
- continue;
-
- const base::string16 word(break_iter.GetString());
- const size_t word_start = break_iter.prev();
- TermBreakIterator term_iter(word);
- while (term_iter.Advance()) {
- tokens_.emplace_back(base::i18n::ToLower(term_iter.GetCurrentTerm()));
- mappings_.emplace_back(word_start + term_iter.prev(),
- word_start + term_iter.pos());
- }
- }
-}
diff --git a/chromium/chrome/common/string_matching/tokenized_string.h b/chromium/chrome/common/string_matching/tokenized_string.h
deleted file mode 100644
index a564741f1da..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_H_
-#define CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_H_
-
-#include <vector>
-
-#include "base/macros.h"
-#include "base/strings/string16.h"
-#include "ui/gfx/range/range.h"
-
-// TokenizedString takes a string and breaks it down into token words. It
-// first breaks using BreakIterator to get all the words. Then it breaks
-// the words again at camel case boundaries and alpha/number boundaries.
-class TokenizedString {
- public:
- typedef std::vector<base::string16> Tokens;
- typedef std::vector<gfx::Range> Mappings;
-
- explicit TokenizedString(const base::string16& text);
- ~TokenizedString();
-
- const base::string16& text() const { return text_; }
- const Tokens& tokens() const { return tokens_; }
- const Mappings& mappings() const { return mappings_; }
-
- private:
- void Tokenize();
-
- // Input text.
- const base::string16 text_;
-
- // Broken down tokens and the index mapping of tokens in original string.
- Tokens tokens_;
- Mappings mappings_;
-
- DISALLOW_COPY_AND_ASSIGN(TokenizedString);
-};
-
-#endif // CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_H_
diff --git a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc b/chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc
deleted file mode 100644
index b558c4d3f28..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string_char_iterator.h"
-
-#include "base/check.h"
-#include "base/i18n/char_iterator.h"
-#include "third_party/icu/source/common/unicode/utf16.h"
-
-TokenizedStringCharIterator::State::State() : token_index(0u), char_index(0) {}
-
-TokenizedStringCharIterator::State::State(size_t token_index, int char_index)
- : token_index(token_index), char_index(char_index) {}
-
-TokenizedStringCharIterator::TokenizedStringCharIterator(
- const TokenizedString& tokenized)
- : tokens_(tokenized.tokens()),
- mappings_(tokenized.mappings()),
- current_token_(0) {
- CreateTokenCharIterator();
-}
-
-TokenizedStringCharIterator::~TokenizedStringCharIterator() = default;
-
-bool TokenizedStringCharIterator::NextChar() {
- if (current_token_iter_) {
- current_token_iter_->Advance();
- if (!current_token_iter_->end())
- return true;
- }
-
- return NextToken();
-}
-
-bool TokenizedStringCharIterator::NextToken() {
- if (current_token_ < tokens_.size()) {
- ++current_token_;
- CreateTokenCharIterator();
- }
-
- return !!current_token_iter_;
-}
-
-int32_t TokenizedStringCharIterator::Get() const {
- return current_token_iter_ ? current_token_iter_->get() : 0;
-}
-
-int32_t TokenizedStringCharIterator::GetArrayPos() const {
- DCHECK(current_token_iter_);
- return mappings_[current_token_].start() + current_token_iter_->array_pos();
-}
-
-size_t TokenizedStringCharIterator::GetCharSize() const {
- return current_token_iter_ ? U16_LENGTH(Get()) : 0;
-}
-
-bool TokenizedStringCharIterator::IsFirstCharOfToken() const {
- return current_token_iter_ && current_token_iter_->char_offset() == 0;
-}
-
-TokenizedStringCharIterator::State TokenizedStringCharIterator::GetState()
- const {
- return State(current_token_,
- current_token_iter_ ? current_token_iter_->char_offset() : 0);
-}
-
-void TokenizedStringCharIterator::SetState(const State& state) {
- current_token_ = state.token_index;
- CreateTokenCharIterator();
- if (current_token_iter_) {
- while (current_token_iter_->char_offset() < state.char_index)
- current_token_iter_->Advance();
- }
-}
-
-void TokenizedStringCharIterator::CreateTokenCharIterator() {
- if (current_token_ == tokens_.size()) {
- current_token_iter_.reset();
- return;
- }
-
- current_token_iter_.reset(
- new base::i18n::UTF16CharIterator(&tokens_[current_token_]));
-}
diff --git a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.h b/chromium/chrome/common/string_matching/tokenized_string_char_iterator.h
deleted file mode 100644
index 76a5be6e19a..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_char_iterator.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_CHAR_ITERATOR_H_
-#define CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_CHAR_ITERATOR_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <memory>
-
-#include "base/macros.h"
-#include "chrome/common/string_matching/tokenized_string.h"
-
-namespace base {
-namespace i18n {
-class UTF16CharIterator;
-}
-} // namespace base
-
-// An UTF16 char iterator for a TokenizedString.
-class TokenizedStringCharIterator {
- public:
- struct State {
- State();
- State(size_t token_index, int char_index);
-
- size_t token_index;
- int32_t char_index;
- };
-
- // Requires |tokenized| out-lives this iterator.
- explicit TokenizedStringCharIterator(const TokenizedString& tokenized);
- ~TokenizedStringCharIterator();
-
- // Advances to the next char. Returns false if there is no next char.
- bool NextChar();
-
- // Advances to the first char of the next token. Returns false if there is
- // no next token.
- bool NextToken();
-
- // Returns the current char if there is one. Otherwise, returns 0.
- int32_t Get() const;
-
- // Returns the array index in original text of the tokenized string that is
- // passed in constructor.
- int32_t GetArrayPos() const;
-
- // Returns the number of UTF16 code units for the current char.
- size_t GetCharSize() const;
-
- // Returns true if the current char is the first char of the current token.
- bool IsFirstCharOfToken() const;
-
- // Helpers to get and restore the iterator's state.
- State GetState() const;
- void SetState(const State& state);
-
- // Returns true if the iterator is at the end.
- bool end() const { return !current_token_iter_; }
-
- private:
- void CreateTokenCharIterator();
-
- const TokenizedString::Tokens& tokens_;
- const TokenizedString::Mappings& mappings_;
-
- size_t current_token_;
- std::unique_ptr<base::i18n::UTF16CharIterator> current_token_iter_;
-
- DISALLOW_COPY_AND_ASSIGN(TokenizedStringCharIterator);
-};
-
-#endif // CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_CHAR_ITERATOR_H_
diff --git a/chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc b/chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc
deleted file mode 100644
index 61b5c4cfaef..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_char_iterator_unittest.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string_char_iterator.h"
-
-#include <string>
-#include <vector>
-
-#include "base/strings/string_util.h"
-#include "base/strings/stringprintf.h"
-#include "base/strings/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace {
-
-// Returns a string represents the current state of |iter|. The state string
-// has three fields. The first is the current char. The second is the offset of
-// the current char in terms of the original text of the TokenizedString. The
-// last one is optional and only shows up when IsFirstCharOfToken returns true.
-std::string GetIterateState(const TokenizedStringCharIterator& iter) {
- return base::StringPrintf(
- "%s%d%s", base::UTF16ToUTF8(base::string16(1, iter.Get())).c_str(),
- iter.GetArrayPos(), iter.IsFirstCharOfToken() ? "!" : "");
-}
-
-void TestBeyondTheEnd(TokenizedStringCharIterator* iter) {
- ASSERT_TRUE(iter->end());
- ASSERT_FALSE(iter->NextChar());
- ASSERT_FALSE(iter->NextToken());
-
- // Don't care what it returns, but this shouldn't crash.
- iter->Get();
-}
-
-void TestEveryChar(const std::string& text, const std::string& expects) {
- TokenizedString tokens(base::UTF8ToUTF16(text));
- TokenizedStringCharIterator iter(tokens);
-
- std::vector<std::string> results;
- while (!iter.end()) {
- results.push_back(GetIterateState(iter));
- iter.NextChar();
- }
-
- EXPECT_EQ(expects, base::JoinString(results, " "));
- TestBeyondTheEnd(&iter);
-}
-
-void TestNextToken(const std::string& text, const std::string& expects) {
- TokenizedString tokens(base::UTF8ToUTF16(text));
- TokenizedStringCharIterator iter(tokens);
-
- std::vector<std::string> results;
- while (!iter.end()) {
- results.push_back(GetIterateState(iter));
- iter.NextToken();
- }
-
- EXPECT_EQ(expects, base::JoinString(results, " "));
- TestBeyondTheEnd(&iter);
-}
-
-void TestFirstTwoCharInEveryToken(const std::string& text,
- const std::string& expects) {
- TokenizedString tokens(base::UTF8ToUTF16(text));
- TokenizedStringCharIterator iter(tokens);
-
- std::vector<std::string> results;
- while (!iter.end()) {
- results.push_back(GetIterateState(iter));
- if (iter.NextChar())
- results.push_back(GetIterateState(iter));
-
- iter.NextToken();
- }
-
- EXPECT_EQ(expects, base::JoinString(results, " "));
- TestBeyondTheEnd(&iter);
-}
-
-TEST(TokenizedStringCharIteratorTest, NoTerms) {
- const char* text;
-
- text = "";
- TestEveryChar(text, "");
- TestNextToken(text, "");
- TestFirstTwoCharInEveryToken(text, "");
-
- text = "!@#$%^&*()<<<**>>>";
- TestEveryChar(text, "");
- TestNextToken(text, "");
- TestFirstTwoCharInEveryToken(text, "");
-}
-
-TEST(TokenizedStringCharIteratorTest, Basic) {
- const char* text;
-
- text = "c";
- TestEveryChar(text, "c0!");
- TestNextToken(text, "c0!");
- TestFirstTwoCharInEveryToken(text, "c0!");
-
- text = "Simple";
- TestEveryChar(text, "s0! i1 m2 p3 l4 e5");
- TestNextToken(text, "s0!");
- TestFirstTwoCharInEveryToken(text, "s0! i1");
-
- text = "ScratchPad";
- TestEveryChar(text, "s0! c1 r2 a3 t4 c5 h6 p7! a8 d9");
- TestNextToken(text, "s0! p7!");
- TestFirstTwoCharInEveryToken(text, "s0! c1 p7! a8");
-
- text = "Chess2.0";
- TestEveryChar(text, "c0! h1 e2 s3 s4 25! .6 07");
- TestNextToken(text, "c0! 25!");
- TestFirstTwoCharInEveryToken(text, "c0! h1 25! .6");
-
- text = "Cut the rope";
- TestEveryChar(text, "c0! u1 t2 t4! h5 e6 r8! o9 p10 e11");
- TestNextToken(text, "c0! t4! r8!");
- TestFirstTwoCharInEveryToken(text, "c0! u1 t4! h5 r8! o9");
-
- text = "AutoCAD WS";
- TestEveryChar(text, "a0! u1 t2 o3 c4! a5 d6 w8! s9");
- TestNextToken(text, "a0! c4! w8!");
- TestFirstTwoCharInEveryToken(text, "a0! u1 c4! a5 w8! s9");
-
- text = "Great TweetDeck";
- TestEveryChar(text, "g0! r1 e2 a3 t4 t6! w7 e8 e9 t10 d11! e12 c13 k14");
- TestNextToken(text, "g0! t6! d11!");
- TestFirstTwoCharInEveryToken(text, "g0! r1 t6! w7 d11! e12");
-
- text = "Draw-It!";
- TestEveryChar(text, "d0! r1 a2 w3 i5! t6");
- TestNextToken(text, "d0! i5!");
- TestFirstTwoCharInEveryToken(text, "d0! r1 i5! t6");
-
- text = "Faxing & Signing";
- TestEveryChar(text, "f0! a1 x2 i3 n4 g5 s9! i10 g11 n12 i13 n14 g15");
- TestNextToken(text, "f0! s9!");
- TestFirstTwoCharInEveryToken(text, "f0! a1 s9! i10");
-}
-
-} // namespace
diff --git a/chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc b/chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc
deleted file mode 100644
index 0e40150fda0..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_fuzzer.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright (c) 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "base/strings/string16.h"
-#include "chrome/common/string_matching/tokenized_string.h"
-
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
- if (size < 1 || size % 2 != 0)
- return 0;
-
- // Test for base::string16 if size is even.
- base::string16 string_input16(reinterpret_cast<const base::char16*>(data),
- size / 2);
- TokenizedString tokenized_string_from_string16(string_input16);
- return 0;
-}
diff --git a/chromium/chrome/common/string_matching/tokenized_string_match.cc b/chromium/chrome/common/string_matching/tokenized_string_match.cc
deleted file mode 100644
index 0545cb1b2af..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_match.cc
+++ /dev/null
@@ -1,248 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string_match.h"
-
-#include <stddef.h>
-
-#include <cmath>
-
-#include "base/check.h"
-#include "base/i18n/string_search.h"
-#include "base/macros.h"
-#include "base/strings/string_util.h"
-#include "chrome/common/string_matching/tokenized_string_char_iterator.h"
-
-namespace {
-
-// The factors below are applied when the current char of query matches
-// the current char of the text to be matched. Different factors are chosen
-// based on where the match happens. kIsPrefixMultiplier is used when the
-// matched portion is a prefix of both the query and the text, which implies
-// that the matched chars are at the same position in query and text. This is
-// the most preferred case thus it has the highest score. When the current char
-// of the query and the text does not match, the algorithm moves to the next
-// token in the text and try to match from there. kIsFrontOfWordMultipler will
-// be used if the first char of the token matches the current char of the query.
-// Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
-// used.
-// Examples:
-// Suppose the text to be matched is 'Google Chrome'.
-// Query 'go' would yield kIsPrefixMultiplier for each char.
-// Query 'gc' would use kIsPrefixMultiplier for 'g' and
-// kIsFrontOfWordMultipler for 'c'.
-// Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
-// kIsWeakHitMultiplier for 'h'.
-// Query 'oo' does not match any prefix and would use the substring match
-// fallback, thus kIsSubstringMultiplier is used for each char.
-const double kIsPrefixMultiplier = 1.0;
-const double kIsFrontOfWordMultipler = 0.8;
-const double kIsWeakHitMultiplier = 0.6;
-const double kIsSubstringMultiplier = 0.4;
-
-// A relevance score that represents no match.
-const double kNoMatchScore = 0.0;
-
-// PrefixMatcher matches the chars of a given query as prefix of tokens in
-// a given text or as prefix of the acronyms of those text tokens.
-class PrefixMatcher {
- public:
- PrefixMatcher(const TokenizedString& query, const TokenizedString& text)
- : query_iter_(query),
- text_iter_(text),
- current_match_(gfx::Range::InvalidRange()),
- current_relevance_(kNoMatchScore) {}
-
- // Invokes RunMatch to perform prefix match. Use |states_| as a stack to
- // perform DFS (depth first search) so that all possible matches are
- // attempted. Stops on the first full match and returns true. Otherwise,
- // returns false to indicate no match.
- bool Match() {
- while (!RunMatch()) {
- // No match found and no more states to try. Bail out.
- if (states_.empty()) {
- current_relevance_ = kNoMatchScore;
- current_hits_.clear();
- return false;
- }
-
- PopState();
-
- // Skip restored match to try other possibilites.
- AdvanceToNextTextToken();
- }
-
- if (current_match_.IsValid())
- current_hits_.push_back(current_match_);
-
- return true;
- }
-
- double relevance() const { return current_relevance_; }
- const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
-
- private:
- // Context record of a match.
- struct State {
- State() : relevance(kNoMatchScore) {}
- State(double relevance,
- const gfx::Range& current_match,
- const TokenizedStringMatch::Hits& hits,
- const TokenizedStringCharIterator& query_iter,
- const TokenizedStringCharIterator& text_iter)
- : relevance(relevance),
- current_match(current_match),
- hits(hits.begin(), hits.end()),
- query_iter_state(query_iter.GetState()),
- text_iter_state(text_iter.GetState()) {}
-
- // The current score of the processed query chars.
- double relevance;
-
- // Current matching range.
- gfx::Range current_match;
-
- // Completed matching ranges of the processed query chars.
- TokenizedStringMatch::Hits hits;
-
- // States of the processed query and text chars.
- TokenizedStringCharIterator::State query_iter_state;
- TokenizedStringCharIterator::State text_iter_state;
- };
- typedef std::vector<State> States;
-
- // Match chars from the query and text one by one. For each matching char,
- // calculate relevance and matching ranges. And the current stats is
- // recorded so that the match could be skipped later to try other
- // possiblities. Repeat until any of the iterators run out. Return true if
- // query iterator runs out, i.e. all chars in query are matched.
- bool RunMatch() {
- while (!query_iter_.end() && !text_iter_.end()) {
- if (query_iter_.Get() == text_iter_.Get()) {
- PushState();
-
- if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
- current_relevance_ += kIsPrefixMultiplier;
- else if (text_iter_.IsFirstCharOfToken())
- current_relevance_ += kIsFrontOfWordMultipler;
- else
- current_relevance_ += kIsWeakHitMultiplier;
-
- if (!current_match_.IsValid())
- current_match_.set_start(text_iter_.GetArrayPos());
- current_match_.set_end(text_iter_.GetArrayPos() +
- text_iter_.GetCharSize());
-
- query_iter_.NextChar();
- text_iter_.NextChar();
- } else {
- AdvanceToNextTextToken();
- }
- }
-
- return query_iter_.end();
- }
-
- // Skip to the next text token and close current match. Invoked when a
- // mismatch happens or to skip a restored match.
- void AdvanceToNextTextToken() {
- if (current_match_.IsValid()) {
- current_hits_.push_back(current_match_);
- current_match_ = gfx::Range::InvalidRange();
- }
-
- text_iter_.NextToken();
- }
-
- void PushState() {
- states_.push_back(State(current_relevance_, current_match_, current_hits_,
- query_iter_, text_iter_));
- }
-
- void PopState() {
- DCHECK(!states_.empty());
-
- State& last_match = states_.back();
- current_relevance_ = last_match.relevance;
- current_match_ = last_match.current_match;
- current_hits_.swap(last_match.hits);
- query_iter_.SetState(last_match.query_iter_state);
- text_iter_.SetState(last_match.text_iter_state);
-
- states_.pop_back();
- }
-
- TokenizedStringCharIterator query_iter_;
- TokenizedStringCharIterator text_iter_;
-
- States states_;
- gfx::Range current_match_;
-
- double current_relevance_;
- TokenizedStringMatch::Hits current_hits_;
-
- DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
-};
-
-} // namespace
-
-TokenizedStringMatch::TokenizedStringMatch() : relevance_(kNoMatchScore) {}
-
-TokenizedStringMatch::~TokenizedStringMatch() = default;
-
-bool TokenizedStringMatch::Calculate(const TokenizedString& query,
- const TokenizedString& text) {
- relevance_ = kNoMatchScore;
- hits_.clear();
-
- // If there is an exact match, relevance will be 1.0 and there is only 1 hit
- // that is the entire text/query.
- const auto& query_text = query.text();
- const auto& text_text = text.text();
- const auto query_size = query_text.size();
- const auto text_size = text_text.size();
- if (query_size > 0 && query_size == text_size &&
- base::EqualsCaseInsensitiveASCII(query_text, text_text)) {
- hits_.push_back(gfx::Range(0, query_size));
- relevance_ = 1.0;
- return true;
- }
-
- PrefixMatcher matcher(query, text);
- if (matcher.Match()) {
- relevance_ = matcher.relevance();
- hits_.assign(matcher.hits().begin(), matcher.hits().end());
- }
-
- // Substring match as a fallback.
- if (relevance_ == kNoMatchScore) {
- size_t substr_match_start = 0;
- size_t substr_match_length = 0;
- if (base::i18n::StringSearchIgnoringCaseAndAccents(
- query.text(), text.text(), &substr_match_start,
- &substr_match_length)) {
- relevance_ = kIsSubstringMultiplier * substr_match_length;
- hits_.push_back(gfx::Range(substr_match_start,
- substr_match_start + substr_match_length));
- }
- }
-
- // Temper the relevance score with an exponential curve. Each point of
- // relevance (roughly, each keystroke) is worth less than the last. This means
- // that typing a few characters of a word is enough to promote matches very
- // high, with any subsequent characters being worth comparatively less.
- // TODO(mgiuca): This doesn't really play well with Omnibox results, since as
- // you type more characters, the app/omnibox results tend to jump over each
- // other.
- relevance_ = 1.0 - std::pow(0.5, relevance_);
-
- return relevance_ > kNoMatchScore;
-}
-
-bool TokenizedStringMatch::Calculate(const base::string16& query,
- const base::string16& text) {
- const TokenizedString tokenized_query(query);
- const TokenizedString tokenized_text(text);
- return Calculate(tokenized_query, tokenized_text);
-}
diff --git a/chromium/chrome/common/string_matching/tokenized_string_match.h b/chromium/chrome/common/string_matching/tokenized_string_match.h
deleted file mode 100644
index 0112ac9ab10..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_match.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_MATCH_H_
-#define CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_MATCH_H_
-
-#include <vector>
-
-#include "base/macros.h"
-#include "base/strings/string16.h"
-#include "ui/gfx/range/range.h"
-
-class TokenizedString;
-
-// TokenizedStringMatch takes two tokenized strings: one as the text and
-// the other one as the query. It matches the query against the text,
-// calculates a relevance score between [0, 1] and marks the matched portions
-// of text. A relevance of zero means the two are completely different to each
-// other. The higher the relevance score, the better the two strings are
-// matched. Matched portions of text are stored as index ranges.
-class TokenizedStringMatch {
- public:
- typedef std::vector<gfx::Range> Hits;
-
- TokenizedStringMatch();
- ~TokenizedStringMatch();
-
- // Calculates the relevance and hits. Returns true if the two strings are
- // somewhat matched, i.e. relevance score is not zero.
- bool Calculate(const TokenizedString& query, const TokenizedString& text);
-
- // Convenience wrapper to calculate match from raw string input.
- bool Calculate(const base::string16& query, const base::string16& text);
-
- double relevance() const { return relevance_; }
- const Hits& hits() const { return hits_; }
-
- private:
- // Score in range of [0,1] representing how well the query matches the text.
- double relevance_;
-
- // Char index ranges in |text| of where matches are found.
- Hits hits_;
-
- DISALLOW_COPY_AND_ASSIGN(TokenizedStringMatch);
-};
-
-#endif // CHROME_COMMON_STRING_MATCHING_TOKENIZED_STRING_MATCH_H_
diff --git a/chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc b/chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc
deleted file mode 100644
index 01a23d3be1d..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_match_unittest.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string_match.h"
-
-#include <stddef.h>
-
-#include <string>
-
-#include "base/stl_util.h"
-#include "base/strings/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace {
-
-// Returns a string of |text| marked the hits in |match| using block bracket.
-// e.g. text= "Text", match.hits = [{0,1}], returns "[T]ext".
-std::string MatchHit(const base::string16& text,
- const TokenizedStringMatch& match) {
- base::string16 marked = text;
-
- const TokenizedStringMatch::Hits& hits = match.hits();
- for (TokenizedStringMatch::Hits::const_reverse_iterator it = hits.rbegin();
- it != hits.rend(); ++it) {
- const gfx::Range& hit = *it;
- marked.insert(hit.end(), 1, ']');
- marked.insert(hit.start(), 1, '[');
- }
-
- return base::UTF16ToUTF8(marked);
-}
-
-TEST(TokenizedStringMatchTest, NotMatch) {
- struct {
- const char* text;
- const char* query;
- } kTestCases[] = {
- {"", ""}, {"", "query"},
- {"text", ""}, {"!", "!@#$%^&*()<<<**>>>"},
- {"abd", "abcd"}, {"cd", "abcd"},
- };
-
- TokenizedStringMatch match;
- for (size_t i = 0; i < base::size(kTestCases); ++i) {
- const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text));
- EXPECT_FALSE(match.Calculate(base::UTF8ToUTF16(kTestCases[i].query), text))
- << "Test case " << i << " : text=" << kTestCases[i].text
- << ", query=" << kTestCases[i].query;
- }
-}
-
-TEST(TokenizedStringMatchTest, Match) {
- struct {
- const char* text;
- const char* query;
- const char* expect;
- } kTestCases[] = {
- {"ScratchPad", "pad", "Scratch[Pad]"},
- {"ScratchPad", "sp", "[S]cratch[P]ad"},
- {"Chess2", "che", "[Che]ss2"},
- {"Chess2", "c2", "[C]hess[2]"},
- {"Cut the rope", "cut ro", "[Cut] the [ro]pe"},
- {"Cut the rope", "cr", "[C]ut the [r]ope"},
- {"John Doe", "jdoe", "[J]ohn [Doe]"},
- {"John Doe", "johnd", "[John D]oe"},
- {"Secure Shell", "she", "Secure [She]ll"},
- {"Simple Secure Shell", "sish", "[Si]mple Secure [Sh]ell"},
- {"Netflix", "flix", "Net[flix]"},
- };
-
- TokenizedStringMatch match;
- for (size_t i = 0; i < base::size(kTestCases); ++i) {
- const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text));
- EXPECT_TRUE(match.Calculate(base::UTF8ToUTF16(kTestCases[i].query), text));
- EXPECT_EQ(kTestCases[i].expect, MatchHit(text, match));
- }
-}
-
-TEST(TokenizedStringMatchTest, Relevance) {
- struct {
- const char* text;
- const char* query_low;
- const char* query_high;
- } kTestCases[] = {
- // More matched chars are better.
- {"Google Chrome", "g", "go"},
- {"Google Chrome", "go", "goo"},
- {"Google Chrome", "goo", "goog"},
- {"Google Chrome", "c", "ch"},
- {"Google Chrome", "ch", "chr"},
- // Acronym match is better than something in the middle.
- {"Google Chrome", "ch", "gc"},
- // Prefix match is better than middle match and acronym match.
- {"Google Chrome", "ch", "go"},
- {"Google Chrome", "gc", "go"},
- // Substring match has the lowest score.
- {"Google Chrome", "oo", "gc"},
- {"Google Chrome", "oo", "go"},
- {"Google Chrome", "oo", "ch"},
- };
-
- TokenizedStringMatch match_low;
- TokenizedStringMatch match_high;
- for (size_t i = 0; i < base::size(kTestCases); ++i) {
- const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text));
- EXPECT_TRUE(
- match_low.Calculate(base::UTF8ToUTF16(kTestCases[i].query_low), text));
- EXPECT_TRUE(match_high.Calculate(
- base::UTF8ToUTF16(kTestCases[i].query_high), text));
- EXPECT_LT(match_low.relevance(), match_high.relevance())
- << "Test case " << i << " : text=" << kTestCases[i].text
- << ", query_low=" << kTestCases[i].query_low
- << ", query_high=" << kTestCases[i].query_high;
- }
-}
-
-// More specialized tests of the absolute relevance scores. (These tests are
-// minimal, because they are so brittle. Changing the scoring algorithm will
-// require updating this test.)
-TEST(TokenizedStringMatchTest, AbsoluteRelevance) {
- const double kEpsilon = 0.006;
- struct {
- const char* text;
- const char* query;
- double expected_score;
- } kTestCases[] = {
- // The first few chars should increase the score extremely high. After
- // that, they should count less.
- // NOTE: 0.87 is a magic number, as it is the Omnibox score for a "pretty
- // good" match. We want a 3-letter prefix match to be slightly above 0.87.
- {"Google Chrome", "g", 0.5},
- {"Google Chrome", "go", 0.75},
- {"Google Chrome", "goo", 0.88},
- {"Google Chrome", "goog", 0.94},
- };
-
- TokenizedStringMatch match;
- for (size_t i = 0; i < base::size(kTestCases); ++i) {
- const base::string16 text(base::UTF8ToUTF16(kTestCases[i].text));
- EXPECT_TRUE(match.Calculate(base::UTF8ToUTF16(kTestCases[i].query), text));
- EXPECT_NEAR(match.relevance(), kTestCases[i].expected_score, kEpsilon)
- << "Test case " << i << " : text=" << kTestCases[i].text
- << ", query=" << kTestCases[i].query
- << ", expected_score=" << kTestCases[i].expected_score;
- }
-}
-
-} // namespace
diff --git a/chromium/chrome/common/string_matching/tokenized_string_unittest.cc b/chromium/chrome/common/string_matching/tokenized_string_unittest.cc
deleted file mode 100644
index 51c98b2f295..00000000000
--- a/chromium/chrome/common/string_matching/tokenized_string_unittest.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2019 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/string_matching/tokenized_string.h"
-
-#include <stddef.h>
-
-#include "base/strings/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-namespace {
-
-base::string16 GetContent(const TokenizedString& tokenized) {
- const TokenizedString::Tokens& tokens = tokenized.tokens();
- const TokenizedString::Mappings& mappings = tokenized.mappings();
-
- base::string16 str;
- for (size_t i = 0; i < tokens.size(); ++i) {
- if (i > 0)
- str += ' ';
- str += tokens[i];
- str += base::UTF8ToUTF16(mappings[i].ToString());
- }
- return str;
-}
-
-TEST(TokenizedStringTest, Empty) {
- base::string16 empty;
- TokenizedString tokens(empty);
- EXPECT_EQ(base::string16(), GetContent(tokens));
-}
-
-TEST(TokenizedStringTest, Basic) {
- {
- base::string16 text(base::UTF8ToUTF16("ScratchPad"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("scratch{0,7} pad{7,10}"), GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("Chess2.0"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("chess{0,5} 2.0{5,8}"), GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("Cut the rope"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"),
- GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("AutoCAD WS"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("auto{0,4} cad{4,7} ws{8,10}"),
- GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("Great TweetDeck"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweet{6,11} deck{11,15}"),
- GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("Draw-It!"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("draw{0,4} it{5,7}"), GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("Faxing & Signing"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"),
- GetContent(tokens));
- }
- {
- base::string16 text(base::UTF8ToUTF16("!@#$%^&*()<<<**>>>"));
- TokenizedString tokens(text);
- EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(tokens));
- }
-}
-
-} // namespace