/**************************************************************************/
/* fuzzy_search.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/

#include "fuzzy_search.h"

constexpr float cull_factor = 0.1f;
constexpr float cull_cutoff = 30.0f;
const String boundary_chars = "/\\-_.";

static bool _is_valid_interval(const Vector2i &p_interval) {
	// Empty intervals are represented as (-1, -1).
	return p_interval.x >= 0 && p_interval.y >= p_interval.x;
}

static Vector2i _extend_interval(const Vector2i &p_a, const Vector2i &p_b) {
	if (!_is_valid_interval(p_a)) {
		return p_b;
	}
	if (!_is_valid_interval(p_b)) {
		return p_a;
	}
	return Vector2i(MIN(p_a.x, p_b.x), MAX(p_a.y, p_b.y));
}

static bool _is_word_boundary(const String &p_str, int p_index) {
	// Treat the positions just before and just after the string as boundaries.
	if (p_index == -1 || p_index == p_str.length()) {
		return true;
	}
	return boundary_chars.find_char(p_str[p_index]) != -1;
}

// Attempts to match the whole token as a single exact substring of p_target at or after p_offset.
bool FuzzySearchToken::try_exact_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset) const {
	p_match.token_idx = idx;
	p_match.token_length = string.length();
	int match_idx = p_target.find(string, p_offset);
	if (match_idx == -1) {
		return false;
	}
	p_match.add_substring(match_idx, string.length());
	return true;
}

bool FuzzySearchToken::try_fuzzy_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset, int p_miss_budget) const {
	p_match.token_idx = idx;
	p_match.token_length = string.length();
	int run_start = -1;
	int run_len = 0;

	// Search for the token's string as a subsequence of p_target starting from p_offset, recording
	// each contiguous run of matched characters as a substring for later scoring and display.
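	// Illustrative example (not part of the original comment): matching the token "fsearch"
	// against "fuzzy_search" records two substrings, "f" at index 0 and "search" at index 6,
	// with no misses.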
	for (int i = 0; i < string.length(); i++) {
		int new_offset = p_target.find_char(string[i], p_offset);
		if (new_offset < 0) {
			p_miss_budget--;
			if (p_miss_budget < 0) {
				return false;
			}
		} else {
			if (run_start == -1 || p_offset != new_offset) {
				if (run_start != -1) {
					p_match.add_substring(run_start, run_len);
				}
				run_start = new_offset;
				run_len = 1;
			} else {
				run_len += 1;
			}
			p_offset = new_offset + 1;
		}
	}

	if (run_start != -1) {
		p_match.add_substring(run_start, run_len);
	}

	return true;
}

void FuzzyTokenMatch::add_substring(int p_substring_start, int p_substring_length) {
	substrings.append(Vector2i(p_substring_start, p_substring_length));
	matched_length += p_substring_length;
	Vector2i substring_interval = { p_substring_start, p_substring_start + p_substring_length - 1 };
	interval = _extend_interval(interval, substring_interval);
}

bool FuzzyTokenMatch::intersects(const Vector2i &p_other_interval) const {
	if (!_is_valid_interval(interval) || !_is_valid_interval(p_other_interval)) {
		return false;
	}
	return interval.y >= p_other_interval.x && interval.x <= p_other_interval.y;
}

// A token match can only be added if it stays within the miss budget and does not overlap
// any match already accepted for this result.
bool FuzzySearchResult::can_add_token_match(const FuzzyTokenMatch &p_match) const {
	if (p_match.get_miss_count() > miss_budget) {
		return false;
	}

	if (p_match.intersects(match_interval)) {
		if (token_matches.size() == 1) {
			// With a single accepted match, match_interval is exactly its interval, so the overlap is definite.
			return false;
		}
		for (const FuzzyTokenMatch &existing_match : token_matches) {
			if (existing_match.intersects(p_match.interval)) {
				return false;
			}
		}
	}

	return true;
}

// Returns true if the match only holds case-insensitively, i.e. some matched character differs
// in case between the original target and its case-folded counterpart.
bool FuzzyTokenMatch::is_case_insensitive(const String &p_original, const String &p_adjusted) const {
	for (const Vector2i &substr : substrings) {
		const int end = substr.x + substr.y;
		for (int i = substr.x; i < end; i++) {
			if (p_original[i] != p_adjusted[i]) {
				return true;
			}
		}
	}
	return false;
}

void FuzzySearchResult::score_token_match(FuzzyTokenMatch &p_match, bool p_case_insensitive) const {
	// This can always be tweaked further. The intuition is that exact matches should almost always
	// be prioritized over broken-up matches, with the other criteria mostly acting as tie-breakers.
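	// Worked example (illustrative numbers, derived from the formula below): a full 6-character
	// token matched as one word-aligned run deeper than the directory prefix scores
	// 6 * 6 * 2 + 4 + 100 = 176, while the same token split into two 3-character runs with none
	// of those bonuses scores only 3 * 3 + 3 * 3 = 18.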

	p_match.score = -20 * p_match.get_miss_count() - (p_case_insensitive ? 3 : 0);

	for (const Vector2i &substring : p_match.substrings) {
		// Score longer substrings higher than short substrings.
		int substring_score = substring.y * substring.y;
		// Score matches deeper in the path higher than shallower matches.
		if (substring.x > dir_index) {
			substring_score *= 2;
		}
		// Score matches on a word boundary higher than matches within a word.
		if (_is_word_boundary(target, substring.x - 1) || _is_word_boundary(target, substring.x + substring.y)) {
			substring_score += 4;
		}
		// Score exact query matches higher than non-compact subsequence matches.
		if (substring.y == p_match.token_length) {
			substring_score += 100;
		}
		p_match.score += substring_score;
	}
}

void FuzzySearchResult::maybe_apply_score_bonus() {
	// This adds a small bonus to results that match tokens in the same order they appear in the query.
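	// For example (illustrative): for the query "gd script", a target where the "gd" match starts
	// before the "script" match receives the bonus; a target where they appear reversed does not.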
	if (token_matches.is_empty()) {
		// Nothing matched (e.g. an empty query); avoid reading from a zero-length allocation below.
		return;
	}

	int *token_range_starts = (int *)alloca(sizeof(int) * token_matches.size());

	for (const FuzzyTokenMatch &match : token_matches) {
		token_range_starts[match.token_idx] = match.interval.x;
	}

	int last = token_range_starts[0];
	for (int i = 1; i < token_matches.size(); i++) {
		if (last > token_range_starts[i]) {
			return;
		}
		last = token_range_starts[i];
	}

	score += 1;
}

void FuzzySearchResult::add_token_match(const FuzzyTokenMatch &p_match) {
	score += p_match.score;
	match_interval = _extend_interval(match_interval, p_match.interval);
	miss_budget -= p_match.get_miss_count();
	token_matches.append(p_match);
}

void remove_low_scores(Vector<FuzzySearchResult> &p_results, float p_cull_score) {
	// Removes all results with score < p_cull_score in-place.
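	// E.g. (illustrative): with p_cull_score = 10, scores { 12, 3, 15, 7, 20 } compact to
	// { 12, 20, 15 }. The relative order of the survivors is not preserved, which is fine
	// because sort_and_filter() sorts them afterwards.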
	int i = 0;
	int j = p_results.size() - 1;
	FuzzySearchResult *results = p_results.ptrw();

	while (true) {
		// Advances i to an element to remove and j to an element to keep.
		while (j >= i && results[j].score < p_cull_score) {
			j--;
		}
		while (i < j && results[i].score >= p_cull_score) {
			i++;
		}
		if (i >= j) {
			break;
		}
		results[i++] = results[j--];
	}

	p_results.resize(j + 1);
}

void FuzzySearch::sort_and_filter(Vector<FuzzySearchResult> &p_results) const {
	if (p_results.is_empty()) {
		return;
	}

	float avg_score = 0;
	float max_score = 0;

	for (const FuzzySearchResult &result : p_results) {
		avg_score += result.score;
		max_score = MAX(max_score, result.score);
	}

	// TODO: Tune scoring and culling here to display fewer subsequence soup matches when good matches
	// are available.
	avg_score /= p_results.size();
	float cull_score = MIN(cull_cutoff, Math::lerp(avg_score, max_score, cull_factor));
	remove_low_scores(p_results, cull_score);

	struct FuzzySearchResultComparator {
		bool operator()(const FuzzySearchResult &p_lhs, const FuzzySearchResult &p_rhs) const {
			// Sort on (score, length, alphanumeric) to ensure consistent ordering.
			if (p_lhs.score == p_rhs.score) {
				if (p_lhs.target.length() == p_rhs.target.length()) {
					return p_lhs.target < p_rhs.target;
				}
				return p_lhs.target.length() < p_rhs.target.length();
			}
			return p_lhs.score > p_rhs.score;
		}
	};

	SortArray<FuzzySearchResult, FuzzySearchResultComparator> sorter;

	if (p_results.size() > max_results) {
		sorter.partial_sort(0, p_results.size(), max_results, p_results.ptrw());
		p_results.resize(max_results);
	} else {
		sorter.sort(p_results.ptrw(), p_results.size());
	}
}

void FuzzySearch::set_query(const String &p_query) {
	tokens.clear();
	for (const String &string : p_query.split(" ", false)) {
		tokens.append({ static_cast<int>(tokens.size()), string });
	}

	// Smart case: only match case-sensitively when the query contains uppercase characters.
	case_sensitive = !p_query.is_lowercase();

	struct TokenComparator {
		bool operator()(const FuzzySearchToken &A, const FuzzySearchToken &B) const {
			if (A.string.length() == B.string.length()) {
				return A.idx < B.idx;
			}
			return A.string.length() > B.string.length();
		}
	};

	// Prioritize matching longer tokens before shorter ones since match overlaps are not accepted.
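	// E.g. (illustrative): for the query "sc fuzzy", "fuzzy" is matched first, then "sc".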
	tokens.sort_custom<TokenComparator>();
}

bool FuzzySearch::search(const String &p_target, FuzzySearchResult &p_result) const {
	p_result.target = p_target;
	p_result.dir_index = p_target.rfind_char('/');
	p_result.miss_budget = max_misses;

	String adjusted_target = case_sensitive ? p_target : p_target.to_lower();

	// For each token, eagerly generate subsequences starting from index 0 and keep the best-scoring one
	// which does not conflict with prior token matches. This is not guaranteed to find the highest-scoring
	// combination of matches, or necessarily the highest-scoring single subsequence, as it only considers
	// eager subsequences from a given starting index, and likewise greedily matches each token in sequence.
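	// Illustrative example (not part of the original comment): for the token "ab" and the target
	// "axb_ab", the first pass records the scattered runs "a" (index 0) and "b" (index 2); the
	// retry from interval.x + 1 below then finds the compact match "ab" at index 4, which scores
	// higher and is kept as best_match.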
	for (const FuzzySearchToken &token : tokens) {
		FuzzyTokenMatch best_match;
		int offset = start_offset;

		while (true) {
			FuzzyTokenMatch match;
			if (allow_subsequences) {
				if (!token.try_fuzzy_match(match, adjusted_target, offset, p_result.miss_budget)) {
					break;
				}
			} else {
				if (!token.try_exact_match(match, adjusted_target, offset)) {
					break;
				}
			}
			if (p_result.can_add_token_match(match)) {
				p_result.score_token_match(match, match.is_case_insensitive(p_target, adjusted_target));
				if (best_match.token_idx == -1 || best_match.score < match.score) {
					best_match = match;
				}
			}
			if (_is_valid_interval(match.interval)) {
				offset = match.interval.x + 1;
			} else {
				break;
			}
		}

		if (best_match.token_idx == -1) {
			return false;
		}

		p_result.add_token_match(best_match);
	}

	p_result.maybe_apply_score_bonus();
	return true;
}

void FuzzySearch::search_all(const PackedStringArray &p_targets, Vector<FuzzySearchResult> &p_results) const {
	p_results.clear();

	for (const String &target : p_targets) {
		FuzzySearchResult result;
		if (search(target, result)) {
			p_results.append(result);
		}
	}

	sort_and_filter(p_results);
}
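
// A minimal usage sketch (illustrative only; assumes the declarations in fuzzy_search.h, and
// get_project_file_paths() is a hypothetical stand-in for any PackedStringArray of candidates):
//
//     FuzzySearch fuzzy;
//     fuzzy.set_query("fuz sea");
//     Vector<FuzzySearchResult> results;
//     fuzzy.search_all(get_project_file_paths(), results);
//     for (const FuzzySearchResult &result : results) {
//         print_line(result.target); // Results arrive best-first after sort_and_filter().
//     }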