Serenity Operating System
at master 136 lines 5.5 kB view raw
1/* 2 * Copyright (c) 2021, Spencer Dixon <spencercdixon@gmail.com> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/CharacterTypes.h> 8#include <AK/FuzzyMatch.h> 9#include <string.h> 10 11namespace AK { 12 13static constexpr int const RECURSION_LIMIT = 10; 14static constexpr int const MAX_MATCHES = 256; 15 16// Bonuses and penalties are used to build up a final score for the match. 17static constexpr int const SEQUENTIAL_BONUS = 15; // bonus for adjacent matches (needle: 'ca', haystack: 'cat') 18static constexpr int const SEPARATOR_BONUS = 30; // bonus if match occurs after a separator ('_' or ' ') 19static constexpr int const CAMEL_BONUS = 30; // bonus if match is uppercase and prev is lower (needle: 'myF' haystack: '/path/to/myFile.txt') 20static constexpr int const FIRST_LETTER_BONUS = 20; // bonus if the first letter is matched (needle: 'c' haystack: 'cat') 21static constexpr int const LEADING_LETTER_PENALTY = -5; // penalty applied for every letter in str before the first match 22static constexpr int const MAX_LEADING_LETTER_PENALTY = -15; // maximum penalty for leading letters 23static constexpr int const UNMATCHED_LETTER_PENALTY = -1; // penalty for every letter that doesn't matter 24 25static int calculate_score(StringView string, u8* index_points, size_t index_points_size) 26{ 27 int out_score = 100; 28 29 int penalty = LEADING_LETTER_PENALTY * index_points[0]; 30 if (penalty < MAX_LEADING_LETTER_PENALTY) 31 penalty = MAX_LEADING_LETTER_PENALTY; 32 out_score += penalty; 33 34 int unmatched = string.length() - index_points_size; 35 out_score += UNMATCHED_LETTER_PENALTY * unmatched; 36 37 for (size_t i = 0; i < index_points_size; i++) { 38 u8 current_idx = index_points[i]; 39 40 if (current_idx == 0) 41 out_score += FIRST_LETTER_BONUS; 42 43 if (i == 0) 44 continue; 45 46 u8 previous_idx = index_points[i - 1]; 47 if (current_idx - 1 == previous_idx) 48 out_score += SEQUENTIAL_BONUS; 49 50 u32 current_character = string[current_idx]; 51 u32 neighbor_character = string[current_idx - 1]; 52 53 if (neighbor_character != to_ascii_uppercase(neighbor_character) && current_character != to_ascii_lowercase(current_character)) 54 out_score += CAMEL_BONUS; 55 56 if (neighbor_character == '_' || neighbor_character == ' ') 57 out_score += SEPARATOR_BONUS; 58 } 59 60 return out_score; 61} 62 63static FuzzyMatchResult fuzzy_match_recursive(StringView needle, StringView haystack, size_t needle_idx, size_t haystack_idx, 64 u8 const* src_matches, u8* matches, int next_match, int& recursion_count) 65{ 66 int out_score = 0; 67 68 ++recursion_count; 69 if (recursion_count >= RECURSION_LIMIT) 70 return { false, out_score }; 71 72 if (needle.length() == needle_idx || haystack.length() == haystack_idx) 73 return { false, out_score }; 74 75 bool had_recursive_match = false; 76 constexpr size_t recursive_match_limit = 256; 77 u8 best_recursive_matches[recursive_match_limit]; 78 int best_recursive_score = 0; 79 80 bool first_match = true; 81 while (needle_idx < needle.length() && haystack_idx < haystack.length()) { 82 83 if (to_ascii_lowercase(needle[needle_idx]) == to_ascii_lowercase(haystack[haystack_idx])) { 84 if (next_match >= MAX_MATCHES) 85 return { false, out_score }; 86 87 if (first_match && src_matches) { 88 memcpy(matches, src_matches, next_match); 89 first_match = false; 90 } 91 92 u8 recursive_matches[recursive_match_limit] {}; 93 auto result = fuzzy_match_recursive(needle, haystack, needle_idx, haystack_idx + 1, matches, recursive_matches, next_match, recursion_count); 94 if (result.matched) { 95 if (!had_recursive_match || result.score > best_recursive_score) { 96 memcpy(best_recursive_matches, recursive_matches, recursive_match_limit); 97 best_recursive_score = result.score; 98 } 99 had_recursive_match = true; 100 } 101 matches[next_match++] = haystack_idx; 102 needle_idx++; 103 } 104 haystack_idx++; 105 } 106 107 bool matched = needle_idx == needle.length(); 108 if (!matched) 109 return { false, out_score }; 110 111 out_score = calculate_score(haystack, matches, next_match); 112 113 if (had_recursive_match && (best_recursive_score > out_score)) { 114 memcpy(matches, best_recursive_matches, MAX_MATCHES); 115 out_score = best_recursive_score; 116 } 117 118 return { true, out_score }; 119} 120 121// This fuzzy_match algorithm is based off a similar algorithm used by Sublime Text. The key insight is that instead 122// of doing a total in the distance between characters (I.E. Levenshtein Distance), we apply some meaningful heuristics 123// related to our dataset that we're trying to match to build up a score. Scores can then be sorted and displayed 124// with the highest at the top. 125// 126// Scores are not normalized between any values and have no particular meaning. The starting value is 100 and when we 127// detect good indicators of a match we add to the score. When we detect bad indicators, we penalize the match and subtract 128// from its score. Therefore, the longer the needle/haystack the greater the range of scores could be. 129FuzzyMatchResult fuzzy_match(StringView needle, StringView haystack) 130{ 131 int recursion_count = 0; 132 u8 matches[MAX_MATCHES] {}; 133 return fuzzy_match_recursive(needle, haystack, 0, 0, nullptr, matches, 0, recursion_count); 134} 135 136}