Serenity Operating System
1/*
2 * Copyright (c) 2021, Spencer Dixon <spencercdixon@gmail.com>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/CharacterTypes.h>
8#include <AK/FuzzyMatch.h>
9#include <string.h>
10
11namespace AK {
12
13static constexpr int const RECURSION_LIMIT = 10;
14static constexpr int const MAX_MATCHES = 256;
15
16// Bonuses and penalties are used to build up a final score for the match.
17static constexpr int const SEQUENTIAL_BONUS = 15; // bonus for adjacent matches (needle: 'ca', haystack: 'cat')
18static constexpr int const SEPARATOR_BONUS = 30; // bonus if match occurs after a separator ('_' or ' ')
19static constexpr int const CAMEL_BONUS = 30; // bonus if match is uppercase and prev is lower (needle: 'myF' haystack: '/path/to/myFile.txt')
20static constexpr int const FIRST_LETTER_BONUS = 20; // bonus if the first letter is matched (needle: 'c' haystack: 'cat')
21static constexpr int const LEADING_LETTER_PENALTY = -5; // penalty applied for every letter in str before the first match
22static constexpr int const MAX_LEADING_LETTER_PENALTY = -15; // maximum penalty for leading letters
23static constexpr int const UNMATCHED_LETTER_PENALTY = -1; // penalty for every letter that doesn't matter
24
25static int calculate_score(StringView string, u8* index_points, size_t index_points_size)
26{
27 int out_score = 100;
28
29 int penalty = LEADING_LETTER_PENALTY * index_points[0];
30 if (penalty < MAX_LEADING_LETTER_PENALTY)
31 penalty = MAX_LEADING_LETTER_PENALTY;
32 out_score += penalty;
33
34 int unmatched = string.length() - index_points_size;
35 out_score += UNMATCHED_LETTER_PENALTY * unmatched;
36
37 for (size_t i = 0; i < index_points_size; i++) {
38 u8 current_idx = index_points[i];
39
40 if (current_idx == 0)
41 out_score += FIRST_LETTER_BONUS;
42
43 if (i == 0)
44 continue;
45
46 u8 previous_idx = index_points[i - 1];
47 if (current_idx - 1 == previous_idx)
48 out_score += SEQUENTIAL_BONUS;
49
50 u32 current_character = string[current_idx];
51 u32 neighbor_character = string[current_idx - 1];
52
53 if (neighbor_character != to_ascii_uppercase(neighbor_character) && current_character != to_ascii_lowercase(current_character))
54 out_score += CAMEL_BONUS;
55
56 if (neighbor_character == '_' || neighbor_character == ' ')
57 out_score += SEPARATOR_BONUS;
58 }
59
60 return out_score;
61}
62
63static FuzzyMatchResult fuzzy_match_recursive(StringView needle, StringView haystack, size_t needle_idx, size_t haystack_idx,
64 u8 const* src_matches, u8* matches, int next_match, int& recursion_count)
65{
66 int out_score = 0;
67
68 ++recursion_count;
69 if (recursion_count >= RECURSION_LIMIT)
70 return { false, out_score };
71
72 if (needle.length() == needle_idx || haystack.length() == haystack_idx)
73 return { false, out_score };
74
75 bool had_recursive_match = false;
76 constexpr size_t recursive_match_limit = 256;
77 u8 best_recursive_matches[recursive_match_limit];
78 int best_recursive_score = 0;
79
80 bool first_match = true;
81 while (needle_idx < needle.length() && haystack_idx < haystack.length()) {
82
83 if (to_ascii_lowercase(needle[needle_idx]) == to_ascii_lowercase(haystack[haystack_idx])) {
84 if (next_match >= MAX_MATCHES)
85 return { false, out_score };
86
87 if (first_match && src_matches) {
88 memcpy(matches, src_matches, next_match);
89 first_match = false;
90 }
91
92 u8 recursive_matches[recursive_match_limit] {};
93 auto result = fuzzy_match_recursive(needle, haystack, needle_idx, haystack_idx + 1, matches, recursive_matches, next_match, recursion_count);
94 if (result.matched) {
95 if (!had_recursive_match || result.score > best_recursive_score) {
96 memcpy(best_recursive_matches, recursive_matches, recursive_match_limit);
97 best_recursive_score = result.score;
98 }
99 had_recursive_match = true;
100 }
101 matches[next_match++] = haystack_idx;
102 needle_idx++;
103 }
104 haystack_idx++;
105 }
106
107 bool matched = needle_idx == needle.length();
108 if (!matched)
109 return { false, out_score };
110
111 out_score = calculate_score(haystack, matches, next_match);
112
113 if (had_recursive_match && (best_recursive_score > out_score)) {
114 memcpy(matches, best_recursive_matches, MAX_MATCHES);
115 out_score = best_recursive_score;
116 }
117
118 return { true, out_score };
119}
120
121// This fuzzy_match algorithm is based off a similar algorithm used by Sublime Text. The key insight is that instead
122// of doing a total in the distance between characters (I.E. Levenshtein Distance), we apply some meaningful heuristics
123// related to our dataset that we're trying to match to build up a score. Scores can then be sorted and displayed
124// with the highest at the top.
125//
126// Scores are not normalized between any values and have no particular meaning. The starting value is 100 and when we
127// detect good indicators of a match we add to the score. When we detect bad indicators, we penalize the match and subtract
128// from its score. Therefore, the longer the needle/haystack the greater the range of scores could be.
129FuzzyMatchResult fuzzy_match(StringView needle, StringView haystack)
130{
131 int recursion_count = 0;
132 u8 matches[MAX_MATCHES] {};
133 return fuzzy_match_recursive(needle, haystack, 0, 0, nullptr, matches, 0, recursion_count);
134}
135
136}