src/lib/normalize-text.ts at dev · aviva.gay/deckbelcher

aviva.gay / deckbelcher

fork atom

👁️

fork atom

deckbelcher / src / lib / normalize-text.ts

at dev 20 lines 758 B view raw

wrap content

Aviva Ruben diacritic mark and ampersand aware fuzzy tokenization, diacritic aware syntax search 2mo ago

46e38ed7

 1/**
 2 * Text normalization for search
 3 *
 4 * Strips diacritical marks from text for search matching (ö→o, û→u, etc.)
 5 *
 6 * IMPORTANT: This strips only the Unicode "Combining Diacritical Marks" block
 7 * (U+0300-U+036F). Latin, Greek and Cyrillic all decompose to these marks, so
 8 * й→и and ё→е too; Arabic and Hebrew marks live in other blocks and are kept.
 9 *
10 * This is appropriate for English MTG card names which may contain borrowed
11 * diacritics (Jötun, Lim-Dûl, Dandân). If we ever index non-English printings
12 * (using `printed_name`), we'd need to reconsider script handling.
13 */
14
/**
 * Strip diacritical marks from text for search normalization.
 *
 * The text is first decomposed (NFD) so that precomposed characters such as
 * "ö" split into a base letter plus a combining mark. Marks in the Combining
 * Diacritical Marks block (U+0300-U+036F) are then removed, and the remainder
 * is recomposed (NFC). Without the final recomposition, text whose marks are
 * NOT in that block (e.g. Hangul syllables, kana with dakuten) would be
 * returned as decomposed sequences, changing its length and code points even
 * though nothing was stripped from it.
 *
 * @param text - Text to normalize.
 * @returns `text` in NFC form with all U+0300-U+036F marks removed.
 */
export function stripDiacritics(text: string): string {
	return (
		text
			.normalize("NFD")
			// Only the Combining Diacritical Marks block is stripped.
			.replace(/[\u0300-\u036f]/g, "")
			// Undo the decomposition for everything that was not stripped.
			.normalize("NFC")
	);
}