// A tool for parsing traffic on the Jetstream and applying a moderation workstream driven by regexp-based rules.
1import { homoglyphMap } from "./homoglyphs.js";
2
/**
 * Normalizes a string so that visually disguised text can be compared against
 * plain lowercase patterns (for example, forbidden-word checks on user input).
 *
 * The pipeline is:
 * 1. Lowercase the input (the homoglyph lookup is performed on lowercased
 *    characters).
 * 2. Replace each code point that has an entry in `homoglyphMap` with its
 *    mapped replacement. This happens before any decomposition so that
 *    pre-composed characters can be matched by the map.
 * 3. Apply NFKD so that both canonical and compatibility characters
 *    (ligatures, digraphs, full-width and mathematical letters, ...) are
 *    broken down into base characters plus combining marks.
 * 4. Lowercase again, because compatibility folding can surface uppercase
 *    ASCII letters from characters that have no case mapping of their own.
 * 5. Remove combining marks in the Combining Diacritical Marks block
 *    (U+0300-U+036F).
 * 6. Recompose with NFKC so that any remaining sequences are returned in a
 *    stable, composed form.
 *
 * The function is idempotent for the steps it controls: feeding its output
 * back in yields the same string, provided `homoglyphMap` itself maps to
 * already-normalized text.
 *
 * @param text The input string to normalize.
 * @returns The normalized, lowercased string. An empty input yields an empty
 * string.
 */
export function normalizeUnicode(text: string): string {
  // Lowercase first so the homoglyph lookup sees lowercased characters.
  const lowercased = text.toLowerCase();

  // Iterate by code point (not UTF-16 unit) so astral characters are looked
  // up as a whole. `??` rather than `||` so that an explicit empty-string
  // mapping (i.e. "drop this character") is honoured instead of ignored.
  const replaced = Array.from(
    lowercased,
    (char) => homoglyphMap[char] ?? char,
  ).join("");

  return (
    replaced
      // Compatibility decomposition must happen BEFORE the marks are
      // stripped; otherwise characters such as U+01C6 only expand during the
      // final pass and bring their diacritics back into the result.
      .normalize("NFKD")
      // Compatibility folding can produce uppercase letters (for example
      // from mathematical alphanumerics), so lowercase once more.
      .toLowerCase()
      // Drop the combining diacritical marks exposed by the decomposition.
      .replace(/[\u0300-\u036f]/g, "")
      // Recompose whatever is left into a canonical composed form.
      .normalize("NFKC")
  );
}