A tool for parsing traffic on the jetstream and applying a moderation workstream driven by regexp-based rules
at main 1.6 kB view raw
import { homoglyphMap } from "./homoglyphs.js";

/**
 * Normalizes a string by lowercasing it, replacing homoglyphs, and stripping
 * diacritics. This is useful for sanitizing user input before performing
 * checks for forbidden words.
 *
 * The process is as follows:
 * 1. Convert the entire string to lowercase.
 * 2. Replace each code point that has a truthy entry in `homoglyphMap` with
 *    that entry; every other code point is kept as-is.
 * 3. Apply NFD (Normalization Form D) Unicode normalization to decompose
 *    characters into their base characters and combining marks.
 * 4. Remove combining marks in the range U+0300–U+036F.
 * 5. Apply NFKC (Normalization Form KC) Unicode normalization to fold
 *    compatibility characters into their plain equivalents.
 * 6. Lowercase once more, because step 5 can produce uppercase letters from
 *    characters that have no lowercase mapping of their own (for example
 *    "ℌ" U+210C becomes "H").
 *
 * @param text The input string to normalize.
 * @returns The normalized, lowercase string.
 */
export function normalizeUnicode(text: string): string {
  // Lowercase first so lookups match the homoglyph map keys.
  const lowercased = text.toLowerCase();

  // Map homoglyphs one code point at a time (string iteration keeps
  // surrogate pairs together). This runs before NFD so that pre-composed
  // characters are looked up whole rather than as base + combining mark.
  const replaced = Array.from(
    lowercased,
    (char) => homoglyphMap[char] || char,
  ).join("");

  // Decompose (NFD), then drop the combining diacritical marks.
  const withoutDiacritics = replaced
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");

  // NFKC folds compatibility characters, but caseless ones such as
  // mathematical or letterlike capitals expand to uppercase ASCII, so the
  // result is lowercased again to keep the output case-insensitive.
  return withoutDiacritics.normalize("NFKC").toLowerCase();
}