src/utils/normalizeUnicode.ts at main · skywatch.blue/skywatch-automod

A tool for parsing traffic on the jetstream and applying a moderation workstream based on regexp based rules

skywatch-automod / src / utils / normalizeUnicode.ts

at main 1.6 kB view raw

 1import { homoglyphMap } from "./homoglyphs.js";
 2
 /**
 * Normalizes a string for forbidden-word checks by lowercasing it, replacing
 * homoglyphs, stripping diacritics, and folding compatibility characters.
 *
 * The process is as follows:
 * 1. Convert the entire string to lowercase.
 * 2. Replace each character (iterated by code point) that has an entry in
 *    `homoglyphMap` with the mapped replacement.
 * 3. Apply NFD (Normalization Form D) to decompose characters into base
 *    characters and combining marks.
 * 4. Remove combining marks in the U+0300–U+036F block
 *    (Combining Diacritical Marks).
 * 5. Apply NFKC (Normalization Form KC) to fold compatibility characters
 *    (full-width forms, mathematical alphanumerics, ligatures, ...).
 * 6. Lowercase once more: NFKC can introduce uppercase ASCII from characters
 *    that have no case mapping of their own (e.g. U+1D400 MATHEMATICAL BOLD
 *    CAPITAL A becomes "A"), which step 1 cannot catch.
 *
 * @param text The input string to normalize.
 * @returns The normalized, lowercase string.
 */
export function normalizeUnicode(text: string): string {
  // Lowercase first so characters can match the homoglyph map keys.
  const lowercased = text.toLowerCase();

  // Replace homoglyphs before NFD so that pre-composed characters are still
  // intact when they are looked up. `for...of` walks code points, so
  // astral-plane characters are looked up as a whole rather than as halves
  // of a surrogate pair.
  let replaced = "";
  for (const char of lowercased) {
    // `??` instead of `||`: a mapping to the empty string must delete the
    // character rather than fall back to the original.
    replaced += homoglyphMap[char] ?? char;
  }

  // Decompose (NFD), then drop the combining diacritical marks.
  const withoutDiacritics = replaced
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");

  // NFKC folds compatibility characters; the trailing toLowerCase() covers
  // any uppercase letters that this folding produces.
  return withoutDiacritics.normalize("NFKC").toLowerCase();
}