Markdown parser fork with extended syntax for personal use.
at hack 78 lines 2.9 kB view raw
//! Normalize identifiers.

use alloc::string::String;

/// Normalize an identifier, as found in [references][label_end] and
/// [definitions][definition], so it can be compared when matching.
///
/// This collapses whitespace found in markdown (`\t`, `\r`, `\n`, and ` `)
/// into one space, trims it (as in, dropping the first and last space), and
/// then performs unicode case folding twice: first by lowercasing uppercase
/// characters, and then uppercasing lowercase characters.
///
/// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but
/// uppercasing their lowercase counterpart (U+03B8 (`θ`)) results in a
/// different uppercase character (U+0398 (`Θ`)).
/// Hence, to get that form, we perform both lower- and uppercase.
///
/// Performing these steps in that order works, but the inverse does not work.
/// To illustrate, say the source markdown contains two identifiers
/// `SS` (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to
/// `ss` (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both
/// uppercase to `SS` (U+0053 U+0053).
/// If we inverted the steps, for `ẞ`, we’d first uppercase without a
/// change, and then lowercase to `ß`, which would not match `ss`.
///
/// ## Examples
///
/// Note that, because uppercasing is the last step, the normalized form is
/// uppercase.
///
/// ```rust ignore
/// use markdown::util::normalize_identifier::normalize_identifier;
///
/// assert_eq!(normalize_identifier(" a "), "A");
/// assert_eq!(normalize_identifier("a\t\r\nb"), "A B");
/// assert_eq!(normalize_identifier("ПРИВЕТ"), "ПРИВЕТ");
/// assert_eq!(normalize_identifier("Привет"), "ПРИВЕТ");
/// assert_eq!(normalize_identifier("привет"), "ПРИВЕТ");
/// ```
///
/// ## References
///
/// * [`micromark-util-normalize-identifier` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-normalize-identifier)
///
/// [definition]: crate::construct::definition
/// [label_end]: crate::construct::label_end
pub fn normalize_identifier(value: &str) -> String {
    // Upper bound: collapsing and trimming whitespace can only shrink the text.
    let mut result = String::with_capacity(value.len());

    // Only these four characters count as whitespace here, so neither
    // `split_whitespace` (Unicode whitespace) nor `split_ascii_whitespace`
    // (which also includes form feed) can be used.
    let words = value
        .split(|c: char| matches!(c, '\t' | '\n' | '\r' | ' '))
        // Consecutive, leading, and trailing whitespace yield empty pieces.
        .filter(|word| !word.is_empty());

    for word in words {
        // Separate words with exactly one space.
        // Decide based on what was already emitted, not on the byte offset of
        // the previous word: the first word may start at offset 0, and
        // `a b` must not collapse into `ab`.
        if !result.is_empty() {
            result.push(' ');
        }

        result.push_str(word);
    }

    // Lowercase first, then uppercase; the order matters (e.g., for `ẞ`).
    result.to_lowercase().to_uppercase()
}