Markdown parser fork with extended syntax for personal use.
1//! Normalize identifiers.
2
3use alloc::string::String;
4
/// Normalize an identifier, as found in [references][label_end] and
/// [definitions][definition], so it can be compared when matching.
///
/// This collapses each run of markdown whitespace (`\t`, `\r`, `\n`, and
/// ` `) into one space, trims the result (as in, dropping leading and
/// trailing whitespace), and then performs unicode case folding twice: first
/// by lowercasing uppercase characters, and then uppercasing lowercase
/// characters.
/// The returned string is therefore in its uppercased form.
///
/// Only the four characters listed above count as whitespace; other
/// characters (including other Unicode whitespace) are kept as is.
///
/// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but if
/// their lowercase counterpart (U+03B8 (`θ`)) is uppercased, the result is a
/// different uppercase character (U+0398 (`Θ`)).
/// Hence, to get that form, we perform both lower- and uppercase.
///
/// Performing these steps in that order works, but the inverse does not work.
/// To illustrate, say the source markdown contains two identifiers
/// `SS` (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to
/// `ss` (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both
/// uppercase to `SS` (U+0053 U+0053).
/// If we’d inverse the steps, for `ẞ`, we’d first uppercase without a
/// change, and then lowercase to `ß`, which would not match `ss`.
///
/// ## Examples
///
/// ```rust ignore
/// use markdown::util::normalize_identifier::normalize_identifier;
///
/// assert_eq!(normalize_identifier(" a "), "A");
/// assert_eq!(normalize_identifier("a\t\r\nb"), "A B");
/// assert_eq!(normalize_identifier("ПРИВЕТ"), "ПРИВЕТ");
/// assert_eq!(normalize_identifier("Привет"), "ПРИВЕТ");
/// assert_eq!(normalize_identifier("привет"), "ПРИВЕТ");
/// ```
///
/// ## References
///
/// * [`micromark-util-normalize-identifier` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-normalize-identifier)
///
/// [definition]: crate::construct::definition
/// [label_end]: crate::construct::label_end
pub fn normalize_identifier(value: &str) -> String {
    // Note: this is an upper bound, collapsing whitespace only ever shrinks
    // the text.
    let mut result = String::with_capacity(value.len());

    // Splitting on every whitespace character yields empty pieces for
    // leading, trailing, and consecutive whitespace; dropping those leaves
    // exactly the words.
    let words = value
        .split(|c: char| matches!(c, '\t' | '\n' | '\r' | ' '))
        .filter(|word| !word.is_empty());

    for word in words {
        // Separate from the previous word.
        // Checking the output (instead of the byte offset of the word) also
        // works when the first word starts at the very beginning of `value`.
        if !result.is_empty() {
            result.push(' ');
        }

        result.push_str(word);
    }

    // The order matters: lowercase first, then uppercase (see docs above).
    result.to_lowercase().to_uppercase()
}