Markdown parser fork with extended syntax for personal use.
at main 165 lines 4.9 kB view raw
1// To regenerate, run the following from the repository root: 2// 3// ```sh 4// cargo run --manifest-path generate/Cargo.toml 5// ``` 6 7use regex::Regex; 8use std::fs; 9 10#[tokio::main] 11async fn main() { 12 commonmark().await; 13 punctuation().await; 14} 15 16async fn commonmark() { 17 let url = "https://raw.githubusercontent.com/commonmark/commonmark-spec/0.31.2/spec.txt"; 18 let data_url = "commonmark-data.txt"; 19 let code_url = "tests/commonmark.rs"; 20 21 let value = if let Ok(value) = fs::read_to_string(data_url) { 22 value 23 } else { 24 let value = reqwest::get(url).await.unwrap().text().await.unwrap(); 25 26 fs::write(data_url, value.clone()).unwrap(); 27 28 value 29 }; 30 31 let re = Regex::new(r"(?m)(?:^`{32} example\n[\s\S]*?\n`{32}$|^#{1,6} *(.*)$)").unwrap(); 32 let re_heading_prefix = Regex::new(r"#{1,6} ").unwrap(); 33 let re_in_out = Regex::new(r"\n\.(?:\n|$)").unwrap(); 34 let mut current_heading = None; 35 let mut number = 1; 36 37 let value = Regex::new(r"<!-- END TESTS -->[\s\S]*") 38 .unwrap() 39 .replace(&value, ""); 40 let value = Regex::new(r"→").unwrap().replace_all(&value, "\t"); 41 let mut cases = vec![]; 42 43 for mat in re.find_iter(&value) { 44 let mut lines = mat.as_str().lines().collect::<Vec<_>>(); 45 46 if lines.len() == 1 { 47 current_heading = Some(re_heading_prefix.replace(lines[0], "").to_string()); 48 } else { 49 lines.remove(0); 50 lines.pop(); 51 let section = current_heading.as_ref().unwrap(); 52 let case = lines.join("\n"); 53 let parts = re_in_out.split(&case).collect::<Vec<_>>(); 54 let input = format!("{}\n", parts[0]); 55 let output = if parts[1].is_empty() { 56 "".into() 57 } else { 58 format!("{}\n", parts[1]) 59 }; 60 61 let test = format!(" assert_eq!(\n to_html_with_options(\n r###\"{}\"###,\n &danger\n )?,\n r###\"{}\"###,\n r###\"{} ({})\"###\n);", input, output, section, number); 62 63 cases.push(test); 64 65 number += 1; 66 } 67 } 68 69 let doc = format!( 70 "//! `CommonMark` test suite. 71 72// > 👉 **Important**: this module is generated by `generate/src/main.rs`. 73// > It is generate from the latest CommonMark website. 74 75use markdown::{{message, to_html_with_options, CompileOptions, Options}}; 76use pretty_assertions::assert_eq; 77 78#[rustfmt::skip] 79#[test] 80fn commonmark() -> Result<(), message::Message> {{ 81 let danger = Options {{ 82 compile: CompileOptions {{ 83 allow_dangerous_html: true, 84 allow_dangerous_protocol: true, 85 ..CompileOptions::default() 86 }}, 87 ..Options::default() 88 }}; 89 90{} 91 92 Ok(()) 93}} 94", 95 cases.join("\n\n") 96 ); 97 98 fs::write(code_url, doc).unwrap(); 99} 100 101async fn punctuation() { 102 let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"; 103 let data_url = "unicode-data.txt"; 104 let code_url = "src/util/unicode.rs"; 105 106 let value = if let Ok(value) = fs::read_to_string(data_url) { 107 value 108 } else { 109 let value = reqwest::get(url).await.unwrap().text().await.unwrap(); 110 111 fs::write(data_url, value.clone()).unwrap(); 112 113 value 114 }; 115 116 let search = [ 117 "Pc", // Punctuation, Connector 118 "Pd", // Punctuation, Dash 119 "Pe", // Punctuation, Close 120 "Pf", // Punctuation, FinalQuote 121 "Pi", // Punctuation, InitialQuote 122 "Po", // Punctuation, Other 123 "Ps", // Punctuation, Open 124 "Sc", // Symbol, Currency 125 "Sk", // Symbol, Modifier 126 "Sm", // Symbol, Math 127 "So", // Symbol, Other 128 ]; 129 130 let found = value 131 .lines() 132 .map(|line| line.split(';').collect::<Vec<_>>()) 133 .map(|cells| (cells[0], cells[2])) 134 .filter(|c| search.contains(&c.1)) 135 .map(|c| c.0) 136 .collect::<Vec<_>>(); 137 138 let doc = format!( 139 "//! Info on Unicode. 140 141/// List of characters that are considered punctuation. 142/// 143/// > 👉 **Important**: this module is generated by `generate/src/main.rs`. 144/// > It is generate from the latest Unicode data. 145/// 146/// Rust does not contain an `is_punctuation` method on `char`, while it does 147/// support [`is_ascii_alphanumeric`](char::is_ascii_alphanumeric). 148/// 149/// `CommonMark` handles attention (emphasis, strong) markers based on what 150/// comes before or after them. 151/// One such difference is if those characters are Unicode punctuation. 152/// 153/// ## References 154/// 155/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.31.2/#unicode-punctuation-character) 156pub static PUNCTUATION: [char; {}] = [ 157{} 158]; 159", 160 found.len(), 161 found.iter().map(|d| format!(" '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n") 162 ); 163 164 fs::write(code_url, doc).unwrap(); 165}