parser · nonbinary.computer/jacquard@c4daaad

+1

Cargo.lock

··· 2238 2238 "p256", 2239 2239 "percent-encoding", 2240 2240 "rand_core 0.6.4", 2241 + "regex", 2241 2242 "reqwest", 2242 2243 "serde", 2243 2244 "serde_html_form",

+3

Cargo.toml

··· 80 80 # Crypto curves and JOSE 81 81 p256 = "0.13" 82 82 jose-jwk = "0.1" 83 + 84 + # Text processing 85 + regex = "1.11"

+1

crates/jacquard/Cargo.toml

··· 146 146 url.workspace = true 147 147 smol_str.workspace = true 148 148 percent-encoding.workspace = true 149 + regex.workspace = true 149 150 jose-jwk = { workspace = true, features = ["p256"] } 150 151 p256 = { workspace = true, features = ["ecdsa"] } 151 152 rand_core.workspace = true

+213 -11

crates/jacquard/src/richtext.rs

··· 4 4 //! and detection of embed candidates (record and external embeds). 5 5 6 6 use crate::common::CowStr; 7 + use regex::Regex; 7 8 use std::marker::PhantomData; 8 9 use std::ops::Range; 10 + use std::sync::LazyLock; 11 + 12 + // Regex patterns based on Bluesky's official implementation 13 + // https://github.com/bluesky-social/atproto/blob/main/packages/api/src/rich-text/util.ts 14 + 15 + static MENTION_REGEX: LazyLock<Regex> = 16 + LazyLock::new(|| Regex::new(r"(^|\s|$)(@)([a-zA-Z0-9.-]+)(\b)").unwrap()); 17 + 18 + static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| { 19 + Regex::new(r"(^|\s|\()((https?://[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))") 20 + .unwrap() 21 + }); 22 + 23 + static TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| { 24 + // Simplified version - full unicode handling would need more work 25 + Regex::new(r"(^|\s)[#＃]([^\s\x{00AD}\x{2060}\x{200A}\x{200B}\x{200C}\x{200D}]+)").unwrap() 26 + }); 27 + 28 + static MARKDOWN_LINK_REGEX: LazyLock<Regex> = 29 + LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)$").unwrap()); 30 + 31 + static TRAILING_PUNCT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\p{P}+$").unwrap()); 9 32 10 33 /// Marker type indicating all facets are resolved (no handles pending DID resolution) 11 34 pub struct Resolved; ··· 47 70 48 71 /// Rich text builder supporting both parsing and manual construction 49 72 #[derive(Debug)] 50 - pub struct RichTextBuilder<'a, State> { 73 + pub struct RichTextBuilder<State> { 51 74 text: String, 52 - facet_candidates: Vec<FacetCandidate<'a>>, 75 + facet_candidates: Vec<FacetCandidate>, 53 76 _state: PhantomData<State>, 54 77 } 55 78 56 79 /// Internal representation of facet before resolution 80 + /// 81 + /// Stores minimal data to save memory: 82 + /// - Markdown links store URL (since syntax is stripped from text) 83 + /// - Mentions/tags store just ranges (@ and # included, extract at build time) 84 + /// - Links store just ranges (normalize URL at build time) 57 85 #[derive(Debug, Clone)] 58 - enum FacetCandidate<'a> { 86 + enum FacetCandidate { 87 + /// Markdown link: `[display](url)` → display text in final text 88 + MarkdownLink { 89 + /// Range of display text in final processed text 90 + display_range: Range<usize>, 91 + /// URL from markdown (not in final text, so must store) 92 + url: String, 93 + }, 94 + /// Mention: `@handle.bsky.social` 95 + /// Range includes the @ symbol, process at build time 59 96 Mention { 60 - handle_or_did: CowStr<'a>, 97 + /// Range in text including @ symbol 61 98 range: Range<usize>, 62 99 /// DID when provided, otherwise resolved later 63 100 did: Option<Did<'static>>, 64 101 }, 102 + /// Plain URL link 103 + /// Range points to URL in text, normalize at build time 65 104 Link { 66 - url: CowStr<'a>, 105 + /// Range in text pointing to URL (may need normalization) 67 106 range: Range<usize>, 68 107 }, 108 + /// Hashtag: `#tag` 109 + /// Range includes the # symbol, process at build time 69 110 Tag { 70 - tag: CowStr<'a>, 111 + /// Range in text including # symbol 71 112 range: Range<usize>, 72 113 }, 73 114 } 74 115 75 - impl<'a> RichTextBuilder<'a, Unresolved> { 76 - /// Entry point for parsing text with automatic facet detection 77 - pub fn parse(text: impl Into<String>) -> Self { 78 - todo!("Task 2") 116 + /// Entry point for parsing text with automatic facet detection 117 + pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> { 118 + let text = text.into(); 119 + let mut facet_candidates = Vec::new(); 120 + 121 + // Step 1: Detect and strip markdown links first 122 + let (text_processed, markdown_facets) = detect_markdown_links(&text); 123 + facet_candidates.extend(markdown_facets); 124 + 125 + // Step 2: Detect mentions 126 + let mention_facets = detect_mentions(&text_processed); 127 + facet_candidates.extend(mention_facets); 128 + 129 + // Step 3: Detect URLs 130 + let url_facets = detect_urls(&text_processed); 131 + facet_candidates.extend(url_facets); 132 + 133 + // Step 4: Detect tags 134 + let tag_facets = detect_tags(&text_processed); 135 + facet_candidates.extend(tag_facets); 136 + 137 + RichTextBuilder { 138 + text: text_processed, 139 + facet_candidates, 140 + #[cfg(feature = "api_bluesky")] 141 + embed_candidates: Vec::new(), 142 + _state: PhantomData, 79 143 } 80 144 } 81 145 82 - impl<'a> RichTextBuilder<'a, Resolved> { 146 + impl RichTextBuilder<Resolved> { 83 147 /// Entry point for manual richtext construction 84 148 pub fn builder() -> Self { 85 149 RichTextBuilder { ··· 200 264 }) 201 265 } 202 266 } 267 + 268 + fn detect_markdown_links(text: &str) -> (String, Vec<FacetCandidate>) { 269 + let mut result = String::with_capacity(text.len()); 270 + let mut facets = Vec::new(); 271 + let mut last_end = 0; 272 + let mut offset = 0; 273 + 274 + for cap in MARKDOWN_LINK_REGEX.captures_iter(text) { 275 + let full_match = cap.get(0).unwrap(); 276 + let display_text = cap.get(1).unwrap().as_str(); 277 + let url = cap.get(2).unwrap().as_str(); 278 + 279 + // Append text before this match 280 + result.push_str(&text[last_end..full_match.start()]); 281 + 282 + // Append only the display text (strip markdown syntax) 283 + let start = result.len() - offset; 284 + result.push_str(display_text); 285 + let end = result.len() - offset; 286 + 287 + // Track offset change (we removed the markdown syntax) 288 + offset += full_match.as_str().len() - display_text.len(); 289 + 290 + // Store URL string since it's not in the final text 291 + facets.push(FacetCandidate::MarkdownLink { 292 + display_range: start..end, 293 + url: url.to_string(), 294 + }); 295 + 296 + last_end = full_match.end(); 297 + } 298 + 299 + // Append remaining text 300 + result.push_str(&text[last_end..]); 301 + 302 + (result, facets) 303 + } 304 + 305 + fn detect_mentions(text: &str) -> Vec<FacetCandidate> { 306 + let mut facets = Vec::new(); 307 + 308 + for cap in MENTION_REGEX.captures_iter(text) { 309 + let handle = cap.get(3).unwrap().as_str(); 310 + 311 + if !HANDLE_REGEX.is_match(handle) && !DID_REGEX.is_match(handle) { 312 + continue; 313 + } 314 + 315 + let did = if let Ok(did) = Did::new(handle) { 316 + Some(did.into_static()) 317 + } else { 318 + None 319 + }; 320 + 321 + // Store range including @ symbol - extract text at build time 322 + let at_sign = cap.get(2).unwrap(); 323 + let start = at_sign.start(); 324 + let end = cap.get(3).unwrap().end(); 325 + 326 + facets.push(FacetCandidate::Mention { 327 + range: start..end, 328 + did, 329 + }); 330 + } 331 + 332 + facets 333 + } 334 + 335 + fn detect_urls(text: &str) -> Vec<FacetCandidate> { 336 + let mut facets = Vec::new(); 337 + 338 + for cap in URL_REGEX.captures_iter(text) { 339 + let url_match = if let Some(full_url) = cap.get(3) { 340 + full_url 341 + } else if let Some(_domain) = cap.name("domain") { 342 + // Bare domain - will prepend https:// at build time 343 + cap.get(2).unwrap() 344 + } else { 345 + continue; 346 + }; 347 + 348 + let url_str = url_match.as_str(); 349 + 350 + // Calculate actual end after stripping trailing punctuation 351 + let trimmed_len = if let Some(trimmed) = TRAILING_PUNCT_REGEX.find(url_str) { 352 + trimmed.start() 353 + } else { 354 + url_str.len() 355 + }; 356 + 357 + if trimmed_len == 0 { 358 + continue; 359 + } 360 + 361 + let start = url_match.start(); 362 + let end = start + trimmed_len; 363 + 364 + // Store just the range - normalize URL at build time 365 + facets.push(FacetCandidate::Link { range: start..end }); 366 + } 367 + 368 + facets 369 + } 370 + 371 + fn detect_tags(text: &str) -> Vec<FacetCandidate> { 372 + let mut facets = Vec::new(); 373 + 374 + for cap in TAG_REGEX.captures_iter(text) { 375 + let tag_match = cap.get(2).unwrap(); 376 + let tag_str = tag_match.as_str(); 377 + 378 + // Calculate trimmed length after stripping trailing punctuation 379 + let trimmed_len = if let Some(trimmed) = TRAILING_PUNCT_REGEX.find(tag_str) { 380 + trimmed.start() 381 + } else { 382 + tag_str.len() 383 + }; 384 + 385 + // Validate length (0-64 chars per Bluesky spec) 386 + if trimmed_len == 0 || trimmed_len > 64 { 387 + continue; 388 + } 389 + 390 + let hash_pos = cap.get(0).unwrap().start(); 391 + // Find the actual # character position 392 + let hash_start = text[hash_pos..] 393 + .chars() 394 + .position(|c| c == '#' || c == '＃') 395 + .unwrap(); 396 + let start = hash_pos + hash_start; 397 + let end = start + 1 + trimmed_len; // # + tag length 398 + 399 + // Store range including # symbol - extract and process at build time 400 + facets.push(FacetCandidate::Tag { range: start..end }); 401 + } 402 + 403 + facets 404 + }