···44//! and detection of embed candidates (record and external embeds).
5566use crate::common::CowStr;
77+use regex::Regex;
78use std::marker::PhantomData;
89use std::ops::Range;
1010+use std::sync::LazyLock;
1111+1212+// Regex patterns based on Bluesky's official implementation
1313+// https://github.com/bluesky-social/atproto/blob/main/packages/api/src/rich-text/util.ts
/// Matches an `@mention` that follows start-of-text, whitespace, or `(`.
///
/// Capture groups: 1 = the preceding delimiter (possibly empty), 2 = the `@`
/// sign, 3 = the handle characters (`[a-zA-Z0-9.-]+`), 4 = the trailing word
/// boundary (always empty).
static MENTION_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)").unwrap());

/// Matches either an explicit `http://` / `https://` URL or a bare domain,
/// each following start-of-text, whitespace, or `(`.
///
/// Capture groups: 1 = the preceding delimiter, 2 = the whole URL candidate,
/// 3 = the explicit-scheme form (only set for `http(s)://…`), and the named
/// group `domain` = the bare-domain part when no scheme is present.
static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(^|\s|\()((https?://[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))")
        .unwrap()
});

/// Matches a hashtag that follows start-of-text or whitespace.
///
/// Capture groups: 1 = the preceding delimiter, 2 = the tag body (everything
/// after the hash up to whitespace or one of the excluded invisible characters).
static TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    // Simplified version - full unicode handling would need more work
    Regex::new(r"(^|\s)[##]([^\s\x{00AD}\x{2060}\x{200A}\x{200B}\x{200C}\x{200D}]+)").unwrap()
});

/// Matches a markdown link `[display](url)`.
///
/// Capture groups: 1 = display text (no `]` inside), 2 = URL (no `)` inside).
static MARKDOWN_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());

/// Matches a run of one or more Unicode punctuation characters at the very
/// end of the haystack; used to trim URL and tag candidates.
static TRAILING_PUNCT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\p{P}+$").unwrap());
/// Marker type indicating all facets are resolved (no handles pending DID resolution)
///
/// Carries no data; it is only used as the `State` type parameter of
/// `RichTextBuilder`.
pub struct Resolved;
/// Rich text builder supporting both parsing and manual construction
///
/// `State` is a zero-sized marker (for example `Resolved` or `Unresolved`)
/// recorded only through `PhantomData`; it does not affect the stored data.
#[derive(Debug)]
pub struct RichTextBuilder<State> {
    /// The text the facet candidates refer to. When produced by `parse`,
    /// markdown link syntax has already been stripped from it.
    text: String,
    /// Facet candidates collected so far, in detection/insertion order.
    facet_candidates: Vec<FacetCandidate>,
    /// Compile-time state marker; holds no runtime data.
    _state: PhantomData<State>,
}
/// Internal representation of facet before resolution
///
/// Stores minimal data to save memory:
/// - Markdown links store URL (since syntax is stripped from text)
/// - Mentions/tags store just ranges (@ and # included, extract at build time)
/// - Links store just ranges (normalize URL at build time)
///
/// Ranges produced by the detectors in this file come from regex match
/// offsets and `String::len`, so they are byte offsets into the builder text.
#[derive(Debug, Clone)]
enum FacetCandidate {
    /// Markdown link: `[display](url)` → display text in final text
    MarkdownLink {
        /// Range of display text in final processed text
        display_range: Range<usize>,
        /// URL from markdown (not in final text, so must store)
        url: String,
    },
    /// Mention: `@handle.bsky.social`
    /// Range includes the @ symbol, process at build time
    Mention {
        /// Range in text including @ symbol
        range: Range<usize>,
        /// DID when provided, otherwise resolved later
        did: Option<Did<'static>>,
    },
    /// Plain URL link
    /// Range points to URL in text, normalize at build time
    Link {
        /// Range in text pointing to URL (may need normalization)
        range: Range<usize>,
    },
    /// Hashtag: `#tag`
    /// Range includes the # symbol, process at build time
    Tag {
        /// Range in text including # symbol
        range: Range<usize>,
    },
}
741157575-impl<'a> RichTextBuilder<'a, Unresolved> {
7676- /// Entry point for parsing text with automatic facet detection
7777- pub fn parse(text: impl Into<String>) -> Self {
7878- todo!("Task 2")
116116+/// Entry point for parsing text with automatic facet detection
117117+pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> {
118118+ let text = text.into();
119119+ let mut facet_candidates = Vec::new();
120120+121121+ // Step 1: Detect and strip markdown links first
122122+ let (text_processed, markdown_facets) = detect_markdown_links(&text);
123123+ facet_candidates.extend(markdown_facets);
124124+125125+ // Step 2: Detect mentions
126126+ let mention_facets = detect_mentions(&text_processed);
127127+ facet_candidates.extend(mention_facets);
128128+129129+ // Step 3: Detect URLs
130130+ let url_facets = detect_urls(&text_processed);
131131+ facet_candidates.extend(url_facets);
132132+133133+ // Step 4: Detect tags
134134+ let tag_facets = detect_tags(&text_processed);
135135+ facet_candidates.extend(tag_facets);
136136+137137+ RichTextBuilder {
138138+ text: text_processed,
139139+ facet_candidates,
140140+ #[cfg(feature = "api_bluesky")]
141141+ embed_candidates: Vec::new(),
142142+ _state: PhantomData,
79143 }
80144}
811458282-impl<'a> RichTextBuilder<'a, Resolved> {
146146+impl RichTextBuilder<Resolved> {
83147 /// Entry point for manual richtext construction
84148 pub fn builder() -> Self {
85149 RichTextBuilder {
···200264 })
201265 }
202266}
267267+268268+fn detect_markdown_links(text: &str) -> (String, Vec<FacetCandidate>) {
269269+ let mut result = String::with_capacity(text.len());
270270+ let mut facets = Vec::new();
271271+ let mut last_end = 0;
272272+ let mut offset = 0;
273273+274274+ for cap in MARKDOWN_LINK_REGEX.captures_iter(text) {
275275+ let full_match = cap.get(0).unwrap();
276276+ let display_text = cap.get(1).unwrap().as_str();
277277+ let url = cap.get(2).unwrap().as_str();
278278+279279+ // Append text before this match
280280+ result.push_str(&text[last_end..full_match.start()]);
281281+282282+ // Append only the display text (strip markdown syntax)
283283+ let start = result.len() - offset;
284284+ result.push_str(display_text);
285285+ let end = result.len() - offset;
286286+287287+ // Track offset change (we removed the markdown syntax)
288288+ offset += full_match.as_str().len() - display_text.len();
289289+290290+ // Store URL string since it's not in the final text
291291+ facets.push(FacetCandidate::MarkdownLink {
292292+ display_range: start..end,
293293+ url: url.to_string(),
294294+ });
295295+296296+ last_end = full_match.end();
297297+ }
298298+299299+ // Append remaining text
300300+ result.push_str(&text[last_end..]);
301301+302302+ (result, facets)
303303+}
304304+305305+fn detect_mentions(text: &str) -> Vec<FacetCandidate> {
306306+ let mut facets = Vec::new();
307307+308308+ for cap in MENTION_REGEX.captures_iter(text) {
309309+ let handle = cap.get(3).unwrap().as_str();
310310+311311+ if !HANDLE_REGEX.is_match(handle) && !DID_REGEX.is_match(handle) {
312312+ continue;
313313+ }
314314+315315+ let did = if let Ok(did) = Did::new(handle) {
316316+ Some(did.into_static())
317317+ } else {
318318+ None
319319+ };
320320+321321+ // Store range including @ symbol - extract text at build time
322322+ let at_sign = cap.get(2).unwrap();
323323+ let start = at_sign.start();
324324+ let end = cap.get(3).unwrap().end();
325325+326326+ facets.push(FacetCandidate::Mention {
327327+ range: start..end,
328328+ did,
329329+ });
330330+ }
331331+332332+ facets
333333+}
334334+335335+fn detect_urls(text: &str) -> Vec<FacetCandidate> {
336336+ let mut facets = Vec::new();
337337+338338+ for cap in URL_REGEX.captures_iter(text) {
339339+ let url_match = if let Some(full_url) = cap.get(3) {
340340+ full_url
341341+ } else if let Some(_domain) = cap.name("domain") {
342342+ // Bare domain - will prepend https:// at build time
343343+ cap.get(2).unwrap()
344344+ } else {
345345+ continue;
346346+ };
347347+348348+ let url_str = url_match.as_str();
349349+350350+ // Calculate actual end after stripping trailing punctuation
351351+ let trimmed_len = if let Some(trimmed) = TRAILING_PUNCT_REGEX.find(url_str) {
352352+ trimmed.start()
353353+ } else {
354354+ url_str.len()
355355+ };
356356+357357+ if trimmed_len == 0 {
358358+ continue;
359359+ }
360360+361361+ let start = url_match.start();
362362+ let end = start + trimmed_len;
363363+364364+ // Store just the range - normalize URL at build time
365365+ facets.push(FacetCandidate::Link { range: start..end });
366366+ }
367367+368368+ facets
369369+}
370370+371371+fn detect_tags(text: &str) -> Vec<FacetCandidate> {
372372+ let mut facets = Vec::new();
373373+374374+ for cap in TAG_REGEX.captures_iter(text) {
375375+ let tag_match = cap.get(2).unwrap();
376376+ let tag_str = tag_match.as_str();
377377+378378+ // Calculate trimmed length after stripping trailing punctuation
379379+ let trimmed_len = if let Some(trimmed) = TRAILING_PUNCT_REGEX.find(tag_str) {
380380+ trimmed.start()
381381+ } else {
382382+ tag_str.len()
383383+ };
384384+385385+ // Validate length (0-64 chars per Bluesky spec)
386386+ if trimmed_len == 0 || trimmed_len > 64 {
387387+ continue;
388388+ }
389389+390390+ let hash_pos = cap.get(0).unwrap().start();
391391+ // Find the actual # character position
392392+ let hash_start = text[hash_pos..]
393393+ .chars()
394394+ .position(|c| c == '#' || c == '#')
395395+ .unwrap();
396396+ let start = hash_pos + hash_start;
397397+ let end = start + 1 + trimmed_len; // # + tag length
398398+399399+ // Store range including # symbol - extract and process at build time
400400+ facets.push(FacetCandidate::Tag { range: start..end });
401401+ }
402402+403403+ facets
404404+}