A library for ATProtocol identities.

feature: atproto-extras crate

Changed files
+1373 -3
crates
+27
Cargo.lock
··· 141 141 ] 142 142 143 143 [[package]] 144 + name = "atproto-extras" 145 + version = "0.13.0" 146 + dependencies = [ 147 + "anyhow", 148 + "async-trait", 149 + "atproto-identity", 150 + "atproto-record", 151 + "clap", 152 + "regex", 153 + "reqwest", 154 + "serde_json", 155 + "tokio", 156 + ] 157 + 158 + [[package]] 144 159 name = "atproto-identity" 145 160 version = "0.13.0" 146 161 dependencies = [ ··· 1876 1891 checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" 1877 1892 dependencies = [ 1878 1893 "bitflags", 1894 + ] 1895 + 1896 + [[package]] 1897 + name = "regex" 1898 + version = "1.12.2" 1899 + source = "registry+https://github.com/rust-lang/crates.io-index" 1900 + checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" 1901 + dependencies = [ 1902 + "aho-corasick", 1903 + "memchr", 1904 + "regex-automata", 1905 + "regex-syntax", 1879 1906 ] 1880 1907 1881 1908 [[package]]
+7 -3
Cargo.toml
··· 1 1 [workspace] 2 2 members = [ 3 3 "crates/atproto-client", 4 + "crates/atproto-extras", 4 5 "crates/atproto-identity", 5 6 "crates/atproto-jetstream", 6 7 "crates/atproto-oauth-aip", ··· 24 25 categories = ["command-line-utilities", "web-programming"] 25 26 26 27 [workspace.dependencies] 28 + atproto-attestation = { version = "0.13.0", path = "crates/atproto-attestation" } 27 29 atproto-client = { version = "0.13.0", path = "crates/atproto-client" } 30 + atproto-extras = { version = "0.13.0", path = "crates/atproto-extras" } 28 31 atproto-identity = { version = "0.13.0", path = "crates/atproto-identity" } 32 + atproto-jetstream = { version = "0.13.0", path = "crates/atproto-jetstream" } 29 33 atproto-oauth = { version = "0.13.0", path = "crates/atproto-oauth" } 30 - atproto-oauth-axum = { version = "0.13.0", path = "crates/atproto-oauth-axum" } 31 34 atproto-oauth-aip = { version = "0.13.0", path = "crates/atproto-oauth-aip" } 35 + atproto-oauth-axum = { version = "0.13.0", path = "crates/atproto-oauth-axum" } 32 36 atproto-record = { version = "0.13.0", path = "crates/atproto-record" } 33 37 atproto-xrpcs = { version = "0.13.0", path = "crates/atproto-xrpcs" } 34 - atproto-jetstream = { version = "0.13.0", path = "crates/atproto-jetstream" } 35 - atproto-attestation = { version = "0.13.0", path = "crates/atproto-attestation" } 36 38 39 + ammonia = "4.0" 37 40 anyhow = "1.0" 38 41 async-trait = "0.1" 39 42 base64 = "0.22" ··· 50 53 p256 = "0.13" 51 54 p384 = "0.13" 52 55 rand = "0.8" 56 + regex = "1.11" 53 57 reqwest = { version = "0.12", default-features = false, features = ["charset", "http2", "system-proxy", "json", "rustls-tls"] } 54 58 reqwest-chain = "1.0" 55 59 reqwest-middleware = { version = "0.4", features = ["json", "multipart"]}
+43
crates/atproto-extras/Cargo.toml
··· 1 + [package] 2 + name = "atproto-extras" 3 + version = "0.13.0" 4 + description = "AT Protocol extras - facet parsing and rich text utilities" 5 + readme = "README.md" 6 + homepage = "https://tangled.sh/@smokesignal.events/atproto-identity-rs" 7 + documentation = "https://docs.rs/atproto-extras" 8 + 9 + edition.workspace = true 10 + rust-version.workspace = true 11 + authors.workspace = true 12 + repository.workspace = true 13 + license.workspace = true 14 + keywords.workspace = true 15 + categories.workspace = true 16 + 17 + [dependencies] 18 + atproto-identity.workspace = true 19 + atproto-record.workspace = true 20 + 21 + anyhow.workspace = true 22 + async-trait.workspace = true 23 + clap = { workspace = true, optional = true } 24 + regex.workspace = true 25 + reqwest = { workspace = true, optional = true } 26 + serde_json = { workspace = true, optional = true } 27 + tokio = { workspace = true, optional = true } 28 + 29 + [dev-dependencies] 30 + tokio = { workspace = true, features = ["macros", "rt"] } 31 + 32 + [features] 33 + default = ["hickory-dns"] 34 + hickory-dns = ["atproto-identity/hickory-dns"] 35 + clap = ["dep:clap"] 36 + cli = ["dep:clap", "dep:serde_json", "dep:tokio", "dep:reqwest"] 37 + 38 + [[bin]] 39 + name = "atproto-extras-parse-facets" 40 + required-features = ["clap", "cli", "hickory-dns"] 41 + 42 + [lints] 43 + workspace = true
+128
crates/atproto-extras/README.md
··· 1 + # atproto-extras 2 + 3 + Extra utilities for AT Protocol applications, including rich text facet parsing. 4 + 5 + ## Features 6 + 7 + - **Facet Parsing**: Extract mentions (`@handle`), URLs, and hashtags (`#tag`) from plain text with correct UTF-8 byte offset calculation 8 + - **Identity Integration**: Resolve mention handles to DIDs during parsing 9 + 10 + ## Installation 11 + 12 + Add to your `Cargo.toml`: 13 + 14 + ```toml 15 + [dependencies] 16 + atproto-extras = "0.13" 17 + ``` 18 + 19 + ## Usage 20 + 21 + ### Parsing Text for Facets 22 + 23 + ```rust 24 + use atproto_extras::{parse_urls, parse_tags}; 25 + use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 26 + 27 + let text = "Check out https://example.com #rust"; 28 + 29 + // Parse URLs and tags - returns Vec<Facet> directly 30 + let url_facets = parse_urls(text); 31 + let tag_facets = parse_tags(text); 32 + 33 + // Each facet includes byte positions and typed features 34 + for facet in url_facets { 35 + if let Some(FacetFeature::Link(link)) = facet.features.first() { 36 + println!("URL at bytes {}..{}: {}", 37 + facet.index.byte_start, facet.index.byte_end, link.uri); 38 + } 39 + } 40 + 41 + for facet in tag_facets { 42 + if let Some(FacetFeature::Tag(tag)) = facet.features.first() { 43 + println!("Tag at bytes {}..{}: #{}", 44 + facet.index.byte_start, facet.index.byte_end, tag.tag); 45 + } 46 + } 47 + ``` 48 + 49 + ### Parsing Mentions 50 + 51 + Mention parsing requires an `IdentityResolver` to convert handles to DIDs: 52 + 53 + ```rust 54 + use atproto_extras::{parse_mentions, FacetLimits}; 55 + use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 56 + 57 + let text = "Hello @alice.bsky.social!"; 58 + let limits = FacetLimits::default(); 59 + 60 + // Requires an async context and IdentityResolver 61 + let facets = parse_mentions(text, &resolver, &limits).await; 62 + 63 + for facet in facets { 64 + if let Some(FacetFeature::Mention(mention)) = 
facet.features.first() { 65 + println!("Mention at bytes {}..{} resolved to {}", 66 + facet.index.byte_start, facet.index.byte_end, mention.did); 67 + } 68 + } 69 + ``` 70 + 71 + Mentions that cannot be resolved to a valid DID are automatically skipped. Mentions appearing within URLs are also excluded. 72 + 73 + ### Creating AT Protocol Facets 74 + 75 + ```rust 76 + use atproto_extras::{parse_facets_from_text, FacetLimits}; 77 + 78 + let text = "Hello @alice.bsky.social! Check https://rust-lang.org #rust"; 79 + let limits = FacetLimits::default(); 80 + 81 + // Requires an async context and IdentityResolver 82 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 83 + 84 + if let Some(facets) = facets { 85 + for facet in &facets { 86 + println!("Facet at {}..{}", facet.index.byte_start, facet.index.byte_end); 87 + } 88 + } 89 + ``` 90 + 91 + ## Byte Offset Handling 92 + 93 + AT Protocol facets use UTF-8 byte offsets, not character indices. This is critical for correct handling of multi-byte characters like emojis or non-ASCII text. 94 + 95 + ```rust 96 + use atproto_extras::parse_urls; 97 + 98 + // Text with emojis (multi-byte UTF-8 characters) 99 + let text = "✨ Check https://example.com ✨"; 100 + 101 + let facets = parse_urls(text); 102 + // Byte positions correctly account for the 3-byte emoji 103 + assert_eq!(facets[0].index.byte_start, 10); // After "✨ Check " (3 + 1 + 6 = 10 bytes) 104 + ``` 105 + 106 + ## Facet Limits 107 + 108 + Use `FacetLimits` to control the maximum number of facets processed: 109 + 110 + ```rust 111 + use atproto_extras::FacetLimits; 112 + 113 + // Default limits 114 + let limits = FacetLimits::default(); 115 + // mentions_max: 5, tags_max: 5, links_max: 5, max: 10 116 + 117 + // Custom limits 118 + let custom = FacetLimits { 119 + mentions_max: 10, 120 + tags_max: 10, 121 + links_max: 10, 122 + max: 20, 123 + }; 124 + ``` 125 + 126 + ## License 127 + 128 + MIT
+176
crates/atproto-extras/src/bin/atproto-extras-parse-facets.rs
··· 1 + //! Command-line tool for generating AT Protocol facet arrays from text. 2 + //! 3 + //! This tool parses a string and outputs the facet array in JSON format. 4 + //! Facets include mentions (@handle), URLs (https://...), and hashtags (#tag). 5 + //! 6 + //! By default, mentions are detected but output with placeholder DIDs. Use 7 + //! `--resolve-mentions` to resolve handles to actual DIDs (requires network access). 8 + //! 9 + //! # Usage 10 + //! 11 + //! ```bash 12 + //! # Parse facets without resolving mentions 13 + //! cargo run --features clap,serde_json,tokio,hickory-dns --bin atproto-extras-parse-facets -- "Check out https://example.com and #rust" 14 + //! 15 + //! # Resolve mentions to DIDs 16 + //! cargo run --features clap,serde_json,tokio,hickory-dns --bin atproto-extras-parse-facets -- --resolve-mentions "Hello @bsky.app!" 17 + //! ``` 18 + 19 + use atproto_extras::{FacetLimits, parse_mentions, parse_tags, parse_urls}; 20 + use atproto_identity::resolve::{HickoryDnsResolver, InnerIdentityResolver}; 21 + use atproto_record::lexicon::app::bsky::richtext::facet::{ 22 + ByteSlice, Facet, FacetFeature, Mention, 23 + }; 24 + use clap::Parser; 25 + use regex::bytes::Regex; 26 + use std::sync::Arc; 27 + 28 + /// Parse text and output AT Protocol facets as JSON. 29 + #[derive(Parser)] 30 + #[command( 31 + name = "atproto-extras-parse-facets", 32 + version, 33 + about = "Parse text and output AT Protocol facets as JSON", 34 + long_about = "This tool parses a string for mentions, URLs, and hashtags,\n\ 35 + then outputs the corresponding AT Protocol facet array in JSON format.\n\n\ 36 + By default, mentions are detected but output with placeholder DIDs.\n\ 37 + Use --resolve-mentions to resolve handles to actual DIDs (requires network)." 
38 + )] 39 + struct Args { 40 + /// The text to parse for facets 41 + text: String, 42 + 43 + /// Resolve mention handles to DIDs (requires network access) 44 + #[arg(long)] 45 + resolve_mentions: bool, 46 + 47 + /// Show debug information on stderr 48 + #[arg(long, short = 'd')] 49 + debug: bool, 50 + } 51 + 52 + /// Parse mention spans from text without resolution (returns placeholder DIDs). 53 + fn parse_mention_spans(text: &str) -> Vec<Facet> { 54 + let mut facets = Vec::new(); 55 + 56 + // Get URL ranges to exclude mentions within URLs 57 + let url_facets = parse_urls(text); 58 + 59 + // Same regex pattern as parse_mentions 60 + let mention_regex = Regex::new( 61 + r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)", 62 + ) 63 + .expect("Invalid mention regex"); 64 + 65 + let text_bytes = text.as_bytes(); 66 + 67 + for capture in mention_regex.captures_iter(text_bytes) { 68 + if let Some(mention_match) = capture.get(1) { 69 + let start = mention_match.start(); 70 + let end = mention_match.end(); 71 + 72 + // Check if this mention overlaps with any URL 73 + let overlaps_url = url_facets.iter().any(|facet| { 74 + (start >= facet.index.byte_start && start < facet.index.byte_end) 75 + || (end > facet.index.byte_start && end <= facet.index.byte_end) 76 + }); 77 + 78 + if !overlaps_url { 79 + let handle = std::str::from_utf8(&mention_match.as_bytes()[1..]) 80 + .unwrap_or_default() 81 + .to_string(); 82 + 83 + facets.push(Facet { 84 + index: ByteSlice { 85 + byte_start: start, 86 + byte_end: end, 87 + }, 88 + features: vec![FacetFeature::Mention(Mention { 89 + did: format!("did:plc:<unresolved:{}>", handle), 90 + })], 91 + }); 92 + } 93 + } 94 + } 95 + 96 + facets 97 + } 98 + 99 + #[tokio::main] 100 + async fn main() { 101 + let args = Args::parse(); 102 + let text = &args.text; 103 + let mut facets: Vec<Facet> = Vec::new(); 104 + let limits = FacetLimits::default(); 105 + 106 + // Parse mentions (either resolved 
or with placeholders) 107 + if args.resolve_mentions { 108 + let http_client = reqwest::Client::new(); 109 + let dns_resolver = HickoryDnsResolver::create_resolver(&[]); 110 + let resolver = InnerIdentityResolver { 111 + http_client, 112 + dns_resolver: Arc::new(dns_resolver), 113 + plc_hostname: "plc.directory".to_string(), 114 + }; 115 + let mention_facets = parse_mentions(text, &resolver, &limits).await; 116 + facets.extend(mention_facets); 117 + } else { 118 + let mention_facets = parse_mention_spans(text); 119 + facets.extend(mention_facets); 120 + } 121 + 122 + // Parse URLs 123 + let url_facets = parse_urls(text); 124 + facets.extend(url_facets); 125 + 126 + // Parse hashtags 127 + let tag_facets = parse_tags(text); 128 + facets.extend(tag_facets); 129 + 130 + // Sort facets by byte_start for consistent output 131 + facets.sort_by_key(|f| f.index.byte_start); 132 + 133 + // Output as JSON 134 + if facets.is_empty() { 135 + println!("null"); 136 + } else { 137 + match serde_json::to_string_pretty(&facets) { 138 + Ok(json) => println!("{}", json), 139 + Err(e) => { 140 + eprintln!( 141 + "error-atproto-extras-parse-facets-1 Error serializing facets: {}", 142 + e 143 + ); 144 + std::process::exit(1); 145 + } 146 + } 147 + } 148 + 149 + // Show debug info if requested 150 + if args.debug { 151 + eprintln!(); 152 + eprintln!("--- Debug Info ---"); 153 + eprintln!("Input text: {:?}", text); 154 + eprintln!("Text length: {} bytes", text.len()); 155 + eprintln!("Facets found: {}", facets.len()); 156 + eprintln!("Mentions resolved: {}", args.resolve_mentions); 157 + 158 + // Show byte slice verification 159 + let text_bytes = text.as_bytes(); 160 + for (i, facet) in facets.iter().enumerate() { 161 + let start = facet.index.byte_start; 162 + let end = facet.index.byte_end; 163 + let slice_text = 164 + std::str::from_utf8(&text_bytes[start..end]).unwrap_or("<invalid utf8>"); 165 + let feature_type = match &facet.features[0] { 166 + FacetFeature::Mention(_) => 
"mention", 167 + FacetFeature::Link(_) => "link", 168 + FacetFeature::Tag(_) => "tag", 169 + }; 170 + eprintln!( 171 + " [{}] {} @ bytes {}..{}: {:?}", 172 + i, feature_type, start, end, slice_text 173 + ); 174 + } 175 + } 176 + }
+942
crates/atproto-extras/src/facets.rs
··· 1 + //! Rich text facet parsing for AT Protocol. 2 + //! 3 + //! This module provides functionality for extracting semantic annotations (facets) 4 + //! from plain text. Facets include mentions, links (URLs), and hashtags. 5 + //! 6 + //! # Overview 7 + //! 8 + //! AT Protocol rich text uses "facets" to annotate specific byte ranges within text with 9 + //! semantic meaning. This module handles: 10 + //! 11 + //! - **Parsing**: Extract mentions, URLs, and hashtags from plain text 12 + //! - **Facet Creation**: Build proper AT Protocol facet structures with resolved DIDs 13 + //! 14 + //! # Byte Offset Calculation 15 + //! 16 + //! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol. 17 + //! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done 18 + //! using `regex::bytes::Regex` which operates on byte slices and returns byte positions, 19 + //! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars). 20 + //! 21 + //! # Example 22 + //! 23 + //! ```ignore 24 + //! use atproto_extras::facets::{parse_urls, parse_tags, FacetLimits}; 25 + //! use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 26 + //! 27 + //! let text = "Check out https://example.com #rust"; 28 + //! 29 + //! // Parse URLs and tags as Facet objects 30 + //! let url_facets = parse_urls(text); 31 + //! let tag_facets = parse_tags(text); 32 + //! 33 + //! // Access facet data directly 34 + //! for facet in url_facets { 35 + //! if let Some(FacetFeature::Link(link)) = facet.features.first() { 36 + //! println!("URL at bytes {}..{}: {}", 37 + //! facet.index.byte_start, facet.index.byte_end, link.uri); 38 + //! } 39 + //! } 40 + //! 
``` 41 + 42 + use atproto_identity::resolve::IdentityResolver; 43 + use atproto_record::lexicon::app::bsky::richtext::facet::{ 44 + ByteSlice, Facet, FacetFeature, Link, Mention, Tag, 45 + }; 46 + use regex::bytes::Regex; 47 + 48 + /// Configuration for facet parsing limits. 49 + /// 50 + /// These limits protect against abuse by capping the number of facets 51 + /// that will be processed. This is important for both performance and 52 + /// security when handling user-generated content. 53 + /// 54 + /// # Example 55 + /// 56 + /// ``` 57 + /// use atproto_extras::FacetLimits; 58 + /// 59 + /// // Use defaults 60 + /// let limits = FacetLimits::default(); 61 + /// 62 + /// // Or customize 63 + /// let custom = FacetLimits { 64 + /// mentions_max: 10, 65 + /// tags_max: 10, 66 + /// links_max: 10, 67 + /// max: 20, 68 + /// }; 69 + /// ``` 70 + #[derive(Debug, Clone, Copy)] 71 + pub struct FacetLimits { 72 + /// Maximum number of mention facets to process (default: 5) 73 + pub mentions_max: usize, 74 + /// Maximum number of tag facets to process (default: 5) 75 + pub tags_max: usize, 76 + /// Maximum number of link facets to process (default: 5) 77 + pub links_max: usize, 78 + /// Maximum total number of facets to process (default: 10) 79 + pub max: usize, 80 + } 81 + 82 + impl Default for FacetLimits { 83 + fn default() -> Self { 84 + Self { 85 + mentions_max: 5, 86 + tags_max: 5, 87 + links_max: 5, 88 + max: 10, 89 + } 90 + } 91 + } 92 + 93 + /// Parse mentions from text and return them as Facet objects with resolved DIDs. 94 + /// 95 + /// This function extracts AT Protocol handle mentions (e.g., `@alice.bsky.social`) 96 + /// from text, resolves each handle to a DID using the provided identity resolver, 97 + /// and returns AT Protocol Facet objects with Mention features. 98 + /// 99 + /// Mentions that cannot be resolved to a valid DID are skipped. Mentions that 100 + /// appear within URLs are also excluded to avoid false positives. 
101 + /// 102 + /// # Arguments 103 + /// 104 + /// * `text` - The text to parse for mentions 105 + /// * `identity_resolver` - Resolver for converting handles to DIDs 106 + /// * `limits` - Configuration for maximum mentions to process 107 + /// 108 + /// # Returns 109 + /// 110 + /// A vector of Facet objects for successfully resolved mentions. 111 + /// 112 + /// # Example 113 + /// 114 + /// ```ignore 115 + /// use atproto_extras::{parse_mentions, FacetLimits}; 116 + /// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 117 + /// 118 + /// let text = "Hello @alice.bsky.social!"; 119 + /// let limits = FacetLimits::default(); 120 + /// 121 + /// // Requires an async context and identity resolver 122 + /// let facets = parse_mentions(text, &resolver, &limits).await; 123 + /// 124 + /// for facet in facets { 125 + /// if let Some(FacetFeature::Mention(mention)) = facet.features.first() { 126 + /// println!("Mention {} resolved to {}", 127 + /// &text[facet.index.byte_start..facet.index.byte_end], 128 + /// mention.did); 129 + /// } 130 + /// } 131 + /// ``` 132 + pub async fn parse_mentions( 133 + text: &str, 134 + identity_resolver: &dyn IdentityResolver, 135 + limits: &FacetLimits, 136 + ) -> Vec<Facet> { 137 + let mut facets = Vec::new(); 138 + 139 + // First, parse all URLs to exclude mention matches within them 140 + let url_facets = parse_urls(text); 141 + 142 + // Regex based on: https://atproto.com/specs/handle#handle-identifier-syntax 143 + // Pattern: [$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?) 
144 + let mention_regex = Regex::new( 145 + r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)", 146 + ) 147 + .unwrap(); 148 + 149 + let text_bytes = text.as_bytes(); 150 + let mut mention_count = 0; 151 + 152 + for capture in mention_regex.captures_iter(text_bytes) { 153 + if mention_count >= limits.mentions_max { 154 + break; 155 + } 156 + 157 + if let Some(mention_match) = capture.get(1) { 158 + let start = mention_match.start(); 159 + let end = mention_match.end(); 160 + 161 + // Check if this mention overlaps with any URL 162 + let overlaps_url = url_facets.iter().any(|facet| { 163 + // Check if mention is within or overlaps the URL span 164 + (start >= facet.index.byte_start && start < facet.index.byte_end) 165 + || (end > facet.index.byte_start && end <= facet.index.byte_end) 166 + }); 167 + 168 + // Only process the mention if it doesn't overlap with a URL 169 + if !overlaps_url { 170 + let handle = std::str::from_utf8(&mention_match.as_bytes()[1..]) 171 + .unwrap_or_default() 172 + .to_string(); 173 + 174 + // Try to resolve the handle to a DID 175 + // First try with at:// prefix, then without 176 + let at_uri = format!("at://{}", handle); 177 + let did_result = match identity_resolver.resolve(&at_uri).await { 178 + Ok(doc) => Ok(doc), 179 + Err(_) => identity_resolver.resolve(&handle).await, 180 + }; 181 + 182 + // Only add the mention facet if we successfully resolved the DID 183 + if let Ok(did_doc) = did_result { 184 + facets.push(Facet { 185 + index: ByteSlice { 186 + byte_start: start, 187 + byte_end: end, 188 + }, 189 + features: vec![FacetFeature::Mention(Mention { 190 + did: did_doc.id.to_string(), 191 + })], 192 + }); 193 + mention_count += 1; 194 + } 195 + } 196 + } 197 + } 198 + 199 + facets 200 + } 201 + 202 + /// Parse URLs from text and return them as Facet objects. 
203 + /// 204 + /// This function extracts HTTP and HTTPS URLs from text with correct 205 + /// byte position tracking for UTF-8 text, returning AT Protocol Facet objects 206 + /// with Link features. 207 + /// 208 + /// # Supported URL Patterns 209 + /// 210 + /// - HTTP URLs: `http://example.com` 211 + /// - HTTPS URLs: `https://example.com` 212 + /// - URLs with paths, query strings, and fragments 213 + /// - URLs with subdomains: `https://www.example.com` 214 + /// 215 + /// # Example 216 + /// 217 + /// ``` 218 + /// use atproto_extras::parse_urls; 219 + /// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 220 + /// 221 + /// let text = "Visit https://example.com/path?query=1 for more info"; 222 + /// let facets = parse_urls(text); 223 + /// 224 + /// assert_eq!(facets.len(), 1); 225 + /// assert_eq!(facets[0].index.byte_start, 6); 226 + /// assert_eq!(facets[0].index.byte_end, 38); 227 + /// if let Some(FacetFeature::Link(link)) = facets[0].features.first() { 228 + /// assert_eq!(link.uri, "https://example.com/path?query=1"); 229 + /// } 230 + /// ``` 231 + /// 232 + /// # Multi-byte Character Handling 233 + /// 234 + /// Byte positions are correctly calculated even with emojis and other 235 + /// multi-byte UTF-8 characters: 236 + /// 237 + /// ``` 238 + /// use atproto_extras::parse_urls; 239 + /// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 240 + /// 241 + /// let text = "Check out https://example.com now!"; 242 + /// let facets = parse_urls(text); 243 + /// let text_bytes = text.as_bytes(); 244 + /// 245 + /// // The byte slice matches the URL 246 + /// let url_bytes = &text_bytes[facets[0].index.byte_start..facets[0].index.byte_end]; 247 + /// assert_eq!(std::str::from_utf8(url_bytes).unwrap(), "https://example.com"); 248 + /// ``` 249 + pub fn parse_urls(text: &str) -> Vec<Facet> { 250 + let mut facets = Vec::new(); 251 + 252 + // Partial/naive URL regex based on: https://stackoverflow.com/a/3809435 253 + 
// Pattern: [$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?) 254 + // Modified to use + instead of {1,6} to support longer TLDs and multi-level subdomains 255 + let url_regex = Regex::new( 256 + r"(?:^|[^\w])(https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)" 257 + ).unwrap(); 258 + 259 + let text_bytes = text.as_bytes(); 260 + for capture in url_regex.captures_iter(text_bytes) { 261 + if let Some(url_match) = capture.get(1) { 262 + let url = std::str::from_utf8(url_match.as_bytes()) 263 + .unwrap_or_default() 264 + .to_string(); 265 + 266 + facets.push(Facet { 267 + index: ByteSlice { 268 + byte_start: url_match.start(), 269 + byte_end: url_match.end(), 270 + }, 271 + features: vec![FacetFeature::Link(Link { uri: url })], 272 + }); 273 + } 274 + } 275 + 276 + facets 277 + } 278 + 279 + /// Parse hashtags from text and return them as Facet objects. 280 + /// 281 + /// This function extracts hashtags (e.g., `#rust`, `#ATProto`) from text, 282 + /// returning AT Protocol Facet objects with Tag features. 283 + /// It supports both standard `#` and full-width `#` (U+FF03) hash symbols. 284 + /// 285 + /// # Tag Syntax 286 + /// 287 + /// - Tags must start with `#` or `#` (full-width) 288 + /// - Tag content follows word character rules (`\w`) 289 + /// - Purely numeric tags (e.g., `#123`) are excluded 290 + /// 291 + /// # Example 292 + /// 293 + /// ``` 294 + /// use atproto_extras::parse_tags; 295 + /// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature; 296 + /// 297 + /// let text = "Learning #rust and #golang today! 
#100DaysOfCode"; 298 + /// let facets = parse_tags(text); 299 + /// 300 + /// assert_eq!(facets.len(), 3); 301 + /// if let Some(FacetFeature::Tag(tag)) = facets[0].features.first() { 302 + /// assert_eq!(tag.tag, "rust"); 303 + /// } 304 + /// if let Some(FacetFeature::Tag(tag)) = facets[1].features.first() { 305 + /// assert_eq!(tag.tag, "golang"); 306 + /// } 307 + /// if let Some(FacetFeature::Tag(tag)) = facets[2].features.first() { 308 + /// assert_eq!(tag.tag, "100DaysOfCode"); 309 + /// } 310 + /// ``` 311 + /// 312 + /// # Numeric Tags 313 + /// 314 + /// Purely numeric tags are excluded: 315 + /// 316 + /// ``` 317 + /// use atproto_extras::parse_tags; 318 + /// 319 + /// let text = "Item #42 is special"; 320 + /// let facets = parse_tags(text); 321 + /// 322 + /// // #42 is not extracted because it's purely numeric 323 + /// assert_eq!(facets.len(), 0); 324 + /// ``` 325 + pub fn parse_tags(text: &str) -> Vec<Facet> { 326 + let mut facets = Vec::new(); 327 + 328 + // Regex based on: https://github.com/bluesky-social/atproto/blob/d91988fe79030b61b556dd6f16a46f0c3b9d0b44/packages/api/src/rich-text/util.ts 329 + // Simplified for Rust - matches hashtags at word boundaries 330 + // Pattern matches: start of string or non-word char, then # or #, then tag content 331 + let tag_regex = Regex::new(r"(?:^|[^\w])([#\xEF\xBC\x83])([\w]+(?:[\w]*)*)").unwrap(); 332 + 333 + let text_bytes = text.as_bytes(); 334 + 335 + // Work with bytes for proper position tracking 336 + for capture in tag_regex.captures_iter(text_bytes) { 337 + if let (Some(full_match), Some(hash_match), Some(tag_match)) = 338 + (capture.get(0), capture.get(1), capture.get(2)) 339 + { 340 + // Calculate the absolute byte position of the hash symbol 341 + // The full match includes the preceding character (if any) 342 + // so we need to adjust for that 343 + let match_start = full_match.start(); 344 + let hash_offset = hash_match.start() - full_match.start(); 345 + let start = match_start + 
hash_offset; 346 + let end = match_start + hash_offset + hash_match.len() + tag_match.len(); 347 + 348 + // Extract just the tag text (without the hash symbol) 349 + let tag = std::str::from_utf8(tag_match.as_bytes()).unwrap_or_default(); 350 + 351 + // Only include tags that are not purely numeric 352 + if !tag.chars().all(|c| c.is_ascii_digit()) { 353 + facets.push(Facet { 354 + index: ByteSlice { 355 + byte_start: start, 356 + byte_end: end, 357 + }, 358 + features: vec![FacetFeature::Tag(Tag { 359 + tag: tag.to_string(), 360 + })], 361 + }); 362 + } 363 + } 364 + } 365 + 366 + facets 367 + } 368 + 369 + /// Parse facets from text and return a vector of Facet objects. 370 + /// 371 + /// This function extracts mentions, URLs, and hashtags from the provided text 372 + /// and creates AT Protocol facets with proper byte indices. 373 + /// 374 + /// Mentions are resolved to actual DIDs using the provided identity resolver. 375 + /// If a handle cannot be resolved to a DID, the mention facet is skipped. 376 + /// 377 + /// # Arguments 378 + /// 379 + /// * `text` - The text to extract facets from 380 + /// * `identity_resolver` - Resolver for converting handles to DIDs 381 + /// * `limits` - Configuration for maximum facets per type and total 382 + /// 383 + /// # Returns 384 + /// 385 + /// Optional vector of facets. Returns `None` if no facets were found. 386 + /// 387 + /// # Example 388 + /// 389 + /// ```ignore 390 + /// use atproto_extras::{parse_facets_from_text, FacetLimits}; 391 + /// 392 + /// let text = "Hello @alice.bsky.social! 
Check #rust at https://rust-lang.org"; 393 + /// let limits = FacetLimits::default(); 394 + /// 395 + /// // Requires an async context and identity resolver 396 + /// let facets = parse_facets_from_text(text, &resolver, &limits).await; 397 + /// 398 + /// if let Some(facets) = facets { 399 + /// for facet in &facets { 400 + /// println!("Facet at {}..{}", facet.index.byte_start, facet.index.byte_end); 401 + /// } 402 + /// } 403 + /// ``` 404 + /// 405 + /// # Mention Resolution 406 + /// 407 + /// Mentions are only included if the handle resolves to a valid DID: 408 + /// 409 + /// ```ignore 410 + /// let text = "@valid.handle.com and @invalid.handle.xyz"; 411 + /// let facets = parse_facets_from_text(text, &resolver, &limits).await; 412 + /// 413 + /// // Only @valid.handle.com appears as a facet if @invalid.handle.xyz 414 + /// // cannot be resolved to a DID 415 + /// ``` 416 + pub async fn parse_facets_from_text( 417 + text: &str, 418 + identity_resolver: &dyn IdentityResolver, 419 + limits: &FacetLimits, 420 + ) -> Option<Vec<Facet>> { 421 + let mut facets = Vec::new(); 422 + 423 + // Parse mentions (already limited by mentions_max in parse_mentions) 424 + let mention_facets = parse_mentions(text, identity_resolver, limits).await; 425 + facets.extend(mention_facets); 426 + 427 + // Parse URLs (limited by links_max) 428 + let url_facets = parse_urls(text); 429 + for (idx, facet) in url_facets.into_iter().enumerate() { 430 + if idx >= limits.links_max { 431 + break; 432 + } 433 + facets.push(facet); 434 + } 435 + 436 + // Parse hashtags (limited by tags_max) 437 + let tag_facets = parse_tags(text); 438 + for (idx, facet) in tag_facets.into_iter().enumerate() { 439 + if idx >= limits.tags_max { 440 + break; 441 + } 442 + facets.push(facet); 443 + } 444 + 445 + // Apply global facet limit (truncate if exceeds max) 446 + if facets.len() > limits.max { 447 + facets.truncate(limits.max); 448 + } 449 + 450 + // Only return facets if we found any 451 + if 
!facets.is_empty() { 452 + Some(facets) 453 + } else { 454 + None 455 + } 456 + } 457 + 458 + #[cfg(test)] 459 + mod tests { 460 + use async_trait::async_trait; 461 + use atproto_identity::model::Document; 462 + use std::collections::HashMap; 463 + 464 + use super::*; 465 + 466 + /// Mock identity resolver for testing 467 + struct MockIdentityResolver { 468 + handles_to_dids: HashMap<String, String>, 469 + } 470 + 471 + impl MockIdentityResolver { 472 + fn new() -> Self { 473 + let mut handles_to_dids = HashMap::new(); 474 + handles_to_dids.insert( 475 + "alice.bsky.social".to_string(), 476 + "did:plc:alice123".to_string(), 477 + ); 478 + handles_to_dids.insert( 479 + "at://alice.bsky.social".to_string(), 480 + "did:plc:alice123".to_string(), 481 + ); 482 + Self { handles_to_dids } 483 + } 484 + 485 + fn add_identity(&mut self, handle: &str, did: &str) { 486 + self.handles_to_dids 487 + .insert(handle.to_string(), did.to_string()); 488 + self.handles_to_dids 489 + .insert(format!("at://{}", handle), did.to_string()); 490 + } 491 + } 492 + 493 + #[async_trait] 494 + impl IdentityResolver for MockIdentityResolver { 495 + async fn resolve(&self, handle: &str) -> anyhow::Result<Document> { 496 + let handle_key = handle.to_string(); 497 + 498 + if let Some(did) = self.handles_to_dids.get(&handle_key) { 499 + Ok(Document { 500 + context: vec![], 501 + id: did.clone(), 502 + also_known_as: vec![format!("at://{}", handle_key.trim_start_matches("at://"))], 503 + verification_method: vec![], 504 + service: vec![], 505 + extra: HashMap::new(), 506 + }) 507 + } else { 508 + Err(anyhow::anyhow!("Handle not found")) 509 + } 510 + } 511 + } 512 + 513 + #[tokio::test] 514 + async fn test_parse_facets_from_text_comprehensive() { 515 + let mut resolver = MockIdentityResolver::new(); 516 + resolver.add_identity("bob.test.com", "did:plc:bob456"); 517 + 518 + let limits = FacetLimits::default(); 519 + let text = "Join @alice.bsky.social and @bob.test.com at https://example.com #rust 
#golang"; 520 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 521 + 522 + assert!(facets.is_some()); 523 + let facets = facets.unwrap(); 524 + assert_eq!(facets.len(), 5); // 2 mentions, 1 URL, 2 hashtags 525 + 526 + // Check first mention 527 + assert_eq!(facets[0].index.byte_start, 5); 528 + assert_eq!(facets[0].index.byte_end, 23); 529 + if let FacetFeature::Mention(ref mention) = facets[0].features[0] { 530 + assert_eq!(mention.did, "did:plc:alice123"); 531 + } else { 532 + panic!("Expected Mention feature"); 533 + } 534 + 535 + // Check second mention 536 + assert_eq!(facets[1].index.byte_start, 28); 537 + assert_eq!(facets[1].index.byte_end, 41); 538 + if let FacetFeature::Mention(mention) = &facets[1].features[0] { 539 + assert_eq!(mention.did, "did:plc:bob456"); 540 + } else { 541 + panic!("Expected Mention feature"); 542 + } 543 + 544 + // Check URL 545 + assert_eq!(facets[2].index.byte_start, 45); 546 + assert_eq!(facets[2].index.byte_end, 64); 547 + if let FacetFeature::Link(link) = &facets[2].features[0] { 548 + assert_eq!(link.uri, "https://example.com"); 549 + } else { 550 + panic!("Expected Link feature"); 551 + } 552 + 553 + // Check first hashtag 554 + assert_eq!(facets[3].index.byte_start, 65); 555 + assert_eq!(facets[3].index.byte_end, 70); 556 + if let FacetFeature::Tag(tag) = &facets[3].features[0] { 557 + assert_eq!(tag.tag, "rust"); 558 + } else { 559 + panic!("Expected Tag feature"); 560 + } 561 + 562 + // Check second hashtag 563 + assert_eq!(facets[4].index.byte_start, 71); 564 + assert_eq!(facets[4].index.byte_end, 78); 565 + if let FacetFeature::Tag(tag) = &facets[4].features[0] { 566 + assert_eq!(tag.tag, "golang"); 567 + } else { 568 + panic!("Expected Tag feature"); 569 + } 570 + } 571 + 572 + #[tokio::test] 573 + async fn test_parse_facets_from_text_with_unresolvable_mention() { 574 + let resolver = MockIdentityResolver::new(); 575 + let limits = FacetLimits::default(); 576 + 577 + // Only alice.bsky.social is 
in the resolver, not unknown.handle.com 578 + let text = "Contact @unknown.handle.com for details #rust"; 579 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 580 + 581 + assert!(facets.is_some()); 582 + let facets = facets.unwrap(); 583 + // Should only have 1 facet (the hashtag) since the mention couldn't be resolved 584 + assert_eq!(facets.len(), 1); 585 + 586 + // Check that it's the hashtag facet 587 + if let FacetFeature::Tag(tag) = &facets[0].features[0] { 588 + assert_eq!(tag.tag, "rust"); 589 + } else { 590 + panic!("Expected Tag feature"); 591 + } 592 + } 593 + 594 + #[tokio::test] 595 + async fn test_parse_facets_from_text_empty() { 596 + let resolver = MockIdentityResolver::new(); 597 + let limits = FacetLimits::default(); 598 + let text = "No mentions, URLs, or hashtags here"; 599 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 600 + assert!(facets.is_none()); 601 + } 602 + 603 + #[tokio::test] 604 + async fn test_parse_facets_from_text_url_with_at_mention() { 605 + let resolver = MockIdentityResolver::new(); 606 + let limits = FacetLimits::default(); 607 + 608 + // URLs with @ should not create mention facets 609 + let text = "Tangled https://tangled.org/@smokesignal.events"; 610 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 611 + 612 + assert!(facets.is_some()); 613 + let facets = facets.unwrap(); 614 + 615 + // Should have exactly 1 facet (the URL), not 2 (URL + mention) 616 + assert_eq!( 617 + facets.len(), 618 + 1, 619 + "Expected 1 facet (URL only), got {}", 620 + facets.len() 621 + ); 622 + 623 + // Verify it's a link facet, not a mention 624 + if let FacetFeature::Link(link) = &facets[0].features[0] { 625 + assert_eq!(link.uri, "https://tangled.org/@smokesignal.events"); 626 + } else { 627 + panic!("Expected Link feature, got Mention or Tag instead"); 628 + } 629 + } 630 + 631 + #[tokio::test] 632 + async fn test_parse_facets_with_mention_limit() { 633 + let mut resolver = 
MockIdentityResolver::new(); 634 + resolver.add_identity("bob.test.com", "did:plc:bob456"); 635 + resolver.add_identity("charlie.test.com", "did:plc:charlie789"); 636 + 637 + // Limit to 2 mentions 638 + let limits = FacetLimits { 639 + mentions_max: 2, 640 + tags_max: 5, 641 + links_max: 5, 642 + max: 10, 643 + }; 644 + 645 + let text = "Join @alice.bsky.social @bob.test.com @charlie.test.com"; 646 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 647 + 648 + assert!(facets.is_some()); 649 + let facets = facets.unwrap(); 650 + // Should only have 2 mentions (alice and bob), charlie should be skipped 651 + assert_eq!(facets.len(), 2); 652 + 653 + // Verify they're both mentions 654 + for facet in &facets { 655 + assert!(matches!(facet.features[0], FacetFeature::Mention(_))); 656 + } 657 + } 658 + 659 + #[tokio::test] 660 + async fn test_parse_facets_with_global_limit() { 661 + let mut resolver = MockIdentityResolver::new(); 662 + resolver.add_identity("bob.test.com", "did:plc:bob456"); 663 + 664 + // Very restrictive global limit 665 + let limits = FacetLimits { 666 + mentions_max: 5, 667 + tags_max: 5, 668 + links_max: 5, 669 + max: 3, // Only allow 3 total facets 670 + }; 671 + 672 + let text = 673 + "Join @alice.bsky.social @bob.test.com at https://example.com #rust #golang #python"; 674 + let facets = parse_facets_from_text(text, &resolver, &limits).await; 675 + 676 + assert!(facets.is_some()); 677 + let facets = facets.unwrap(); 678 + // Should be truncated to 3 facets total 679 + assert_eq!(facets.len(), 3); 680 + } 681 + 682 + #[test] 683 + fn test_parse_urls_multiple_links() { 684 + let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. 
Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; 685 + 686 + let facets = parse_urls(text); 687 + 688 + // Should find both URLs 689 + assert_eq!( 690 + facets.len(), 691 + 2, 692 + "Expected 2 URLs but found {}", 693 + facets.len() 694 + ); 695 + 696 + // Check first URL 697 + if let Some(FacetFeature::Link(link)) = facets[0].features.first() { 698 + assert_eq!(link.uri, "https://www.ietf.org/meeting/124/"); 699 + } else { 700 + panic!("Expected Link feature"); 701 + } 702 + 703 + // Check second URL 704 + if let Some(FacetFeature::Link(link)) = facets[1].features.first() { 705 + assert_eq!( 706 + link.uri, 707 + "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164" 708 + ); 709 + } else { 710 + panic!("Expected Link feature"); 711 + } 712 + } 713 + 714 + #[test] 715 + fn test_parse_urls_with_html_entity() { 716 + // Test with the HTML entity &amp; in the text 717 + let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd &amp; Tuesday, Mov 4th at ECTO Co-Op. 
Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; 718 + 719 + let facets = parse_urls(text); 720 + 721 + // Should find both URLs 722 + assert_eq!( 723 + facets.len(), 724 + 2, 725 + "Expected 2 URLs but found {}", 726 + facets.len() 727 + ); 728 + 729 + // Check first URL 730 + if let Some(FacetFeature::Link(link)) = facets[0].features.first() { 731 + assert_eq!(link.uri, "https://www.ietf.org/meeting/124/"); 732 + } else { 733 + panic!("Expected Link feature"); 734 + } 735 + 736 + // Check second URL 737 + if let Some(FacetFeature::Link(link)) = facets[1].features.first() { 738 + assert_eq!( 739 + link.uri, 740 + "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164" 741 + ); 742 + } else { 743 + panic!("Expected Link feature"); 744 + } 745 + } 746 + 747 + #[test] 748 + fn test_byte_offset_with_html_entities() { 749 + // This test demonstrates that HTML entity escaping shifts byte positions. 750 + // The byte positions shift: 751 + // In original: '&' is at byte 8 (1 byte) 752 + // In escaped: '&amp;' starts at byte 8 (5 bytes) 753 + // This causes facet byte offsets to be misaligned if text is escaped before rendering. 
754 + 755 + // If we have a URL after the ampersand in the original: 756 + let original_with_url = "Nov 3rd & Tuesday https://example.com"; 757 + let escaped_with_url = "Nov 3rd &amp; Tuesday https://example.com"; 758 + 759 + // Parse URLs from both versions 760 + let original_facets = parse_urls(original_with_url); 761 + let escaped_facets = parse_urls(escaped_with_url); 762 + 763 + // Both should find the URL, but at different byte positions 764 + assert_eq!(original_facets.len(), 1); 765 + assert_eq!(escaped_facets.len(), 1); 766 + 767 + // The byte positions will be different 768 + assert_eq!(original_facets[0].index.byte_start, 18); // After "Nov 3rd & Tuesday " 769 + assert_eq!(escaped_facets[0].index.byte_start, 22); // After "Nov 3rd &amp; Tuesday " (4 extra bytes for &amp;) 770 + } 771 + 772 + #[test] 773 + fn test_parse_urls_from_atproto_record_text() { 774 + // Test parsing URLs from real AT Protocol record description text. 775 + // This demonstrates the correct byte positions that should be used for facets. 776 + let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! 
https://atprotocalls.leaflet.pub/"; 777 + 778 + let facets = parse_urls(text); 779 + 780 + assert_eq!(facets.len(), 2, "Should find 2 URLs"); 781 + 782 + // First URL: https://stream.place/psingletary.com 783 + assert_eq!(facets[0].index.byte_start, 221); 784 + assert_eq!(facets[0].index.byte_end, 257); 785 + if let Some(FacetFeature::Link(link)) = facets[0].features.first() { 786 + assert_eq!(link.uri, "https://stream.place/psingletary.com"); 787 + } 788 + 789 + // Second URL: https://atprotocalls.leaflet.pub/ 790 + assert_eq!(facets[1].index.byte_start, 290); 791 + assert_eq!(facets[1].index.byte_end, 323); 792 + if let Some(FacetFeature::Link(link)) = facets[1].features.first() { 793 + assert_eq!(link.uri, "https://atprotocalls.leaflet.pub/"); 794 + } 795 + 796 + // Verify the byte slices match the expected text 797 + let text_bytes = text.as_bytes(); 798 + assert_eq!( 799 + std::str::from_utf8(&text_bytes[221..257]).unwrap(), 800 + "https://stream.place/psingletary.com" 801 + ); 802 + assert_eq!( 803 + std::str::from_utf8(&text_bytes[290..323]).unwrap(), 804 + "https://atprotocalls.leaflet.pub/" 805 + ); 806 + } 807 + 808 + #[tokio::test] 809 + async fn test_parse_mentions_basic() { 810 + let resolver = MockIdentityResolver::new(); 811 + let limits = FacetLimits::default(); 812 + let text = "Hello @alice.bsky.social!"; 813 + let facets = parse_mentions(text, &resolver, &limits).await; 814 + 815 + assert_eq!(facets.len(), 1); 816 + assert_eq!(facets[0].index.byte_start, 6); 817 + assert_eq!(facets[0].index.byte_end, 24); 818 + if let Some(FacetFeature::Mention(mention)) = facets[0].features.first() { 819 + assert_eq!(mention.did, "did:plc:alice123"); 820 + } else { 821 + panic!("Expected Mention feature"); 822 + } 823 + } 824 + 825 + #[tokio::test] 826 + async fn test_parse_mentions_multiple() { 827 + let mut resolver = MockIdentityResolver::new(); 828 + resolver.add_identity("bob.example.com", "did:plc:bob456"); 829 + let limits = FacetLimits::default(); 830 + 
let text = "CC @alice.bsky.social and @bob.example.com"; 831 + let facets = parse_mentions(text, &resolver, &limits).await; 832 + 833 + assert_eq!(facets.len(), 2); 834 + if let Some(FacetFeature::Mention(mention)) = facets[0].features.first() { 835 + assert_eq!(mention.did, "did:plc:alice123"); 836 + } 837 + if let Some(FacetFeature::Mention(mention)) = facets[1].features.first() { 838 + assert_eq!(mention.did, "did:plc:bob456"); 839 + } 840 + } 841 + 842 + #[tokio::test] 843 + async fn test_parse_mentions_unresolvable() { 844 + let resolver = MockIdentityResolver::new(); 845 + let limits = FacetLimits::default(); 846 + // unknown.handle.com is not in the resolver 847 + let text = "Hello @unknown.handle.com!"; 848 + let facets = parse_mentions(text, &resolver, &limits).await; 849 + 850 + // Should be empty since the handle can't be resolved 851 + assert_eq!(facets.len(), 0); 852 + } 853 + 854 + #[tokio::test] 855 + async fn test_parse_mentions_in_url_excluded() { 856 + let resolver = MockIdentityResolver::new(); 857 + let limits = FacetLimits::default(); 858 + // The @smokesignal.events is inside a URL and should not be parsed as a mention 859 + let text = "Check https://tangled.org/@smokesignal.events"; 860 + let facets = parse_mentions(text, &resolver, &limits).await; 861 + 862 + // Should be empty since the mention is inside a URL 863 + assert_eq!(facets.len(), 0); 864 + } 865 + 866 + #[test] 867 + fn test_parse_tags_basic() { 868 + let text = "Learning #rust today!"; 869 + let facets = parse_tags(text); 870 + 871 + assert_eq!(facets.len(), 1); 872 + assert_eq!(facets[0].index.byte_start, 9); 873 + assert_eq!(facets[0].index.byte_end, 14); 874 + if let Some(FacetFeature::Tag(tag)) = facets[0].features.first() { 875 + assert_eq!(tag.tag, "rust"); 876 + } else { 877 + panic!("Expected Tag feature"); 878 + } 879 + } 880 + 881 + #[test] 882 + fn test_parse_tags_multiple() { 883 + let text = "#rust #golang #python are great!"; 884 + let facets = parse_tags(text); 
885 + 886 + assert_eq!(facets.len(), 3); 887 + if let Some(FacetFeature::Tag(tag)) = facets[0].features.first() { 888 + assert_eq!(tag.tag, "rust"); 889 + } 890 + if let Some(FacetFeature::Tag(tag)) = facets[1].features.first() { 891 + assert_eq!(tag.tag, "golang"); 892 + } 893 + if let Some(FacetFeature::Tag(tag)) = facets[2].features.first() { 894 + assert_eq!(tag.tag, "python"); 895 + } 896 + } 897 + 898 + #[test] 899 + fn test_parse_tags_excludes_numeric() { 900 + let text = "Item #42 is special #test123"; 901 + let facets = parse_tags(text); 902 + 903 + // #42 should be excluded (purely numeric), #test123 should be included 904 + assert_eq!(facets.len(), 1); 905 + if let Some(FacetFeature::Tag(tag)) = facets[0].features.first() { 906 + assert_eq!(tag.tag, "test123"); 907 + } 908 + } 909 + 910 + #[test] 911 + fn test_parse_urls_basic() { 912 + let text = "Visit https://example.com today!"; 913 + let facets = parse_urls(text); 914 + 915 + assert_eq!(facets.len(), 1); 916 + assert_eq!(facets[0].index.byte_start, 6); 917 + assert_eq!(facets[0].index.byte_end, 25); 918 + if let Some(FacetFeature::Link(link)) = facets[0].features.first() { 919 + assert_eq!(link.uri, "https://example.com"); 920 + } 921 + } 922 + 923 + #[test] 924 + fn test_parse_urls_with_path() { 925 + let text = "Check https://example.com/path/to/page?query=1#section"; 926 + let facets = parse_urls(text); 927 + 928 + assert_eq!(facets.len(), 1); 929 + if let Some(FacetFeature::Link(link)) = facets[0].features.first() { 930 + assert_eq!(link.uri, "https://example.com/path/to/page?query=1#section"); 931 + } 932 + } 933 + 934 + #[test] 935 + fn test_facet_limits_default() { 936 + let limits = FacetLimits::default(); 937 + assert_eq!(limits.mentions_max, 5); 938 + assert_eq!(limits.tags_max, 5); 939 + assert_eq!(limits.links_max, 5); 940 + assert_eq!(limits.max, 10); 941 + } 942 + }
+50
crates/atproto-extras/src/lib.rs
··· 1 + //! Extra utilities for AT Protocol applications. 2 + //! 3 + //! This crate provides additional utilities that complement the core AT Protocol 4 + //! identity and record crates. Currently, it focuses on rich text facet parsing. 5 + //! 6 + //! ## Features 7 + //! 8 + //! - **Facet Parsing**: Extract mentions, URLs, and hashtags from plain text 9 + //! with correct UTF-8 byte offset calculation 10 + //! - **Identity Integration**: Resolve mention handles to DIDs during parsing 11 + //! 12 + //! ## Example 13 + //! 14 + //! ```ignore 15 + //! use atproto_extras::{parse_facets_from_text, FacetLimits}; 16 + //! 17 + //! // Parse facets from text (requires an IdentityResolver) 18 + //! let text = "Hello @alice.bsky.social! Check out https://example.com #rust"; 19 + //! let limits = FacetLimits::default(); 20 + //! let facets = parse_facets_from_text(text, &resolver, &limits).await; 21 + //! ``` 22 + //! 23 + //! ## Byte Offset Calculation 24 + //! 25 + //! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol. 26 + //! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done 27 + //! using `regex::bytes::Regex` which operates on byte slices and returns byte positions, 28 + //! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars). 29 + 30 + #![forbid(unsafe_code)] 31 + #![warn(missing_docs)] 32 + 33 + /// Rich text facet parsing for AT Protocol. 34 + /// 35 + /// This module provides functionality for extracting semantic annotations (facets) 36 + /// from plain text. Facets include: 37 + /// 38 + /// - **Mentions**: User handles prefixed with `@` (e.g., `@alice.bsky.social`) 39 + /// - **Links**: HTTP/HTTPS URLs 40 + /// - **Tags**: Hashtags prefixed with `#` or the full-width `＃` (e.g., `#rust`) 41 + /// 42 + /// ## Byte Offsets 43 + /// 44 + /// All facet indices use UTF-8 byte offsets, not character indices. 
This is 45 + /// critical for correct handling of multi-byte characters like emojis or 46 + /// non-ASCII text. 47 + pub mod facets; 48 + 49 + /// Re-export the facet limit type and parsing functions for convenience. 50 + pub use facets::{FacetLimits, parse_facets_from_text, parse_mentions, parse_tags, parse_urls};