+27
Cargo.lock
+27
Cargo.lock
···
141
]
142
143
[[package]]
144
+
name = "atproto-extras"
145
+
version = "0.13.0"
146
+
dependencies = [
147
+
"anyhow",
148
+
"async-trait",
149
+
"atproto-identity",
150
+
"atproto-record",
151
+
"clap",
152
+
"regex",
153
+
"reqwest",
154
+
"serde_json",
155
+
"tokio",
156
+
]
157
+
158
+
[[package]]
159
name = "atproto-identity"
160
version = "0.13.0"
161
dependencies = [
···
1891
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
1892
dependencies = [
1893
"bitflags",
1894
+
]
1895
+
1896
+
[[package]]
1897
+
name = "regex"
1898
+
version = "1.12.2"
1899
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1900
+
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
1901
+
dependencies = [
1902
+
"aho-corasick",
1903
+
"memchr",
1904
+
"regex-automata",
1905
+
"regex-syntax",
1906
]
1907
1908
[[package]]
+7
-3
Cargo.toml
+7
-3
Cargo.toml
···
1
[workspace]
2
members = [
3
"crates/atproto-client",
4
"crates/atproto-identity",
5
"crates/atproto-jetstream",
6
"crates/atproto-oauth-aip",
···
24
categories = ["command-line-utilities", "web-programming"]
25
26
[workspace.dependencies]
27
atproto-client = { version = "0.13.0", path = "crates/atproto-client" }
28
atproto-identity = { version = "0.13.0", path = "crates/atproto-identity" }
29
atproto-oauth = { version = "0.13.0", path = "crates/atproto-oauth" }
30
-
atproto-oauth-axum = { version = "0.13.0", path = "crates/atproto-oauth-axum" }
31
atproto-oauth-aip = { version = "0.13.0", path = "crates/atproto-oauth-aip" }
32
atproto-record = { version = "0.13.0", path = "crates/atproto-record" }
33
atproto-xrpcs = { version = "0.13.0", path = "crates/atproto-xrpcs" }
34
-
atproto-jetstream = { version = "0.13.0", path = "crates/atproto-jetstream" }
35
-
atproto-attestation = { version = "0.13.0", path = "crates/atproto-attestation" }
36
37
anyhow = "1.0"
38
async-trait = "0.1"
39
base64 = "0.22"
···
50
p256 = "0.13"
51
p384 = "0.13"
52
rand = "0.8"
53
reqwest = { version = "0.12", default-features = false, features = ["charset", "http2", "system-proxy", "json", "rustls-tls"] }
54
reqwest-chain = "1.0"
55
reqwest-middleware = { version = "0.4", features = ["json", "multipart"]}
···
1
[workspace]
2
members = [
3
"crates/atproto-client",
4
+
"crates/atproto-extras",
5
"crates/atproto-identity",
6
"crates/atproto-jetstream",
7
"crates/atproto-oauth-aip",
···
25
categories = ["command-line-utilities", "web-programming"]
26
27
[workspace.dependencies]
28
+
atproto-attestation = { version = "0.13.0", path = "crates/atproto-attestation" }
29
atproto-client = { version = "0.13.0", path = "crates/atproto-client" }
30
+
atproto-extras = { version = "0.13.0", path = "crates/atproto-extras" }
31
atproto-identity = { version = "0.13.0", path = "crates/atproto-identity" }
32
+
atproto-jetstream = { version = "0.13.0", path = "crates/atproto-jetstream" }
33
atproto-oauth = { version = "0.13.0", path = "crates/atproto-oauth" }
34
atproto-oauth-aip = { version = "0.13.0", path = "crates/atproto-oauth-aip" }
35
+
atproto-oauth-axum = { version = "0.13.0", path = "crates/atproto-oauth-axum" }
36
atproto-record = { version = "0.13.0", path = "crates/atproto-record" }
37
atproto-xrpcs = { version = "0.13.0", path = "crates/atproto-xrpcs" }
38
39
+
ammonia = "4.0"
40
anyhow = "1.0"
41
async-trait = "0.1"
42
base64 = "0.22"
···
53
p256 = "0.13"
54
p384 = "0.13"
55
rand = "0.8"
56
+
regex = "1.11"
57
reqwest = { version = "0.12", default-features = false, features = ["charset", "http2", "system-proxy", "json", "rustls-tls"] }
58
reqwest-chain = "1.0"
59
reqwest-middleware = { version = "0.4", features = ["json", "multipart"]}
+43
crates/atproto-extras/Cargo.toml
+43
crates/atproto-extras/Cargo.toml
···
···
1
+
[package]
2
+
name = "atproto-extras"
3
+
version = "0.13.0"
4
+
description = "AT Protocol extras - facet parsing and rich text utilities"
5
+
readme = "README.md"
6
+
homepage = "https://tangled.sh/@smokesignal.events/atproto-identity-rs"
7
+
documentation = "https://docs.rs/atproto-extras"
8
+
9
+
edition.workspace = true
10
+
rust-version.workspace = true
11
+
authors.workspace = true
12
+
repository.workspace = true
13
+
license.workspace = true
14
+
keywords.workspace = true
15
+
categories.workspace = true
16
+
17
+
[dependencies]
18
+
atproto-identity.workspace = true
19
+
atproto-record.workspace = true
20
+
21
+
anyhow.workspace = true
22
+
async-trait.workspace = true
23
+
clap = { workspace = true, optional = true }
24
+
regex.workspace = true
25
+
reqwest = { workspace = true, optional = true }
26
+
serde_json = { workspace = true, optional = true }
27
+
tokio = { workspace = true, optional = true }
28
+
29
+
[dev-dependencies]
30
+
tokio = { workspace = true, features = ["macros", "rt"] }
31
+
32
+
[features]
33
+
default = ["hickory-dns"]
34
+
hickory-dns = ["atproto-identity/hickory-dns"]
35
+
clap = ["dep:clap"]
36
+
cli = ["dep:clap", "dep:serde_json", "dep:tokio", "dep:reqwest"]
37
+
38
+
[[bin]]
39
+
name = "atproto-extras-parse-facets"
40
+
required-features = ["clap", "cli", "hickory-dns"]
41
+
42
+
[lints]
43
+
workspace = true
+128
crates/atproto-extras/README.md
+128
crates/atproto-extras/README.md
···
···
1
+
# atproto-extras
2
+
3
+
Extra utilities for AT Protocol applications, including rich text facet parsing.
4
+
5
+
## Features
6
+
7
+
- **Facet Parsing**: Extract mentions (`@handle`), URLs, and hashtags (`#tag`) from plain text with correct UTF-8 byte offset calculation
8
+
- **Identity Integration**: Resolve mention handles to DIDs during parsing
9
+
10
+
## Installation
11
+
12
+
Add to your `Cargo.toml`:
13
+
14
+
```toml
15
+
[dependencies]
16
+
atproto-extras = "0.13"
17
+
```
18
+
19
+
## Usage
20
+
21
+
### Parsing Text for Facets
22
+
23
+
```rust
24
+
use atproto_extras::{parse_urls, parse_tags};
25
+
use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
26
+
27
+
let text = "Check out https://example.com #rust";
28
+
29
+
// Parse URLs and tags - returns Vec<Facet> directly
30
+
let url_facets = parse_urls(text);
31
+
let tag_facets = parse_tags(text);
32
+
33
+
// Each facet includes byte positions and typed features
34
+
for facet in url_facets {
35
+
if let Some(FacetFeature::Link(link)) = facet.features.first() {
36
+
println!("URL at bytes {}..{}: {}",
37
+
facet.index.byte_start, facet.index.byte_end, link.uri);
38
+
}
39
+
}
40
+
41
+
for facet in tag_facets {
42
+
if let Some(FacetFeature::Tag(tag)) = facet.features.first() {
43
+
println!("Tag at bytes {}..{}: #{}",
44
+
facet.index.byte_start, facet.index.byte_end, tag.tag);
45
+
}
46
+
}
47
+
```
48
+
49
+
### Parsing Mentions
50
+
51
+
Mention parsing requires an `IdentityResolver` to convert handles to DIDs:
52
+
53
+
```rust
54
+
use atproto_extras::{parse_mentions, FacetLimits};
55
+
use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
56
+
57
+
let text = "Hello @alice.bsky.social!";
58
+
let limits = FacetLimits::default();
59
+
60
+
// Requires an async context and IdentityResolver
61
+
let facets = parse_mentions(text, &resolver, &limits).await;
62
+
63
+
for facet in facets {
64
+
if let Some(FacetFeature::Mention(mention)) = facet.features.first() {
65
+
println!("Mention at bytes {}..{} resolved to {}",
66
+
facet.index.byte_start, facet.index.byte_end, mention.did);
67
+
}
68
+
}
69
+
```
70
+
71
+
Mentions that cannot be resolved to a valid DID are automatically skipped. Mentions appearing within URLs are also excluded.
72
+
73
+
### Creating AT Protocol Facets
74
+
75
+
```rust
76
+
use atproto_extras::{parse_facets_from_text, FacetLimits};
77
+
78
+
let text = "Hello @alice.bsky.social! Check https://rust-lang.org #rust";
79
+
let limits = FacetLimits::default();
80
+
81
+
// Requires an async context and IdentityResolver
82
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
83
+
84
+
if let Some(facets) = facets {
85
+
for facet in &facets {
86
+
println!("Facet at {}..{}", facet.index.byte_start, facet.index.byte_end);
87
+
}
88
+
}
89
+
```
90
+
91
+
## Byte Offset Handling
92
+
93
+
AT Protocol facets use UTF-8 byte offsets, not character indices. This is critical for correct handling of multi-byte characters like emojis or non-ASCII text.
94
+
95
+
```rust
96
+
use atproto_extras::parse_urls;
97
+
98
+
// Text with emojis (multi-byte UTF-8 characters)
99
+
let text = "✨ Check https://example.com ✨";
100
+
101
+
let facets = parse_urls(text);
102
+
// Byte positions correctly account for the 3-byte emoji
103
+
assert_eq!(facets[0].index.byte_start, 10); // After "✨ Check " (3 + 1 + 6 = 10 bytes)
104
+
```
105
+
106
+
## Facet Limits
107
+
108
+
Use `FacetLimits` to control the maximum number of facets processed:
109
+
110
+
```rust
111
+
use atproto_extras::FacetLimits;
112
+
113
+
// Default limits
114
+
let limits = FacetLimits::default();
115
+
// mentions_max: 5, tags_max: 5, links_max: 5, max: 10
116
+
117
+
// Custom limits
118
+
let custom = FacetLimits {
119
+
mentions_max: 10,
120
+
tags_max: 10,
121
+
links_max: 10,
122
+
max: 20,
123
+
};
124
+
```
125
+
126
+
## License
127
+
128
+
MIT
+176
crates/atproto-extras/src/bin/atproto-extras-parse-facets.rs
+176
crates/atproto-extras/src/bin/atproto-extras-parse-facets.rs
···
···
1
+
//! Command-line tool for generating AT Protocol facet arrays from text.
2
+
//!
3
+
//! This tool parses a string and outputs the facet array in JSON format.
4
+
//! Facets include mentions (@handle), URLs (https://...), and hashtags (#tag).
5
+
//!
6
+
//! By default, mentions are detected but output with placeholder DIDs. Use
7
+
//! `--resolve-mentions` to resolve handles to actual DIDs (requires network access).
8
+
//!
9
+
//! # Usage
10
+
//!
11
+
//! ```bash
12
+
//! # Parse facets without resolving mentions
13
+
//! cargo run --features clap,cli,hickory-dns --bin atproto-extras-parse-facets -- "Check out https://example.com and #rust"
14
+
//!
15
+
//! # Resolve mentions to DIDs
16
+
//! cargo run --features clap,cli,hickory-dns --bin atproto-extras-parse-facets -- --resolve-mentions "Hello @bsky.app!"
17
+
//! ```
18
+
19
+
use atproto_extras::{FacetLimits, parse_mentions, parse_tags, parse_urls};
20
+
use atproto_identity::resolve::{HickoryDnsResolver, InnerIdentityResolver};
21
+
use atproto_record::lexicon::app::bsky::richtext::facet::{
22
+
ByteSlice, Facet, FacetFeature, Mention,
23
+
};
24
+
use clap::Parser;
25
+
use regex::bytes::Regex;
26
+
use std::sync::Arc;
27
+
28
+
/// Parse text and output AT Protocol facets as JSON.
29
+
#[derive(Parser)]
30
+
#[command(
31
+
name = "atproto-extras-parse-facets",
32
+
version,
33
+
about = "Parse text and output AT Protocol facets as JSON",
34
+
long_about = "This tool parses a string for mentions, URLs, and hashtags,\n\
35
+
then outputs the corresponding AT Protocol facet array in JSON format.\n\n\
36
+
By default, mentions are detected but output with placeholder DIDs.\n\
37
+
Use --resolve-mentions to resolve handles to actual DIDs (requires network)."
38
+
)]
39
+
struct Args {
40
+
/// The text to parse for facets
41
+
text: String,
42
+
43
+
/// Resolve mention handles to DIDs (requires network access)
44
+
#[arg(long)]
45
+
resolve_mentions: bool,
46
+
47
+
/// Show debug information on stderr
48
+
#[arg(long, short = 'd')]
49
+
debug: bool,
50
+
}
51
+
52
+
/// Parse mention spans from text without resolution (returns placeholder DIDs).
53
+
fn parse_mention_spans(text: &str) -> Vec<Facet> {
54
+
let mut facets = Vec::new();
55
+
56
+
// Get URL ranges to exclude mentions within URLs
57
+
let url_facets = parse_urls(text);
58
+
59
+
// Same regex pattern as parse_mentions
60
+
let mention_regex = Regex::new(
61
+
r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)",
62
+
)
63
+
.expect("Invalid mention regex");
64
+
65
+
let text_bytes = text.as_bytes();
66
+
67
+
for capture in mention_regex.captures_iter(text_bytes) {
68
+
if let Some(mention_match) = capture.get(1) {
69
+
let start = mention_match.start();
70
+
let end = mention_match.end();
71
+
72
+
// Check if this mention overlaps with any URL
73
+
let overlaps_url = url_facets.iter().any(|facet| {
74
+
(start >= facet.index.byte_start && start < facet.index.byte_end)
75
+
|| (end > facet.index.byte_start && end <= facet.index.byte_end)
76
+
});
77
+
78
+
if !overlaps_url {
79
+
let handle = std::str::from_utf8(&mention_match.as_bytes()[1..])
80
+
.unwrap_or_default()
81
+
.to_string();
82
+
83
+
facets.push(Facet {
84
+
index: ByteSlice {
85
+
byte_start: start,
86
+
byte_end: end,
87
+
},
88
+
features: vec![FacetFeature::Mention(Mention {
89
+
did: format!("did:plc:<unresolved:{}>", handle),
90
+
})],
91
+
});
92
+
}
93
+
}
94
+
}
95
+
96
+
facets
97
+
}
98
+
99
+
#[tokio::main]
100
+
async fn main() {
101
+
let args = Args::parse();
102
+
let text = &args.text;
103
+
let mut facets: Vec<Facet> = Vec::new();
104
+
let limits = FacetLimits::default();
105
+
106
+
// Parse mentions (either resolved or with placeholders)
107
+
if args.resolve_mentions {
108
+
let http_client = reqwest::Client::new();
109
+
let dns_resolver = HickoryDnsResolver::create_resolver(&[]);
110
+
let resolver = InnerIdentityResolver {
111
+
http_client,
112
+
dns_resolver: Arc::new(dns_resolver),
113
+
plc_hostname: "plc.directory".to_string(),
114
+
};
115
+
let mention_facets = parse_mentions(text, &resolver, &limits).await;
116
+
facets.extend(mention_facets);
117
+
} else {
118
+
let mention_facets = parse_mention_spans(text);
119
+
facets.extend(mention_facets);
120
+
}
121
+
122
+
// Parse URLs
123
+
let url_facets = parse_urls(text);
124
+
facets.extend(url_facets);
125
+
126
+
// Parse hashtags
127
+
let tag_facets = parse_tags(text);
128
+
facets.extend(tag_facets);
129
+
130
+
// Sort facets by byte_start for consistent output
131
+
facets.sort_by_key(|f| f.index.byte_start);
132
+
133
+
// Output as JSON
134
+
if facets.is_empty() {
135
+
println!("null");
136
+
} else {
137
+
match serde_json::to_string_pretty(&facets) {
138
+
Ok(json) => println!("{}", json),
139
+
Err(e) => {
140
+
eprintln!(
141
+
"error-atproto-extras-parse-facets-1 Error serializing facets: {}",
142
+
e
143
+
);
144
+
std::process::exit(1);
145
+
}
146
+
}
147
+
}
148
+
149
+
// Show debug info if requested
150
+
if args.debug {
151
+
eprintln!();
152
+
eprintln!("--- Debug Info ---");
153
+
eprintln!("Input text: {:?}", text);
154
+
eprintln!("Text length: {} bytes", text.len());
155
+
eprintln!("Facets found: {}", facets.len());
156
+
eprintln!("Mentions resolved: {}", args.resolve_mentions);
157
+
158
+
// Show byte slice verification
159
+
let text_bytes = text.as_bytes();
160
+
for (i, facet) in facets.iter().enumerate() {
161
+
let start = facet.index.byte_start;
162
+
let end = facet.index.byte_end;
163
+
let slice_text =
164
+
std::str::from_utf8(&text_bytes[start..end]).unwrap_or("<invalid utf8>");
165
+
let feature_type = match &facet.features[0] {
166
+
FacetFeature::Mention(_) => "mention",
167
+
FacetFeature::Link(_) => "link",
168
+
FacetFeature::Tag(_) => "tag",
169
+
};
170
+
eprintln!(
171
+
" [{}] {} @ bytes {}..{}: {:?}",
172
+
i, feature_type, start, end, slice_text
173
+
);
174
+
}
175
+
}
176
+
}
+942
crates/atproto-extras/src/facets.rs
+942
crates/atproto-extras/src/facets.rs
···
···
1
+
//! Rich text facet parsing for AT Protocol.
2
+
//!
3
+
//! This module provides functionality for extracting semantic annotations (facets)
4
+
//! from plain text. Facets include mentions, links (URLs), and hashtags.
5
+
//!
6
+
//! # Overview
7
+
//!
8
+
//! AT Protocol rich text uses "facets" to annotate specific byte ranges within text with
9
+
//! semantic meaning. This module handles:
10
+
//!
11
+
//! - **Parsing**: Extract mentions, URLs, and hashtags from plain text
12
+
//! - **Facet Creation**: Build proper AT Protocol facet structures with resolved DIDs
13
+
//!
14
+
//! # Byte Offset Calculation
15
+
//!
16
+
//! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol.
17
+
//! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done
18
+
//! using `regex::bytes::Regex` which operates on byte slices and returns byte positions,
19
+
//! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars).
20
+
//!
21
+
//! # Example
22
+
//!
23
+
//! ```ignore
24
+
//! use atproto_extras::facets::{parse_urls, parse_tags, FacetLimits};
25
+
//! use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
26
+
//!
27
+
//! let text = "Check out https://example.com #rust";
28
+
//!
29
+
//! // Parse URLs and tags as Facet objects
30
+
//! let url_facets = parse_urls(text);
31
+
//! let tag_facets = parse_tags(text);
32
+
//!
33
+
//! // Access facet data directly
34
+
//! for facet in url_facets {
35
+
//! if let Some(FacetFeature::Link(link)) = facet.features.first() {
36
+
//! println!("URL at bytes {}..{}: {}",
37
+
//! facet.index.byte_start, facet.index.byte_end, link.uri);
38
+
//! }
39
+
//! }
40
+
//! ```
41
+
42
+
use atproto_identity::resolve::IdentityResolver;
43
+
use atproto_record::lexicon::app::bsky::richtext::facet::{
44
+
ByteSlice, Facet, FacetFeature, Link, Mention, Tag,
45
+
};
46
+
use regex::bytes::Regex;
47
+
48
+
/// Upper bounds applied while extracting facets from text.
///
/// Capping the number of facets bounds the work done on user-generated
/// content: there is one cap per facet kind plus an overall cap.
///
/// # Example
///
/// ```
/// use atproto_extras::FacetLimits;
///
/// // Use defaults
/// let limits = FacetLimits::default();
///
/// // Or customize
/// let custom = FacetLimits {
///     mentions_max: 10,
///     tags_max: 10,
///     links_max: 10,
///     max: 20,
/// };
/// ```
#[derive(Debug, Clone, Copy)]
pub struct FacetLimits {
    /// Cap on mention facets (default: 5)
    pub mentions_max: usize,
    /// Cap on tag facets (default: 5)
    pub tags_max: usize,
    /// Cap on link facets (default: 5)
    pub links_max: usize,
    /// Cap on the total number of facets of all kinds (default: 10)
    pub max: usize,
}

impl Default for FacetLimits {
    /// Returns the default limits: five facets per kind, ten overall.
    fn default() -> Self {
        FacetLimits {
            mentions_max: 5,
            tags_max: 5,
            links_max: 5,
            max: 10,
        }
    }
}
92
+
93
+
/// Parse mentions from text and return them as Facet objects with resolved DIDs.
94
+
///
95
+
/// This function extracts AT Protocol handle mentions (e.g., `@alice.bsky.social`)
96
+
/// from text, resolves each handle to a DID using the provided identity resolver,
97
+
/// and returns AT Protocol Facet objects with Mention features.
98
+
///
99
+
/// Mentions that cannot be resolved to a valid DID are skipped. Mentions that
100
+
/// appear within URLs are also excluded to avoid false positives.
101
+
///
102
+
/// # Arguments
103
+
///
104
+
/// * `text` - The text to parse for mentions
105
+
/// * `identity_resolver` - Resolver for converting handles to DIDs
106
+
/// * `limits` - Configuration for maximum mentions to process
107
+
///
108
+
/// # Returns
109
+
///
110
+
/// A vector of Facet objects for successfully resolved mentions.
111
+
///
112
+
/// # Example
113
+
///
114
+
/// ```ignore
115
+
/// use atproto_extras::{parse_mentions, FacetLimits};
116
+
/// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
117
+
///
118
+
/// let text = "Hello @alice.bsky.social!";
119
+
/// let limits = FacetLimits::default();
120
+
///
121
+
/// // Requires an async context and identity resolver
122
+
/// let facets = parse_mentions(text, &resolver, &limits).await;
123
+
///
124
+
/// for facet in facets {
125
+
/// if let Some(FacetFeature::Mention(mention)) = facet.features.first() {
126
+
/// println!("Mention {} resolved to {}",
127
+
/// &text[facet.index.byte_start..facet.index.byte_end],
128
+
/// mention.did);
129
+
/// }
130
+
/// }
131
+
/// ```
132
+
pub async fn parse_mentions(
133
+
text: &str,
134
+
identity_resolver: &dyn IdentityResolver,
135
+
limits: &FacetLimits,
136
+
) -> Vec<Facet> {
137
+
let mut facets = Vec::new();
138
+
139
+
// First, parse all URLs to exclude mention matches within them
140
+
let url_facets = parse_urls(text);
141
+
142
+
// Regex based on: https://atproto.com/specs/handle#handle-identifier-syntax
143
+
// Pattern: [$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)
144
+
let mention_regex = Regex::new(
145
+
r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)",
146
+
)
147
+
.unwrap();
148
+
149
+
let text_bytes = text.as_bytes();
150
+
let mut mention_count = 0;
151
+
152
+
for capture in mention_regex.captures_iter(text_bytes) {
153
+
if mention_count >= limits.mentions_max {
154
+
break;
155
+
}
156
+
157
+
if let Some(mention_match) = capture.get(1) {
158
+
let start = mention_match.start();
159
+
let end = mention_match.end();
160
+
161
+
// Check if this mention overlaps with any URL
162
+
let overlaps_url = url_facets.iter().any(|facet| {
163
+
// Check if mention is within or overlaps the URL span
164
+
(start >= facet.index.byte_start && start < facet.index.byte_end)
165
+
|| (end > facet.index.byte_start && end <= facet.index.byte_end)
166
+
});
167
+
168
+
// Only process the mention if it doesn't overlap with a URL
169
+
if !overlaps_url {
170
+
let handle = std::str::from_utf8(&mention_match.as_bytes()[1..])
171
+
.unwrap_or_default()
172
+
.to_string();
173
+
174
+
// Try to resolve the handle to a DID
175
+
// First try with at:// prefix, then without
176
+
let at_uri = format!("at://{}", handle);
177
+
let did_result = match identity_resolver.resolve(&at_uri).await {
178
+
Ok(doc) => Ok(doc),
179
+
Err(_) => identity_resolver.resolve(&handle).await,
180
+
};
181
+
182
+
// Only add the mention facet if we successfully resolved the DID
183
+
if let Ok(did_doc) = did_result {
184
+
facets.push(Facet {
185
+
index: ByteSlice {
186
+
byte_start: start,
187
+
byte_end: end,
188
+
},
189
+
features: vec![FacetFeature::Mention(Mention {
190
+
did: did_doc.id.to_string(),
191
+
})],
192
+
});
193
+
mention_count += 1;
194
+
}
195
+
}
196
+
}
197
+
}
198
+
199
+
facets
200
+
}
201
+
202
+
/// Parse URLs from text and return them as Facet objects.
203
+
///
204
+
/// This function extracts HTTP and HTTPS URLs from text with correct
205
+
/// byte position tracking for UTF-8 text, returning AT Protocol Facet objects
206
+
/// with Link features.
207
+
///
208
+
/// # Supported URL Patterns
209
+
///
210
+
/// - HTTP URLs: `http://example.com`
211
+
/// - HTTPS URLs: `https://example.com`
212
+
/// - URLs with paths, query strings, and fragments
213
+
/// - URLs with subdomains: `https://www.example.com`
214
+
///
215
+
/// # Example
216
+
///
217
+
/// ```
218
+
/// use atproto_extras::parse_urls;
219
+
/// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
220
+
///
221
+
/// let text = "Visit https://example.com/path?query=1 for more info";
222
+
/// let facets = parse_urls(text);
223
+
///
224
+
/// assert_eq!(facets.len(), 1);
225
+
/// assert_eq!(facets[0].index.byte_start, 6);
226
+
/// assert_eq!(facets[0].index.byte_end, 38);
227
+
/// if let Some(FacetFeature::Link(link)) = facets[0].features.first() {
228
+
/// assert_eq!(link.uri, "https://example.com/path?query=1");
229
+
/// }
230
+
/// ```
231
+
///
232
+
/// # Multi-byte Character Handling
233
+
///
234
+
/// Byte positions are correctly calculated even with emojis and other
235
+
/// multi-byte UTF-8 characters:
236
+
///
237
+
/// ```
238
+
/// use atproto_extras::parse_urls;
239
+
/// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
240
+
///
241
+
/// let text = "Check out https://example.com now!";
242
+
/// let facets = parse_urls(text);
243
+
/// let text_bytes = text.as_bytes();
244
+
///
245
+
/// // The byte slice matches the URL
246
+
/// let url_bytes = &text_bytes[facets[0].index.byte_start..facets[0].index.byte_end];
247
+
/// assert_eq!(std::str::from_utf8(url_bytes).unwrap(), "https://example.com");
248
+
/// ```
249
+
pub fn parse_urls(text: &str) -> Vec<Facet> {
250
+
let mut facets = Vec::new();
251
+
252
+
// Partial/naive URL regex based on: https://stackoverflow.com/a/3809435
253
+
// Pattern: [$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)
254
+
// Modified to use + instead of {1,6} to support longer TLDs and multi-level subdomains
255
+
let url_regex = Regex::new(
256
+
r"(?:^|[^\w])(https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)"
257
+
).unwrap();
258
+
259
+
let text_bytes = text.as_bytes();
260
+
for capture in url_regex.captures_iter(text_bytes) {
261
+
if let Some(url_match) = capture.get(1) {
262
+
let url = std::str::from_utf8(url_match.as_bytes())
263
+
.unwrap_or_default()
264
+
.to_string();
265
+
266
+
facets.push(Facet {
267
+
index: ByteSlice {
268
+
byte_start: url_match.start(),
269
+
byte_end: url_match.end(),
270
+
},
271
+
features: vec![FacetFeature::Link(Link { uri: url })],
272
+
});
273
+
}
274
+
}
275
+
276
+
facets
277
+
}
278
+
279
+
/// Parse hashtags from text and return them as Facet objects.
280
+
///
281
+
/// This function extracts hashtags (e.g., `#rust`, `#ATProto`) from text,
282
+
/// returning AT Protocol Facet objects with Tag features.
283
+
/// It supports both standard `#` and full-width `#` (U+FF03) hash symbols.
284
+
///
285
+
/// # Tag Syntax
286
+
///
287
+
/// - Tags must start with `#` or `#` (full-width)
288
+
/// - Tag content follows word character rules (`\w`)
289
+
/// - Purely numeric tags (e.g., `#123`) are excluded
290
+
///
291
+
/// # Example
292
+
///
293
+
/// ```
294
+
/// use atproto_extras::parse_tags;
295
+
/// use atproto_record::lexicon::app::bsky::richtext::facet::FacetFeature;
296
+
///
297
+
/// let text = "Learning #rust and #golang today! #100DaysOfCode";
298
+
/// let facets = parse_tags(text);
299
+
///
300
+
/// assert_eq!(facets.len(), 3);
301
+
/// if let Some(FacetFeature::Tag(tag)) = facets[0].features.first() {
302
+
/// assert_eq!(tag.tag, "rust");
303
+
/// }
304
+
/// if let Some(FacetFeature::Tag(tag)) = facets[1].features.first() {
305
+
/// assert_eq!(tag.tag, "golang");
306
+
/// }
307
+
/// if let Some(FacetFeature::Tag(tag)) = facets[2].features.first() {
308
+
/// assert_eq!(tag.tag, "100DaysOfCode");
309
+
/// }
310
+
/// ```
311
+
///
312
+
/// # Numeric Tags
313
+
///
314
+
/// Purely numeric tags are excluded:
315
+
///
316
+
/// ```
317
+
/// use atproto_extras::parse_tags;
318
+
///
319
+
/// let text = "Item #42 is special";
320
+
/// let facets = parse_tags(text);
321
+
///
322
+
/// // #42 is not extracted because it's purely numeric
323
+
/// assert_eq!(facets.len(), 0);
324
+
/// ```
325
+
pub fn parse_tags(text: &str) -> Vec<Facet> {
326
+
let mut facets = Vec::new();
327
+
328
+
// Regex based on: https://github.com/bluesky-social/atproto/blob/d91988fe79030b61b556dd6f16a46f0c3b9d0b44/packages/api/src/rich-text/util.ts
329
+
// Simplified for Rust - matches hashtags at word boundaries
330
+
// Pattern matches: start of string or non-word char, then # or #, then tag content
331
+
let tag_regex = Regex::new(r"(?:^|[^\w])([#\xEF\xBC\x83])([\w]+(?:[\w]*)*)").unwrap();
332
+
333
+
let text_bytes = text.as_bytes();
334
+
335
+
// Work with bytes for proper position tracking
336
+
for capture in tag_regex.captures_iter(text_bytes) {
337
+
if let (Some(full_match), Some(hash_match), Some(tag_match)) =
338
+
(capture.get(0), capture.get(1), capture.get(2))
339
+
{
340
+
// Calculate the absolute byte position of the hash symbol
341
+
// The full match includes the preceding character (if any)
342
+
// so we need to adjust for that
343
+
let match_start = full_match.start();
344
+
let hash_offset = hash_match.start() - full_match.start();
345
+
let start = match_start + hash_offset;
346
+
let end = match_start + hash_offset + hash_match.len() + tag_match.len();
347
+
348
+
// Extract just the tag text (without the hash symbol)
349
+
let tag = std::str::from_utf8(tag_match.as_bytes()).unwrap_or_default();
350
+
351
+
// Only include tags that are not purely numeric
352
+
if !tag.chars().all(|c| c.is_ascii_digit()) {
353
+
facets.push(Facet {
354
+
index: ByteSlice {
355
+
byte_start: start,
356
+
byte_end: end,
357
+
},
358
+
features: vec![FacetFeature::Tag(Tag {
359
+
tag: tag.to_string(),
360
+
})],
361
+
});
362
+
}
363
+
}
364
+
}
365
+
366
+
facets
367
+
}
368
+
369
+
/// Parse facets from text and return a vector of Facet objects.
370
+
///
371
+
/// This function extracts mentions, URLs, and hashtags from the provided text
372
+
/// and creates AT Protocol facets with proper byte indices.
373
+
///
374
+
/// Mentions are resolved to actual DIDs using the provided identity resolver.
375
+
/// If a handle cannot be resolved to a DID, the mention facet is skipped.
376
+
///
377
+
/// # Arguments
378
+
///
379
+
/// * `text` - The text to extract facets from
380
+
/// * `identity_resolver` - Resolver for converting handles to DIDs
381
+
/// * `limits` - Configuration for maximum facets per type and total
382
+
///
383
+
/// # Returns
384
+
///
385
+
/// Optional vector of facets. Returns `None` if no facets were found.
386
+
///
387
+
/// # Example
388
+
///
389
+
/// ```ignore
390
+
/// use atproto_extras::{parse_facets_from_text, FacetLimits};
391
+
///
392
+
/// let text = "Hello @alice.bsky.social! Check #rust at https://rust-lang.org";
393
+
/// let limits = FacetLimits::default();
394
+
///
395
+
/// // Requires an async context and identity resolver
396
+
/// let facets = parse_facets_from_text(text, &resolver, &limits).await;
397
+
///
398
+
/// if let Some(facets) = facets {
399
+
/// for facet in &facets {
400
+
/// println!("Facet at {}..{}", facet.index.byte_start, facet.index.byte_end);
401
+
/// }
402
+
/// }
403
+
/// ```
404
+
///
405
+
/// # Mention Resolution
406
+
///
407
+
/// Mentions are only included if the handle resolves to a valid DID:
408
+
///
409
+
/// ```ignore
410
+
/// let text = "@valid.handle.com and @invalid.handle.xyz";
411
+
/// let facets = parse_facets_from_text(text, &resolver, &limits).await;
412
+
///
413
+
/// // Only @valid.handle.com appears as a facet if @invalid.handle.xyz
414
+
/// // cannot be resolved to a DID
415
+
/// ```
416
+
pub async fn parse_facets_from_text(
417
+
text: &str,
418
+
identity_resolver: &dyn IdentityResolver,
419
+
limits: &FacetLimits,
420
+
) -> Option<Vec<Facet>> {
421
+
let mut facets = Vec::new();
422
+
423
+
// Parse mentions (already limited by mentions_max in parse_mentions)
424
+
let mention_facets = parse_mentions(text, identity_resolver, limits).await;
425
+
facets.extend(mention_facets);
426
+
427
+
// Parse URLs (limited by links_max)
428
+
let url_facets = parse_urls(text);
429
+
for (idx, facet) in url_facets.into_iter().enumerate() {
430
+
if idx >= limits.links_max {
431
+
break;
432
+
}
433
+
facets.push(facet);
434
+
}
435
+
436
+
// Parse hashtags (limited by tags_max)
437
+
let tag_facets = parse_tags(text);
438
+
for (idx, facet) in tag_facets.into_iter().enumerate() {
439
+
if idx >= limits.tags_max {
440
+
break;
441
+
}
442
+
facets.push(facet);
443
+
}
444
+
445
+
// Apply global facet limit (truncate if exceeds max)
446
+
if facets.len() > limits.max {
447
+
facets.truncate(limits.max);
448
+
}
449
+
450
+
// Only return facets if we found any
451
+
if !facets.is_empty() {
452
+
Some(facets)
453
+
} else {
454
+
None
455
+
}
456
+
}
457
+
458
+
#[cfg(test)]
459
+
mod tests {
460
+
use async_trait::async_trait;
461
+
use atproto_identity::model::Document;
462
+
use std::collections::HashMap;
463
+
464
+
use super::*;
465
+
466
+
/// In-memory stand-in for an identity resolver, used by the tests below.
///
/// Every known handle is stored under two keys: the bare handle and the
/// same handle prefixed with `at://`.
struct MockIdentityResolver {
    /// Lookup table from handle (bare or `at://`-prefixed) to DID.
    handles_to_dids: HashMap<String, String>,
}

impl MockIdentityResolver {
    /// Creates a resolver that already knows `alice.bsky.social`.
    fn new() -> Self {
        let mut resolver = Self {
            handles_to_dids: HashMap::new(),
        };
        resolver.add_identity("alice.bsky.social", "did:plc:alice123");
        resolver
    }

    /// Registers `did` for `handle` under both the bare and `at://` keys.
    fn add_identity(&mut self, handle: &str, did: &str) {
        for key in [handle.to_owned(), format!("at://{handle}")] {
            self.handles_to_dids.insert(key, did.to_owned());
        }
    }
}
492
+
493
+
#[async_trait]
494
+
impl IdentityResolver for MockIdentityResolver {
495
+
async fn resolve(&self, handle: &str) -> anyhow::Result<Document> {
496
+
let handle_key = handle.to_string();
497
+
498
+
if let Some(did) = self.handles_to_dids.get(&handle_key) {
499
+
Ok(Document {
500
+
context: vec![],
501
+
id: did.clone(),
502
+
also_known_as: vec![format!("at://{}", handle_key.trim_start_matches("at://"))],
503
+
verification_method: vec![],
504
+
service: vec![],
505
+
extra: HashMap::new(),
506
+
})
507
+
} else {
508
+
Err(anyhow::anyhow!("Handle not found"))
509
+
}
510
+
}
511
+
}
512
+
513
+
#[tokio::test]
514
+
async fn test_parse_facets_from_text_comprehensive() {
515
+
let mut resolver = MockIdentityResolver::new();
516
+
resolver.add_identity("bob.test.com", "did:plc:bob456");
517
+
518
+
let limits = FacetLimits::default();
519
+
let text = "Join @alice.bsky.social and @bob.test.com at https://example.com #rust #golang";
520
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
521
+
522
+
assert!(facets.is_some());
523
+
let facets = facets.unwrap();
524
+
assert_eq!(facets.len(), 5); // 2 mentions, 1 URL, 2 hashtags
525
+
526
+
// Check first mention
527
+
assert_eq!(facets[0].index.byte_start, 5);
528
+
assert_eq!(facets[0].index.byte_end, 23);
529
+
if let FacetFeature::Mention(ref mention) = facets[0].features[0] {
530
+
assert_eq!(mention.did, "did:plc:alice123");
531
+
} else {
532
+
panic!("Expected Mention feature");
533
+
}
534
+
535
+
// Check second mention
536
+
assert_eq!(facets[1].index.byte_start, 28);
537
+
assert_eq!(facets[1].index.byte_end, 41);
538
+
if let FacetFeature::Mention(mention) = &facets[1].features[0] {
539
+
assert_eq!(mention.did, "did:plc:bob456");
540
+
} else {
541
+
panic!("Expected Mention feature");
542
+
}
543
+
544
+
// Check URL
545
+
assert_eq!(facets[2].index.byte_start, 45);
546
+
assert_eq!(facets[2].index.byte_end, 64);
547
+
if let FacetFeature::Link(link) = &facets[2].features[0] {
548
+
assert_eq!(link.uri, "https://example.com");
549
+
} else {
550
+
panic!("Expected Link feature");
551
+
}
552
+
553
+
// Check first hashtag
554
+
assert_eq!(facets[3].index.byte_start, 65);
555
+
assert_eq!(facets[3].index.byte_end, 70);
556
+
if let FacetFeature::Tag(tag) = &facets[3].features[0] {
557
+
assert_eq!(tag.tag, "rust");
558
+
} else {
559
+
panic!("Expected Tag feature");
560
+
}
561
+
562
+
// Check second hashtag
563
+
assert_eq!(facets[4].index.byte_start, 71);
564
+
assert_eq!(facets[4].index.byte_end, 78);
565
+
if let FacetFeature::Tag(tag) = &facets[4].features[0] {
566
+
assert_eq!(tag.tag, "golang");
567
+
} else {
568
+
panic!("Expected Tag feature");
569
+
}
570
+
}
571
+
572
+
#[tokio::test]
573
+
async fn test_parse_facets_from_text_with_unresolvable_mention() {
574
+
let resolver = MockIdentityResolver::new();
575
+
let limits = FacetLimits::default();
576
+
577
+
// Only alice.bsky.social is in the resolver, not unknown.handle.com
578
+
let text = "Contact @unknown.handle.com for details #rust";
579
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
580
+
581
+
assert!(facets.is_some());
582
+
let facets = facets.unwrap();
583
+
// Should only have 1 facet (the hashtag) since the mention couldn't be resolved
584
+
assert_eq!(facets.len(), 1);
585
+
586
+
// Check that it's the hashtag facet
587
+
if let FacetFeature::Tag(tag) = &facets[0].features[0] {
588
+
assert_eq!(tag.tag, "rust");
589
+
} else {
590
+
panic!("Expected Tag feature");
591
+
}
592
+
}
593
+
594
+
#[tokio::test]
595
+
async fn test_parse_facets_from_text_empty() {
596
+
let resolver = MockIdentityResolver::new();
597
+
let limits = FacetLimits::default();
598
+
let text = "No mentions, URLs, or hashtags here";
599
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
600
+
assert!(facets.is_none());
601
+
}
602
+
603
+
#[tokio::test]
604
+
async fn test_parse_facets_from_text_url_with_at_mention() {
605
+
let resolver = MockIdentityResolver::new();
606
+
let limits = FacetLimits::default();
607
+
608
+
// URLs with @ should not create mention facets
609
+
let text = "Tangled https://tangled.org/@smokesignal.events";
610
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
611
+
612
+
assert!(facets.is_some());
613
+
let facets = facets.unwrap();
614
+
615
+
// Should have exactly 1 facet (the URL), not 2 (URL + mention)
616
+
assert_eq!(
617
+
facets.len(),
618
+
1,
619
+
"Expected 1 facet (URL only), got {}",
620
+
facets.len()
621
+
);
622
+
623
+
// Verify it's a link facet, not a mention
624
+
if let FacetFeature::Link(link) = &facets[0].features[0] {
625
+
assert_eq!(link.uri, "https://tangled.org/@smokesignal.events");
626
+
} else {
627
+
panic!("Expected Link feature, got Mention or Tag instead");
628
+
}
629
+
}
630
+
631
+
#[tokio::test]
632
+
async fn test_parse_facets_with_mention_limit() {
633
+
let mut resolver = MockIdentityResolver::new();
634
+
resolver.add_identity("bob.test.com", "did:plc:bob456");
635
+
resolver.add_identity("charlie.test.com", "did:plc:charlie789");
636
+
637
+
// Limit to 2 mentions
638
+
let limits = FacetLimits {
639
+
mentions_max: 2,
640
+
tags_max: 5,
641
+
links_max: 5,
642
+
max: 10,
643
+
};
644
+
645
+
let text = "Join @alice.bsky.social @bob.test.com @charlie.test.com";
646
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
647
+
648
+
assert!(facets.is_some());
649
+
let facets = facets.unwrap();
650
+
// Should only have 2 mentions (alice and bob), charlie should be skipped
651
+
assert_eq!(facets.len(), 2);
652
+
653
+
// Verify they're both mentions
654
+
for facet in &facets {
655
+
assert!(matches!(facet.features[0], FacetFeature::Mention(_)));
656
+
}
657
+
}
658
+
659
+
#[tokio::test]
660
+
async fn test_parse_facets_with_global_limit() {
661
+
let mut resolver = MockIdentityResolver::new();
662
+
resolver.add_identity("bob.test.com", "did:plc:bob456");
663
+
664
+
// Very restrictive global limit
665
+
let limits = FacetLimits {
666
+
mentions_max: 5,
667
+
tags_max: 5,
668
+
links_max: 5,
669
+
max: 3, // Only allow 3 total facets
670
+
};
671
+
672
+
let text =
673
+
"Join @alice.bsky.social @bob.test.com at https://example.com #rust #golang #python";
674
+
let facets = parse_facets_from_text(text, &resolver, &limits).await;
675
+
676
+
assert!(facets.is_some());
677
+
let facets = facets.unwrap();
678
+
// Should be truncated to 3 facets total
679
+
assert_eq!(facets.len(), 3);
680
+
}
681
+
682
+
#[test]
fn test_parse_urls_multiple_links() {
    let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";

    let facets = parse_urls(text);

    // Both URLs must be found, separated by several paragraphs of text.
    let found = facets.len();
    assert_eq!(found, 2, "Expected 2 URLs but found {}", found);

    // Expected link targets, in order of appearance.
    let expected_uris = [
        "https://www.ietf.org/meeting/124/",
        "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164",
    ];
    for (facet, expected) in facets.iter().zip(expected_uris) {
        match facet.features.first() {
            Some(FacetFeature::Link(link)) => assert_eq!(link.uri, expected),
            _ => panic!("Expected Link feature"),
        }
    }
}
713
+
714
+
/// URL detection must still find both links when the surrounding text
/// contains an HTML-escaped ampersand (`&amp;`).
#[test]
fn test_parse_urls_with_html_entity() {
    // Same text as `test_parse_urls_multiple_links`, except that the ampersand
    // is written as the HTML entity `&amp;` instead of a bare `&`.
    let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd &amp; Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";

    let facets = parse_urls(text);

    // Should find both URLs
    assert_eq!(
        facets.len(),
        2,
        "Expected 2 URLs but found {}",
        facets.len()
    );

    // Check first URL
    match facets[0].features.first() {
        Some(FacetFeature::Link(link)) => {
            assert_eq!(link.uri, "https://www.ietf.org/meeting/124/");
        }
        _ => panic!("Expected Link feature"),
    }

    // Check second URL
    match facets[1].features.first() {
        Some(FacetFeature::Link(link)) => assert_eq!(
            link.uri,
            "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"
        ),
        _ => panic!("Expected Link feature"),
    }
}
746
+
747
+
/// Demonstrates that HTML-escaping text shifts the byte offsets of any facet
/// that follows the escaped character.
#[test]
fn test_byte_offset_with_html_entities() {
    // The byte positions shift:
    //   In original: '&'     is at byte 8 (1 byte)
    //   In escaped:  '&amp;' starts at byte 8 (5 bytes)
    // Facet byte offsets computed on one form are therefore misaligned when
    // applied to the other, e.g. if text is escaped before rendering.

    // A URL placed after the ampersand makes the shift observable.
    let original_with_url = "Nov 3rd & Tuesday https://example.com";
    let escaped_with_url = "Nov 3rd &amp; Tuesday https://example.com";

    // Parse URLs from both versions
    let original_facets = parse_urls(original_with_url);
    let escaped_facets = parse_urls(escaped_with_url);

    // Both should find the URL, but at different byte positions
    assert_eq!(original_facets.len(), 1);
    assert_eq!(escaped_facets.len(), 1);

    // "Nov 3rd & Tuesday " is 18 bytes long.
    assert_eq!(original_facets[0].index.byte_start, 18);
    // "Nov 3rd &amp; Tuesday " is 22 bytes long (4 extra bytes for `&amp;`).
    assert_eq!(escaped_facets[0].index.byte_start, 22);
}
771
+
772
+
/// Checks URL byte offsets against real AT Protocol record description text
/// that contains multi-byte characters (the sparkle emoji) before the URLs.
#[test]
fn test_parse_urls_from_atproto_record_text() {
    let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/";

    let facets = parse_urls(text);

    assert_eq!(facets.len(), 2, "Should find 2 URLs");

    // First URL: https://stream.place/psingletary.com
    assert_eq!(facets[0].index.byte_start, 221);
    assert_eq!(facets[0].index.byte_end, 257);
    match facets[0].features.first() {
        Some(FacetFeature::Link(link)) => {
            assert_eq!(link.uri, "https://stream.place/psingletary.com");
        }
        // Fail loudly instead of silently skipping the URI check.
        _ => panic!("Expected Link feature"),
    }

    // Second URL: https://atprotocalls.leaflet.pub/
    assert_eq!(facets[1].index.byte_start, 290);
    assert_eq!(facets[1].index.byte_end, 323);
    match facets[1].features.first() {
        Some(FacetFeature::Link(link)) => {
            assert_eq!(link.uri, "https://atprotocalls.leaflet.pub/");
        }
        _ => panic!("Expected Link feature"),
    }

    // Verify the byte slices match the expected text
    let text_bytes = text.as_bytes();
    assert_eq!(
        std::str::from_utf8(&text_bytes[221..257]).unwrap(),
        "https://stream.place/psingletary.com"
    );
    assert_eq!(
        std::str::from_utf8(&text_bytes[290..323]).unwrap(),
        "https://atprotocalls.leaflet.pub/"
    );
}
807
+
808
+
#[tokio::test]
809
+
async fn test_parse_mentions_basic() {
810
+
let resolver = MockIdentityResolver::new();
811
+
let limits = FacetLimits::default();
812
+
let text = "Hello @alice.bsky.social!";
813
+
let facets = parse_mentions(text, &resolver, &limits).await;
814
+
815
+
assert_eq!(facets.len(), 1);
816
+
assert_eq!(facets[0].index.byte_start, 6);
817
+
assert_eq!(facets[0].index.byte_end, 24);
818
+
if let Some(FacetFeature::Mention(mention)) = facets[0].features.first() {
819
+
assert_eq!(mention.did, "did:plc:alice123");
820
+
} else {
821
+
panic!("Expected Mention feature");
822
+
}
823
+
}
824
+
825
+
#[tokio::test]
826
+
async fn test_parse_mentions_multiple() {
827
+
let mut resolver = MockIdentityResolver::new();
828
+
resolver.add_identity("bob.example.com", "did:plc:bob456");
829
+
let limits = FacetLimits::default();
830
+
let text = "CC @alice.bsky.social and @bob.example.com";
831
+
let facets = parse_mentions(text, &resolver, &limits).await;
832
+
833
+
assert_eq!(facets.len(), 2);
834
+
if let Some(FacetFeature::Mention(mention)) = facets[0].features.first() {
835
+
assert_eq!(mention.did, "did:plc:alice123");
836
+
}
837
+
if let Some(FacetFeature::Mention(mention)) = facets[1].features.first() {
838
+
assert_eq!(mention.did, "did:plc:bob456");
839
+
}
840
+
}
841
+
842
+
#[tokio::test]
843
+
async fn test_parse_mentions_unresolvable() {
844
+
let resolver = MockIdentityResolver::new();
845
+
let limits = FacetLimits::default();
846
+
// unknown.handle.com is not in the resolver
847
+
let text = "Hello @unknown.handle.com!";
848
+
let facets = parse_mentions(text, &resolver, &limits).await;
849
+
850
+
// Should be empty since the handle can't be resolved
851
+
assert_eq!(facets.len(), 0);
852
+
}
853
+
854
+
#[tokio::test]
855
+
async fn test_parse_mentions_in_url_excluded() {
856
+
let resolver = MockIdentityResolver::new();
857
+
let limits = FacetLimits::default();
858
+
// The @smokesignal.events is inside a URL and should not be parsed as a mention
859
+
let text = "Check https://tangled.org/@smokesignal.events";
860
+
let facets = parse_mentions(text, &resolver, &limits).await;
861
+
862
+
// Should be empty since the mention is inside a URL
863
+
assert_eq!(facets.len(), 0);
864
+
}
865
+
866
+
#[test]
fn test_parse_tags_basic() {
    let facets = parse_tags("Learning #rust today!");
    assert_eq!(facets.len(), 1);

    // The span covers "#rust", including the leading '#'.
    let tag_facet = &facets[0];
    assert_eq!(tag_facet.index.byte_start, 9);
    assert_eq!(tag_facet.index.byte_end, 14);

    // The stored tag value omits the '#'.
    match tag_facet.features.first() {
        Some(FacetFeature::Tag(tag)) => assert_eq!(tag.tag, "rust"),
        _ => panic!("Expected Tag feature"),
    }
}
880
+
881
+
/// Several hashtags in one text are all found, in order of appearance.
#[test]
fn test_parse_tags_multiple() {
    let text = "#rust #golang #python are great!";
    let facets = parse_tags(text);

    assert_eq!(facets.len(), 3);

    // A non-tag feature is a failure rather than a silently skipped check.
    let expected_tags = ["rust", "golang", "python"];
    for (facet, expected) in facets.iter().zip(expected_tags) {
        match facet.features.first() {
            Some(FacetFeature::Tag(tag)) => assert_eq!(tag.tag, expected),
            _ => panic!("Expected Tag feature"),
        }
    }
}
897
+
898
+
/// Purely numeric hashtags such as `#42` are not treated as tags.
#[test]
fn test_parse_tags_excludes_numeric() {
    let text = "Item #42 is special #test123";
    let facets = parse_tags(text);

    // #42 should be excluded (purely numeric), #test123 should be included
    assert_eq!(facets.len(), 1);
    match facets[0].features.first() {
        Some(FacetFeature::Tag(tag)) => assert_eq!(tag.tag, "test123"),
        // Fail loudly instead of silently skipping the tag check.
        _ => panic!("Expected Tag feature"),
    }
}
909
+
910
+
/// A single URL in plain text is found with the correct byte span.
#[test]
fn test_parse_urls_basic() {
    let text = "Visit https://example.com today!";
    let facets = parse_urls(text);

    assert_eq!(facets.len(), 1);
    // The span covers exactly "https://example.com".
    assert_eq!(facets[0].index.byte_start, 6);
    assert_eq!(facets[0].index.byte_end, 25);
    match facets[0].features.first() {
        Some(FacetFeature::Link(link)) => assert_eq!(link.uri, "https://example.com"),
        // Fail loudly instead of silently skipping the URI check.
        _ => panic!("Expected Link feature"),
    }
}
922
+
923
+
/// Path, query string and fragment are all kept as part of the link URI.
#[test]
fn test_parse_urls_with_path() {
    let text = "Check https://example.com/path/to/page?query=1#section";
    let facets = parse_urls(text);

    assert_eq!(facets.len(), 1);
    match facets[0].features.first() {
        Some(FacetFeature::Link(link)) => {
            assert_eq!(link.uri, "https://example.com/path/to/page?query=1#section");
        }
        // Fail loudly instead of silently skipping the URI check.
        _ => panic!("Expected Link feature"),
    }
}
933
+
934
+
#[test]
fn test_facet_limits_default() {
    // Pin the default limits: 5 per facet kind, 10 overall.
    let FacetLimits {
        mentions_max,
        tags_max,
        links_max,
        max,
    } = FacetLimits::default();

    assert_eq!(mentions_max, 5);
    assert_eq!(tags_max, 5);
    assert_eq!(links_max, 5);
    assert_eq!(max, 10);
}
942
+
}
+50
crates/atproto-extras/src/lib.rs
+50
crates/atproto-extras/src/lib.rs
···
···
1
+
//! Extra utilities for AT Protocol applications.
2
+
//!
3
+
//! This crate provides additional utilities that complement the core AT Protocol
4
+
//! identity and record crates. Currently, it focuses on rich text facet parsing.
5
+
//!
6
+
//! ## Features
7
+
//!
8
+
//! - **Facet Parsing**: Extract mentions, URLs, and hashtags from plain text
9
+
//! with correct UTF-8 byte offset calculation
10
+
//! - **Identity Integration**: Resolve mention handles to DIDs during parsing
11
+
//!
12
+
//! ## Example
13
+
//!
14
+
//! ```ignore
15
+
//! use atproto_extras::{parse_facets_from_text, FacetLimits};
16
+
//!
17
+
//! // Parse facets from text (requires an IdentityResolver)
18
+
//! let text = "Hello @alice.bsky.social! Check out https://example.com #rust";
19
+
//! let limits = FacetLimits::default();
20
+
//! let facets = parse_facets_from_text(text, &resolver, &limits).await;
21
+
//! ```
22
+
//!
23
+
//! ## Byte Offset Calculation
24
+
//!
25
+
//! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol.
26
+
//! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done
27
+
//! using `regex::bytes::Regex` which operates on byte slices and returns byte positions,
28
+
//! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars).
29
+
30
+
#![forbid(unsafe_code)]
31
+
#![warn(missing_docs)]
32
+
33
+
/// Rich text facet parsing for AT Protocol.
34
+
///
35
+
/// This module provides functionality for extracting semantic annotations (facets)
36
+
/// from plain text. Facets include:
37
+
///
38
+
/// - **Mentions**: User handles prefixed with `@` (e.g., `@alice.bsky.social`)
39
+
/// - **Links**: HTTP/HTTPS URLs
40
+
/// - **Tags**: Hashtags prefixed with `#` or the full-width `＃` (e.g., `#rust`)
41
+
///
42
+
/// ## Byte Offsets
43
+
///
44
+
/// All facet indices use UTF-8 byte offsets, not character indices. This is
45
+
/// critical for correct handling of multi-byte characters like emojis or
46
+
/// non-ASCII text.
47
+
pub mod facets;
48
+
49
+
/// Re-export commonly used types for convenience.
50
+
pub use facets::{FacetLimits, parse_facets_from_text, parse_mentions, parse_tags, parse_urls};