use fluent_uri::{Uri, UriRef}; use std::sync::LazyLock; static BASE: LazyLock> = LazyLock::new(|| Uri::parse("https://example.com").unwrap()); // normalizing is a bit opinionated but eh /// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme /// this parser is intentinonally lax: it should accept all valid at-uris, and /// may accept some invalid at-uris. /// /// at the moment this implementation is quite bad and incomplete pub fn parse_at_uri(s: &str) -> Option { // for now, just working through the rules laid out in the docs in order, // without much regard for efficiency for now. // The overall URI is restricted to a subset of ASCII characters if !s.is_ascii() { return None; } // Maximum overall length is 8 kilobytes (which may be shortened in the future) if s.len() > (8 * 2_usize.pow(10)) { return None; } // Hex-encoding of characters is permitted (but in practice not necessary) // -> decode any unreserved characters. from rfc 3986: // -> For consistency, percent-encoded octets in the ranges of ALPHA // -> (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), // -> underscore (%5F), or tilde (%7E) should not be created by URI // -> producers and, when found in a URI, should be decoded to their // -> corresponding unreserved characters by URI normalizers. let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') { let mut out = String::with_capacity(s.len()); out.push_str(unencoded_prefix); for segment in rest.split('%') { let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else { return None; // bail: % must always be followed by 2 hex digits }; let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else { return None; // bail: % must be followed by decodable hex }; if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') { out.push(decoded); } else { out.push('%'); out.push_str(&hex2.to_ascii_uppercase()); // norm } out.push_str(unencoded_suffix); } out } else { s.to_string() }; // The URI scheme is `at`, and an authority part preceded with double slashes is always // required, so the URI always starts at:// // -> the spec doesn't explicitly say, but uri schemes can be case-insensitive? let (proto, rest) = s.split_at_checked(5)?; if !proto.eq_ignore_ascii_case("at://") { return None; } // An authority section is required and must be non-empty. the authority can be either an // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the // authority part can not be interpreted as a host:port pair, because of the use of colon // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs, // but other reserved characters (including #, /, $, &, @) must be escaped. // Note that none of the current "blessed" DID methods for atproto allow these // characters in DID identifiers // An optional path section may follow the authority. The path may contain multiple segments // separated by a single slash (/). Generic URI path normalization rules may be used. // An optional query part is allowed, following generic URI syntax restrictions // An optional fragment part is allowed, using JSON Path syntax // -> work backwards from fragment, query, path -> authority let mut base = rest; let (mut fragment, mut query, mut path) = (None, None, None); if let Some((pre, f)) = base.split_once('#') { base = pre; fragment = Some(f); } if let Some((pre, q)) = base.split_once('?') { base = pre; query = Some(q); } if let Some((pre, p)) = base.split_once('/') { base = pre; path = Some(p); } let mut authority = base.to_string(); if authority.is_empty() { return None; } // Normalization: Authority as handle: lowercased if !authority.starts_with("did:") { // lowercase handles authority.make_ascii_lowercase(); } // Normalization: No trailing slashes in path part // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example) // -> be so lazy let path = match path { Some(p) => { let p = p.trim_end_matches('/'); let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE let normalized = resolved.normalize().path().to_string(); let without_trailing_slashes = normalized.trim_end_matches('/'); Some(without_trailing_slashes.to_string()) } None => None, }; let mut out = format!("at://{authority}"); if let Some(p) = path { // no need for `/` -- it's added by fluent_uri normalization out.push_str(&p); } if let Some(q) = query { out.push('?'); out.push_str(q); } if let Some(f) = fragment { out.push('#'); out.push_str(f); } Some(out) // there's a more normalization to do still. ugh. } pub fn at_uri_collection(at_uri: &str) -> Option { let (proto, rest) = at_uri.split_at_checked(5)?; if !proto.eq_ignore_ascii_case("at://") { return None; } let (_did, rest) = rest.split_once('/')?; if let Some((collection, _path_rest)) = rest.split_once('/') { return Some(collection.to_string()); } if let Some((collection, _query_rest)) = rest.split_once('?') { return Some(collection.to_string()); } if let Some((collection, _hash_rest)) = rest.split_once('#') { return Some(collection.to_string()); } Some(rest.to_string()) } #[cfg(test)] mod tests { use super::*; #[test] fn test_at_uri_parse() { for (case, expected, detail) in vec![ ("", None, "empty"), (" ", None, "whitespace"), ("https://bad-example.com", None, "not at scheme"), ("at://µcosm.bad-example.com", None, "not ascii"), ( "at://bad-example.com", Some("at://bad-example.com"), "handle, authority-only", ), ( "at://did:plc:hdhoaan3xa3jiuq4fg4mefid", Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"), "DID, authority-only", ), ( "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26", Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"), "bsky post (handle)", ), ( "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27", Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"), "bsky post (DID)", ), ( "AT://bad-example.com", Some("at://bad-example.com"), "scheme case is normalized", ), ( "at://bad-example.com", Some("at://bad-example.com"), "scheme case is normalized", ), ( "at://bad-example.com?q=z", Some("at://bad-example.com?q=z"), "query is allowed", ), ( "at://bad-example.com#a", Some("at://bad-example.com#a"), "fragment is allowed", ), ( "at://bad-example.com/%", None, "invalid percent-encoding: ends with %", ), ( "at://bad-example.com/%2", None, "invalid percent-encoding: ends with only one digit after %", ), ( "at://bad-example.com/%ZZ", None, "invalid percent-encoding: non-hex after %", ), ( "at://bad-example.com/%3A", Some("at://bad-example.com/%3A"), "valid percent-encoding is left", ), ( "at://bad-example.com/%3a", Some("at://bad-example.com/%3A"), "valid percent-encoding is hex-uppercased", ), ( "at://bad-example.com/%61/%62", Some("at://bad-example.com/a/b"), "unreserved characters are percent-decoded", ), ( "at://bad-example.com/a/../b", Some("at://bad-example.com/b"), "paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing ), ( "at://bad-example.com/../", Some("at://bad-example.com"), "paths always have trailing slashes removed", ), ] { assert_eq!( parse_at_uri(case), expected.map(|s| s.to_string()), "{detail}" ); } } #[test] fn test_at_uri_collection() { for (case, expected, detail) in vec![ ("", None, "empty"), ("at://did:plc:vc7f4oafdgxsihk4cry2xpze", None, "did only"), ( "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion", Some("collec.tion"), "no path (weird)", ), ( "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path", Some("collec.tion"), "normal at-uri", ), ( "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion?query", Some("collec.tion"), "colleciton with query", ), ( "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion#hash", Some("collec.tion"), "colleciton with hash", ), ( "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path?query#hash", Some("collec.tion"), "colleciton with everything", ), ( "at://did:web:example.com/collec.tion/path", Some("collec.tion"), "did:web", ), ( "at://did:web:example.com/col.lec.tio.ns.so.long.going.on.and.on", Some("col.lec.tio.ns.so.long.going.on.and.on"), "long collection", ), ] { assert_eq!( at_uri_collection(case), expected.map(|s| s.to_string()), "{detail}" ); } } }