Live video on the AT Protocol
at next 131 lines 3.5 kB view raw
// Taken from ATcute's richtext-segmenter
// https://github.com/mary-ext/atcute/blob/trunk/packages/bluesky/richtext-segmenter/lib/index.ts
// Copied into this repo because we need to import types from @atproto/api, not @atcute/bsky.
import type { Main } from "@atproto/api/dist/client/types/app/bsky/richtext/facet";

/** Extracts the element type of an array type (`never` for non-arrays). */
type UnwrapArray<T> = T extends (infer V)[] ? V : never;

/** A richtext facet as declared by `@atproto/api`. */
export type Facet = Main;
/** A single feature entry of a facet's `features` array. */
export type FacetFeature = UnwrapArray<Facet["features"]>;

/** A contiguous slice of text together with the facet features applied to it. */
export interface RichtextSegment {
  /** The slice of the original text covered by this segment. */
  text: string;
  /** Features applied to the slice, or `undefined` for plain text. */
  features: FacetFeature[] | undefined;
}

/**
 * Builds a segment. Features are dropped when `text` is empty so that an
 * empty segment never carries features.
 */
const segment = (
  text: string,
  features: FacetFeature[] | undefined,
): RichtextSegment => ({
  text,
  features: text.length > 0 ? features : undefined,
});

/**
 * Splits `text` into consecutive segments according to `facets`.
 *
 * Facet indices (`byteStart` / `byteEnd`) are treated as UTF-8 byte offsets,
 * while JavaScript strings are indexed in UTF-16 code units; this function
 * walks both cursors in lockstep to translate between them.
 *
 * Facets are visited in array order and the cursors only move forward, so
 * facets are presumably expected to be sorted by `byteStart` and
 * non-overlapping (NOTE(review): confirm against callers). A facet that ends
 * at or before the current cursor contributes no segment.
 *
 * @param text - The full text to split.
 * @param facets - Facets describing annotated byte ranges; may be `undefined`.
 * @returns Segments in text order. Concatenating every `text` yields the
 *   original string. When there are no facets, a single plain segment is
 *   returned (even for empty text).
 */
export const segmentize = (
  text: string,
  facets: Facet[] | undefined,
): RichtextSegment[] => {
  if (facets === undefined || facets.length === 0) {
    return [segment(text, undefined)];
  }

  const segments: RichtextSegment[] = [];
  const utf16Length = text.length;
  let utf16Cursor = 0;
  let utf8Cursor = 0;

  /**
   * Advances from `startUtf16` until the shared UTF-8 cursor reaches
   * `endUtf8` (or the text ends) and returns the new UTF-16 position.
   * Does nothing when the UTF-8 cursor is already at or past `endUtf8`.
   */
  const advanceCursor = (startUtf16: number, endUtf8: number): number => {
    let curs = startUtf16;

    // Fast path: consume runs of 8 ASCII code units at a time. The bounds are
    // checked *before* consuming anything so a zero-length or already-passed
    // range never swallows a character.
    while (utf8Cursor + 8 <= endUtf8 && curs + 8 <= utf16Length) {
      const combined =
        text.charCodeAt(curs) |
        text.charCodeAt(curs + 1) |
        text.charCodeAt(curs + 2) |
        text.charCodeAt(curs + 3) |
        text.charCodeAt(curs + 4) |
        text.charCodeAt(curs + 5) |
        text.charCodeAt(curs + 6) |
        text.charCodeAt(curs + 7);

      // Any non-ASCII unit sets a bit at or above 0x80 in the OR-ed value.
      if (combined >= 0x80) {
        break;
      }

      curs += 8;
      utf8Cursor += 8;
    }

    // Process the remaining characters one at a time.
    while (utf8Cursor < endUtf8 && curs < utf16Length) {
      const code = text.charCodeAt(curs);

      if (code < 0x80) {
        curs += 1;
        utf8Cursor += 1;
      } else if (code < 0x800) {
        curs += 1;
        utf8Cursor += 2;
      } else if (code >= 0xd800 && code <= 0xdbff) {
        // High surrogate: only a well-formed pair encodes to 4 bytes.
        // `charCodeAt` past the end yields NaN, which fails both comparisons.
        const next = text.charCodeAt(curs + 1);
        if (next >= 0xdc00 && next <= 0xdfff) {
          curs += 2;
          utf8Cursor += 4;
        } else {
          // Lone surrogate: UTF-8 encoders emit U+FFFD, which is 3 bytes.
          curs += 1;
          utf8Cursor += 3;
        }
      } else {
        // Remaining BMP code units (including lone low surrogates): 3 bytes.
        curs += 1;
        utf8Cursor += 3;
      }
    }

    return curs;
  };

  for (const facet of facets) {
    const { byteStart, byteEnd } = facet.index;
    const features = facet.features;

    // Skip inverted ranges and facets that carry no features.
    if (byteStart > byteEnd || features.length === 0) {
      continue;
    }

    // Plain text between the previous facet and this one.
    if (utf8Cursor < byteStart) {
      const nextUtf16Cursor = advanceCursor(utf16Cursor, byteStart);
      if (nextUtf16Cursor > utf16Cursor) {
        segments.push(
          segment(text.slice(utf16Cursor, nextUtf16Cursor), undefined),
        );
      }

      utf16Cursor = nextUtf16Cursor;
    }

    // The annotated range itself; empty ranges produce no segment.
    const facetEndUtf16 = advanceCursor(utf16Cursor, byteEnd);
    if (facetEndUtf16 > utf16Cursor) {
      segments.push(segment(text.slice(utf16Cursor, facetEndUtf16), features));
    }

    utf16Cursor = facetEndUtf16;
  }

  // Trailing plain text after the last facet.
  if (utf16Cursor < utf16Length) {
    segments.push(segment(text.slice(utf16Cursor), undefined));
  }

  return segments;
};