Live video on the AT Protocol
// Taken from ATcute's richtext-segmenter:
// https://github.com/mary-ext/atcute/blob/trunk/packages/bluesky/richtext-segmenter/lib/index.ts
// Reproduced here because we need to import types from @atproto/api, not @atcute/bsky.
4import { Main } from "@atproto/api/dist/client/types/app/bsky/richtext/facet";
5
/** Extracts the element type of an array type; resolves to `never` for non-arrays. */
type UnwrapArray<T> = T extends (infer V)[] ? V : never;

/** A richtext facet, aliased from the `Main` type of the imported facet module. */
export type Facet = Main;
/** A single entry of a facet's `features` array. */
export type FacetFeature = UnwrapArray<Facet["features"]>;
10
/** A contiguous slice of richtext together with the facet features applied to it. */
export interface RichtextSegment {
  /** The text of this slice. */
  text: string;
  /** Features applied to this slice, or `undefined` when the slice is plain text. */
  features: FacetFeature[] | undefined;
}
15
/**
 * Builds a single richtext segment.
 *
 * Features are only kept when `text` is non-empty; an empty segment always
 * carries `features: undefined`.
 *
 * @param text - The slice of text covered by the segment.
 * @param features - Features that apply to the slice, or `undefined` for plain text.
 * @returns The segment object.
 */
const segment = (
  text: string,
  features: FacetFeature[] | undefined,
): RichtextSegment => {
  return { text, features: text.length > 0 ? features : undefined };
};
22
/**
 * Splits `text` into consecutive segments according to `facets`.
 *
 * Facet indices are UTF-8 byte offsets while JavaScript strings are indexed in
 * UTF-16 code units, so the text is walked once while tracking both positions.
 * Facets are processed in array order. Text between facets (and after the last
 * one) is emitted as plain segments with `features: undefined`.
 *
 * A facet is skipped when `byteStart > byteEnd` or it has no features. A facet
 * that covers no bytes past the current position (zero-width, or ending at or
 * before the cursor because it overlaps an earlier facet or is out of order)
 * produces no segment.
 *
 * @param text - The full richtext string.
 * @param facets - Facets to apply; `undefined` or empty yields one plain segment.
 * @returns The segments in text order; their `text` values concatenate to `text`.
 */
export const segmentize = (
  text: string,
  facets: Facet[] | undefined,
): RichtextSegment[] => {
  if (facets === undefined || facets.length === 0) {
    return [segment(text, undefined)];
  }

  const segments: RichtextSegment[] = [];
  const utf16Length = text.length;
  let utf16Cursor = 0;
  let utf8Cursor = 0;

  /**
   * Advances from UTF-16 index `startUtf16` until the shared `utf8Cursor`
   * reaches `endUtf8` (or the text ends) and returns the new UTF-16 index.
   * Updates `utf8Cursor` as a side effect. If `endUtf8` falls inside a
   * multi-byte character, the whole character is consumed.
   */
  const advanceCursor = (startUtf16: number, endUtf8: number): number => {
    let curs = startUtf16;

    // Fast path for runs of ASCII text. The `utf8Cursor < endUtf8` guard is
    // required: without it a target at or behind the cursor would still
    // consume one ASCII character and attribute it to the wrong segment.
    if (utf8Cursor < endUtf8 && text.charCodeAt(curs) < 0x80) {
      curs += 1;
      utf8Cursor += 1;

      // Check eight code units at a time; OR-ing them together detects any
      // non-ASCII unit with a single comparison.
      while (utf8Cursor + 8 <= endUtf8 && curs + 8 <= utf16Length) {
        const char1 = text.charCodeAt(curs);
        const char2 = text.charCodeAt(curs + 1);
        const char3 = text.charCodeAt(curs + 2);
        const char4 = text.charCodeAt(curs + 3);
        const char5 = text.charCodeAt(curs + 4);
        const char6 = text.charCodeAt(curs + 5);
        const char7 = text.charCodeAt(curs + 6);
        const char8 = text.charCodeAt(curs + 7);

        if (
          (char1 | char2 | char3 | char4 | char5 | char6 | char7 | char8) >=
          0x80
        ) {
          break;
        }

        curs += 8;
        utf8Cursor += 8;
      }
    }

    // Process the remaining characters one at a time, adding each one's
    // UTF-8 encoded length to the byte cursor.
    while (utf8Cursor < endUtf8 && curs < utf16Length) {
      const code = text.charCodeAt(curs);

      if (code < 0x80) {
        curs += 1;
        utf8Cursor += 1;
      } else if (code < 0x800) {
        curs += 1;
        utf8Cursor += 2;
      } else if (code < 0xd800 || code > 0xdbff) {
        curs += 1;
        utf8Cursor += 3;
      } else {
        // High surrogate: treated as the start of a surrogate pair, i.e. two
        // UTF-16 code units encoding one 4-byte UTF-8 sequence.
        curs += 2;
        utf8Cursor += 4;
      }
    }

    return curs;
  };

  for (const facet of facets) {
    const { byteStart, byteEnd } = facet.index;
    const features = facet.features;

    if (byteStart > byteEnd || features.length === 0) {
      continue;
    }

    // Plain text between the previous position and the start of this facet.
    if (utf8Cursor < byteStart) {
      const nextUtf16Cursor = advanceCursor(utf16Cursor, byteStart);
      if (nextUtf16Cursor > utf16Cursor) {
        segments.push(
          segment(text.slice(utf16Cursor, nextUtf16Cursor), undefined),
        );
      }

      utf16Cursor = nextUtf16Cursor;
    }

    // The text covered by the facet itself.
    const nextUtf16Cursor = advanceCursor(utf16Cursor, byteEnd);
    if (nextUtf16Cursor > utf16Cursor) {
      segments.push(
        segment(text.slice(utf16Cursor, nextUtf16Cursor), features),
      );
    }

    utf16Cursor = nextUtf16Cursor;
  }

  // Whatever is left after the last facet is plain text.
  if (utf16Cursor < utf16Length) {
    segments.push(segment(text.slice(utf16Cursor), undefined));
  }

  return segments;
};