js/components/src/lib/facet.ts at next · stream.place/streamplace

stream.place / streamplace
Live video on the AT Protocol
streamplace / js / components / src / lib / facet.ts
at next 131 lines 3.5 kB view raw
  // Taken from ATcute's richtext-segmenter:
  // https://github.com/mary-ext/atcute/blob/trunk/packages/bluesky/richtext-segmenter/lib/index.ts
  // Copied into this repo because we need to import the facet types from @atproto/api rather than @atcute/bsky.
  4import { Main } from "@atproto/api/dist/client/types/app/bsky/richtext/facet";
  5
  /** Resolves to the element type of an array type, or `never` for anything else. */
  type UnwrapArray<T> = T extends (infer V)[] ? V : never;

  /** A rich text facet; alias of the imported `Main` facet type. */
  export type Facet = Main;
  /** The type of a single entry in a facet's `features` array. */
  export type FacetFeature = UnwrapArray<Facet["features"]>;

  /** A contiguous run of text together with the facet features covering it. */
  export interface RichtextSegment {
    /** The text of this segment. */
    text: string;
    /** Features that apply to `text`, or `undefined` when it is plain text. */
    features: FacetFeature[] | undefined;
  }
 15
 /**
  * Builds a segment. Features are dropped when `text` is empty so that an
  * empty segment never carries facet features.
  */
 const segment = (
   text: string,
   features: FacetFeature[] | undefined,
 ): RichtextSegment => {
   return { text, features: text.length > 0 ? features : undefined };
 };

 /**
  * Splits `text` into consecutive segments according to `facets`.
  *
  * Facet ranges (`index.byteStart` / `index.byteEnd`) are treated as UTF-8 byte
  * offsets and are translated to UTF-16 string offsets while walking the text
  * once, front to back. Facets are processed in the order given, so they are
  * expected to be in ascending order; a facet whose range has already been
  * consumed (duplicate, fully overlapped or out of order) contributes nothing.
  *
  * @param text - The text to split.
  * @param facets - Facets describing annotated byte ranges of `text`.
  * @returns The segments in text order. Text outside any facet is returned with
  *   `features: undefined`. When `facets` is `undefined` or empty, a single
  *   segment holding the whole text is returned.
  */
 export const segmentize = (
   text: string,
   facets: Facet[] | undefined,
 ): RichtextSegment[] => {
   if (facets === undefined || facets.length === 0) {
     return [segment(text, undefined)];
   }

   const segments: RichtextSegment[] = [];
   const utf16Length = text.length;
   let utf16Cursor = 0;
   let utf8Cursor = 0;

   /**
    * Advances from `startUtf16` until the shared `utf8Cursor` reaches `endUtf8`
    * (or the text ends) and returns the resulting UTF-16 offset. Returns
    * `startUtf16` unchanged when `utf8Cursor` is already at or past `endUtf8`.
    */
   const advanceCursor = (startUtf16: number, endUtf8: number): number => {
     let curs = startUtf16;

     // Fast path for ASCII runs. The `utf8Cursor < endUtf8` guard is essential:
     // without it a call whose target has already been reached would still
     // consume one character and attribute it to the wrong segment.
     if (utf8Cursor < endUtf8 && text.charCodeAt(curs) < 0x80) {
       curs += 1;
       utf8Cursor += 1;

       // Batch processing: skip 8 code units at a time while all are ASCII.
       while (utf8Cursor + 8 <= endUtf8 && curs + 8 <= utf16Length) {
         const char1 = text.charCodeAt(curs);
         const char2 = text.charCodeAt(curs + 1);
         const char3 = text.charCodeAt(curs + 2);
         const char4 = text.charCodeAt(curs + 3);
         const char5 = text.charCodeAt(curs + 4);
         const char6 = text.charCodeAt(curs + 5);
         const char7 = text.charCodeAt(curs + 6);
         const char8 = text.charCodeAt(curs + 7);

         // OR-ing the code units keeps any bit >= 0x80 set if one is non-ASCII.
         if (
           (char1 | char2 | char3 | char4 | char5 | char6 | char7 | char8) >=
           0x80
         ) {
           break;
         }

         curs += 8;
         utf8Cursor += 8;
       }
     }

     // Process the remaining characters individually.
     while (utf8Cursor < endUtf8 && curs < utf16Length) {
       const code = text.charCodeAt(curs);

       if (code < 0x80) {
         curs += 1;
         utf8Cursor += 1;
       } else if (code < 0x800) {
         curs += 1;
         utf8Cursor += 2;
       } else if (code >= 0xd800 && code <= 0xdbff) {
         // High surrogate: only a well-formed pair is one 4-byte code point.
         // `charCodeAt` yields NaN past the end, which fails both comparisons.
         const next = text.charCodeAt(curs + 1);
         if (next >= 0xdc00 && next <= 0xdfff) {
           curs += 2;
           utf8Cursor += 4;
         } else {
           // Lone high surrogate: UTF-8 encoders emit U+FFFD (3 bytes) for it.
           curs += 1;
           utf8Cursor += 3;
         }
       } else {
         // Rest of the BMP, including lone low surrogates (encoded as U+FFFD).
         curs += 1;
         utf8Cursor += 3;
       }
     }

     return curs;
   };

   // Process facets
   for (const facet of facets) {
     const { byteStart, byteEnd } = facet.index;
     const features = facet.features;

     // Skip inverted or zero-length ranges and facets without features.
     if (byteStart >= byteEnd || features.length === 0) {
       continue;
     }

     // Plain text between the previous facet and this one.
     if (utf8Cursor < byteStart) {
       const nextUtf16Cursor = advanceCursor(utf16Cursor, byteStart);
       if (nextUtf16Cursor > utf16Cursor) {
         segments.push(
           segment(text.slice(utf16Cursor, nextUtf16Cursor), undefined),
         );
       }

       utf16Cursor = nextUtf16Cursor;
     }

     // The text covered by the facet itself.
     const nextUtf16Cursor = advanceCursor(utf16Cursor, byteEnd);
     if (nextUtf16Cursor > utf16Cursor) {
       segments.push(
         segment(text.slice(utf16Cursor, nextUtf16Cursor), features),
       );
     }

     utf16Cursor = nextUtf16Cursor;
   }

   // Handle remaining text
   if (utf16Cursor < utf16Length) {
     segments.push(segment(text.slice(utf16Cursor), undefined));
   }

   return segments;
 };