1package search
2
3import (
4 "log/slog"
5 "net/url"
6 "strings"
7 "time"
8
9 appbsky "github.com/bluesky-social/indigo/api/bsky"
10 "github.com/bluesky-social/indigo/atproto/identity"
11 "github.com/bluesky-social/indigo/atproto/syntax"
12
13 "github.com/rivo/uniseg"
14)
15
// ProfileDoc is the search index document for an actor profile, as produced
// by TransformProfile. Field order and JSON tags define the serialized shape.
type ProfileDoc struct {
	// Bookkeeping and identity.

	// DocIndexTs is the datetime string recorded when the document was built.
	DocIndexTs string `json:"doc_index_ts"`
	// DID is the account identifier; it doubles as the index document ID.
	DID string `json:"did"`
	// RecordCID is the CID of the profile record the document was built from.
	RecordCID string `json:"record_cid"`
	// Handle is the account handle, or "" when the handle is invalid.
	Handle string `json:"handle"`

	// Values copied from the profile record; omitted from JSON when nil.

	DisplayName *string `json:"display_name,omitempty"`
	Description *string `json:"description,omitempty"`

	// Derived, multi-valued fields; each is omitted from JSON when empty.

	// ImgAltText is a placeholder: TransformProfile leaves it empty.
	ImgAltText []string `json:"img_alt_text,omitempty"`
	// SelfLabel lists the values of the self-labels on the record.
	SelfLabel []string `json:"self_label,omitempty"`
	// URL and Domain are not populated by TransformProfile.
	URL    []string `json:"url,omitempty"`
	Domain []string `json:"domain,omitempty"`
	// Tag and Emoji are only computed when the profile has a description.
	Tag   []string `json:"tag,omitempty"`
	Emoji []string `json:"emoji,omitempty"`

	// HasAvatar and HasBanner report whether the record carries that image.
	HasAvatar bool `json:"has_avatar"`
	HasBanner bool `json:"has_banner"`
}
32
// PostDoc is the search index document for a feed post, as produced by
// TransformPost. Field order and JSON tags define the serialized shape.
type PostDoc struct {
	// Bookkeeping and identity.

	// DocIndexTs is the datetime string recorded when the document was built.
	DocIndexTs string `json:"doc_index_ts"`
	// DID is the identifier of the account that owns the post.
	DID string `json:"did"`
	// RecordRkey and RecordCID are the record key and CID of the post record.
	RecordRkey string `json:"record_rkey"`
	RecordCID  string `json:"record_cid"`
	// CreatedAt is the post's creation datetime. It is omitted when the record
	// has none or it cannot be parsed, and is replaced by the indexing time
	// when it lies more than a few minutes in the future.
	CreatedAt *string `json:"created_at,omitempty"`

	// Text and language.

	// Text is the post text, copied verbatim.
	Text string `json:"text"`
	// TextJA points at the same text when it is detected as Japanese.
	TextJA *string `json:"text_ja,omitempty"`
	// LangCode holds the language tags declared on the post.
	LangCode []string `json:"lang_code,omitempty"`
	// LangCodeIso2 holds the lowercased two-character prefixes of LangCode.
	LangCodeIso2 []string `json:"lang_code_iso2,omitempty"`

	// References to other accounts and records.

	// MentionDID lists the DIDs of mention facets.
	MentionDID []string `json:"mention_did,omitempty"`
	// EmbedATURI is the URI of the embedded record, if any.
	EmbedATURI *string `json:"embed_aturi,omitempty"`
	// ReplyRootATURI is the URI of the thread root when the post is a reply.
	ReplyRootATURI *string `json:"reply_root_aturi,omitempty"`

	// Embedded images.

	// EmbedImgCount is the number of embedded images.
	EmbedImgCount int `json:"embed_img_count"`
	// EmbedImgAltText lists the non-empty alt texts of embedded images.
	EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"`
	// EmbedImgAltTextJA is the subset of alt texts detected as Japanese.
	EmbedImgAltTextJA []string `json:"embed_img_alt_text_ja,omitempty"`

	// Derived, multi-valued fields; each is omitted from JSON when empty.

	// SelfLabel lists the values of the self-labels on the record.
	SelfLabel []string `json:"self_label,omitempty"`
	// URL lists normalized link-facet and external-embed URLs.
	URL []string `json:"url,omitempty"`
	// Domain lists the hostnames of the entries in URL that parse.
	Domain []string `json:"domain,omitempty"`
	// Tag lists the de-duplicated tags from tag facets and the record's tags.
	Tag []string `json:"tag,omitempty"`
	// Emoji lists the distinct emoji found in Text.
	Emoji []string `json:"emoji,omitempty"`
}
55
56// Returns the search index document ID (`_id`) for this document.
57//
58// This identifier should be URL safe and not contain a slash ("/").
59func (d *ProfileDoc) DocId() string {
60 return d.DID
61}
62
63// Returns the search index document ID (`_id`) for this document.
64//
65// This identifier should be URL safe and not contain a slash ("/").
66func (d *PostDoc) DocId() string {
67 return d.DID + "_" + d.RecordRkey
68}
69
70func TransformProfile(profile *appbsky.ActorProfile, ident *identity.Identity, cid string) ProfileDoc {
71 // TODO: placeholder for future alt text on profile blobs
72 var altText []string
73 var tags []string
74 var emojis []string
75 if profile.Description != nil {
76 tags = parseProfileTags(profile)
77 emojis = parseEmojis(*profile.Description)
78 }
79 var selfLabels []string
80 if profile.Labels != nil && profile.Labels.LabelDefs_SelfLabels != nil {
81 for _, le := range profile.Labels.LabelDefs_SelfLabels.Values {
82 selfLabels = append(selfLabels, le.Val)
83 }
84 }
85 handle := ""
86 if !ident.Handle.IsInvalidHandle() {
87 handle = ident.Handle.String()
88 }
89 return ProfileDoc{
90 DocIndexTs: syntax.DatetimeNow().String(),
91 DID: ident.DID.String(),
92 RecordCID: cid,
93 Handle: handle,
94 DisplayName: profile.DisplayName,
95 Description: profile.Description,
96 ImgAltText: altText,
97 SelfLabel: selfLabels,
98 Tag: tags,
99 Emoji: emojis,
100 HasAvatar: profile.Avatar != nil,
101 HasBanner: profile.Banner != nil,
102 }
103}
104
105func TransformPost(post *appbsky.FeedPost, did syntax.DID, rkey, cid string) PostDoc {
106 altText := []string{}
107 if post.Embed != nil && post.Embed.EmbedImages != nil {
108 for _, img := range post.Embed.EmbedImages.Images {
109 if img.Alt != "" {
110 altText = append(altText, img.Alt)
111 }
112 }
113 }
114 var langCodeIso2 []string
115 for _, lang := range post.Langs {
116 // TODO: include an actual language code map to go from 3char to 2char
117 prefix := strings.SplitN(lang, "-", 2)[0]
118 if len(prefix) == 2 {
119 langCodeIso2 = append(langCodeIso2, strings.ToLower(prefix))
120 }
121 }
122 var mentionDIDs []string
123 var urls []string
124 for _, facet := range post.Facets {
125 for _, feat := range facet.Features {
126 if feat.RichtextFacet_Mention != nil {
127 mentionDIDs = append(mentionDIDs, feat.RichtextFacet_Mention.Did)
128 }
129 if feat.RichtextFacet_Link != nil {
130 urls = append(urls, feat.RichtextFacet_Link.Uri)
131 }
132 }
133 }
134 var replyRootATURI *string
135 if post.Reply != nil {
136 replyRootATURI = &(post.Reply.Root.Uri)
137 }
138 if post.Embed != nil && post.Embed.EmbedExternal != nil {
139 urls = append(urls, post.Embed.EmbedExternal.External.Uri)
140 }
141 var embedATURI *string
142 if post.Embed != nil && post.Embed.EmbedRecord != nil {
143 embedATURI = &post.Embed.EmbedRecord.Record.Uri
144 }
145 if post.Embed != nil && post.Embed.EmbedRecordWithMedia != nil {
146 embedATURI = &post.Embed.EmbedRecordWithMedia.Record.Record.Uri
147 }
148 var embedImgCount int
149 var embedImgAltText []string
150 var embedImgAltTextJA []string
151 if post.Embed != nil && post.Embed.EmbedImages != nil {
152 embedImgCount = len(post.Embed.EmbedImages.Images)
153 for _, img := range post.Embed.EmbedImages.Images {
154 if img.Alt != "" {
155 embedImgAltText = append(embedImgAltText, img.Alt)
156 if containsJapanese(img.Alt) {
157 embedImgAltTextJA = append(embedImgAltTextJA, img.Alt)
158 }
159 }
160 }
161 }
162
163 if post.Embed != nil &&
164 post.Embed.EmbedRecordWithMedia != nil &&
165 post.Embed.EmbedRecordWithMedia.Media != nil &&
166 post.Embed.EmbedRecordWithMedia.Media.EmbedImages != nil &&
167 len(post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images) > 0 {
168 embedImgCount += len(post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images)
169 for _, img := range post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images {
170 if img.Alt != "" {
171 embedImgAltText = append(embedImgAltText, img.Alt)
172 if containsJapanese(img.Alt) {
173 embedImgAltTextJA = append(embedImgAltTextJA, img.Alt)
174 }
175 }
176 }
177 }
178
179 var selfLabels []string
180 if post.Labels != nil && post.Labels.LabelDefs_SelfLabels != nil {
181 for _, le := range post.Labels.LabelDefs_SelfLabels.Values {
182 selfLabels = append(selfLabels, le.Val)
183 }
184 }
185
186 var domains []string
187 for i, raw := range urls {
188 clean := NormalizeLossyURL(raw)
189 urls[i] = clean
190 u, err := url.Parse(clean)
191 if nil == err {
192 domains = append(domains, u.Hostname())
193 }
194 }
195
196 doc := PostDoc{
197 DocIndexTs: syntax.DatetimeNow().String(),
198 DID: did.String(),
199 RecordRkey: rkey,
200 RecordCID: cid,
201 Text: post.Text,
202 LangCode: post.Langs,
203 LangCodeIso2: langCodeIso2,
204 MentionDID: mentionDIDs,
205 EmbedATURI: embedATURI,
206 ReplyRootATURI: replyRootATURI,
207 EmbedImgCount: embedImgCount,
208 EmbedImgAltText: embedImgAltText,
209 EmbedImgAltTextJA: embedImgAltTextJA,
210 SelfLabel: selfLabels,
211 URL: urls,
212 Domain: domains,
213 Tag: parsePostTags(post),
214 Emoji: parseEmojis(post.Text),
215 }
216
217 if containsJapanese(post.Text) {
218 doc.TextJA = &post.Text
219 }
220
221 if post.CreatedAt != "" {
222 // there are some old bad timestamps out there!
223 dt, err := syntax.ParseDatetimeLenient(post.CreatedAt)
224 if nil == err { // *not* an error
225 // not more than a few minutes in the future
226 if time.Since(dt.Time()) >= -1*5*time.Minute {
227 s := dt.String()
228 doc.CreatedAt = &s
229 } else {
230 slog.Warn("rejecting future post CreatedAt", "datetime", dt.String(), "did", did.String(), "rkey", rkey)
231 s := syntax.DatetimeNow().String()
232 doc.CreatedAt = &s
233 }
234 }
235 }
236
237 return doc
238}
239
// dedupeStrings returns the distinct values of in, keeping the first
// occurrence of each and preserving their relative order. The result is nil
// when in is empty.
func dedupeStrings(in []string) []string {
	var uniq []string
	known := make(map[string]struct{}, len(in))
	for i := range in {
		s := in[i]
		if _, dup := known[s]; dup {
			continue
		}
		known[s] = struct{}{}
		uniq = append(uniq, s)
	}
	return uniq
}
251
252func parseProfileTags(p *appbsky.ActorProfile) []string {
253 // TODO: waiting for profile tag lexicon support
254 var ret []string = []string{}
255 if len(ret) == 0 {
256 return nil
257 }
258 return dedupeStrings(ret)
259}
260
261func parsePostTags(p *appbsky.FeedPost) []string {
262 var ret []string = []string{}
263 for _, facet := range p.Facets {
264 for _, feat := range facet.Features {
265 if feat.RichtextFacet_Tag != nil {
266 ret = append(ret, feat.RichtextFacet_Tag.Tag)
267 }
268 }
269 }
270 ret = append(ret, p.Tags...)
271 if len(ret) == 0 {
272 return nil
273 }
274 return dedupeStrings(ret)
275}
276
277func parseEmojis(s string) []string {
278 var ret []string = []string{}
279 seen := make(map[string]bool)
280 gr := uniseg.NewGraphemes(s)
281 for gr.Next() {
282 // check if this grapheme cluster starts with an emoji rune (Unicode codepoint, int32)
283 firstRune := gr.Runes()[0]
284 if (firstRune >= 0x1F000 && firstRune <= 0x1FFFF) || (firstRune >= 0x2600 && firstRune <= 0x26FF) {
285 emoji := gr.Str()
286 if seen[emoji] == false {
287 ret = append(ret, emoji)
288 seen[emoji] = true
289 }
290 }
291 }
292 if len(ret) == 0 {
293 return nil
294 }
295 return ret
296}