fork of indigo with slightly nicer lexgen
at main 8.9 kB view raw
1package search 2 3import ( 4 "log/slog" 5 "net/url" 6 "strings" 7 "time" 8 9 appbsky "github.com/bluesky-social/indigo/api/bsky" 10 "github.com/bluesky-social/indigo/atproto/identity" 11 "github.com/bluesky-social/indigo/atproto/syntax" 12 13 "github.com/rivo/uniseg" 14) 15 16type ProfileDoc struct { 17 DocIndexTs string `json:"doc_index_ts"` 18 DID string `json:"did"` 19 RecordCID string `json:"record_cid"` 20 Handle string `json:"handle"` 21 DisplayName *string `json:"display_name,omitempty"` 22 Description *string `json:"description,omitempty"` 23 ImgAltText []string `json:"img_alt_text,omitempty"` 24 SelfLabel []string `json:"self_label,omitempty"` 25 URL []string `json:"url,omitempty"` 26 Domain []string `json:"domain,omitempty"` 27 Tag []string `json:"tag,omitempty"` 28 Emoji []string `json:"emoji,omitempty"` 29 HasAvatar bool `json:"has_avatar"` 30 HasBanner bool `json:"has_banner"` 31} 32 33type PostDoc struct { 34 DocIndexTs string `json:"doc_index_ts"` 35 DID string `json:"did"` 36 RecordRkey string `json:"record_rkey"` 37 RecordCID string `json:"record_cid"` 38 CreatedAt *string `json:"created_at,omitempty"` 39 Text string `json:"text"` 40 TextJA *string `json:"text_ja,omitempty"` 41 LangCode []string `json:"lang_code,omitempty"` 42 LangCodeIso2 []string `json:"lang_code_iso2,omitempty"` 43 MentionDID []string `json:"mention_did,omitempty"` 44 EmbedATURI *string `json:"embed_aturi,omitempty"` 45 ReplyRootATURI *string `json:"reply_root_aturi,omitempty"` 46 EmbedImgCount int `json:"embed_img_count"` 47 EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"` 48 EmbedImgAltTextJA []string `json:"embed_img_alt_text_ja,omitempty"` 49 SelfLabel []string `json:"self_label,omitempty"` 50 URL []string `json:"url,omitempty"` 51 Domain []string `json:"domain,omitempty"` 52 Tag []string `json:"tag,omitempty"` 53 Emoji []string `json:"emoji,omitempty"` 54} 55 56// Returns the search index document ID (`_id`) for this document. 57// 58// This identifier should be URL safe and not contain a slash ("/"). 59func (d *ProfileDoc) DocId() string { 60 return d.DID 61} 62 63// Returns the search index document ID (`_id`) for this document. 64// 65// This identifier should be URL safe and not contain a slash ("/"). 66func (d *PostDoc) DocId() string { 67 return d.DID + "_" + d.RecordRkey 68} 69 70func TransformProfile(profile *appbsky.ActorProfile, ident *identity.Identity, cid string) ProfileDoc { 71 // TODO: placeholder for future alt text on profile blobs 72 var altText []string 73 var tags []string 74 var emojis []string 75 if profile.Description != nil { 76 tags = parseProfileTags(profile) 77 emojis = parseEmojis(*profile.Description) 78 } 79 var selfLabels []string 80 if profile.Labels != nil && profile.Labels.LabelDefs_SelfLabels != nil { 81 for _, le := range profile.Labels.LabelDefs_SelfLabels.Values { 82 selfLabels = append(selfLabels, le.Val) 83 } 84 } 85 handle := "" 86 if !ident.Handle.IsInvalidHandle() { 87 handle = ident.Handle.String() 88 } 89 return ProfileDoc{ 90 DocIndexTs: syntax.DatetimeNow().String(), 91 DID: ident.DID.String(), 92 RecordCID: cid, 93 Handle: handle, 94 DisplayName: profile.DisplayName, 95 Description: profile.Description, 96 ImgAltText: altText, 97 SelfLabel: selfLabels, 98 Tag: tags, 99 Emoji: emojis, 100 HasAvatar: profile.Avatar != nil, 101 HasBanner: profile.Banner != nil, 102 } 103} 104 105func TransformPost(post *appbsky.FeedPost, did syntax.DID, rkey, cid string) PostDoc { 106 altText := []string{} 107 if post.Embed != nil && post.Embed.EmbedImages != nil { 108 for _, img := range post.Embed.EmbedImages.Images { 109 if img.Alt != "" { 110 altText = append(altText, img.Alt) 111 } 112 } 113 } 114 var langCodeIso2 []string 115 for _, lang := range post.Langs { 116 // TODO: include an actual language code map to go from 3char to 2char 117 prefix := strings.SplitN(lang, "-", 2)[0] 118 if len(prefix) == 2 { 119 langCodeIso2 = append(langCodeIso2, strings.ToLower(prefix)) 120 } 121 } 122 var mentionDIDs []string 123 var urls []string 124 for _, facet := range post.Facets { 125 for _, feat := range facet.Features { 126 if feat.RichtextFacet_Mention != nil { 127 mentionDIDs = append(mentionDIDs, feat.RichtextFacet_Mention.Did) 128 } 129 if feat.RichtextFacet_Link != nil { 130 urls = append(urls, feat.RichtextFacet_Link.Uri) 131 } 132 } 133 } 134 var replyRootATURI *string 135 if post.Reply != nil { 136 replyRootATURI = &(post.Reply.Root.Uri) 137 } 138 if post.Embed != nil && post.Embed.EmbedExternal != nil { 139 urls = append(urls, post.Embed.EmbedExternal.External.Uri) 140 } 141 var embedATURI *string 142 if post.Embed != nil && post.Embed.EmbedRecord != nil { 143 embedATURI = &post.Embed.EmbedRecord.Record.Uri 144 } 145 if post.Embed != nil && post.Embed.EmbedRecordWithMedia != nil { 146 embedATURI = &post.Embed.EmbedRecordWithMedia.Record.Record.Uri 147 } 148 var embedImgCount int 149 var embedImgAltText []string 150 var embedImgAltTextJA []string 151 if post.Embed != nil && post.Embed.EmbedImages != nil { 152 embedImgCount = len(post.Embed.EmbedImages.Images) 153 for _, img := range post.Embed.EmbedImages.Images { 154 if img.Alt != "" { 155 embedImgAltText = append(embedImgAltText, img.Alt) 156 if containsJapanese(img.Alt) { 157 embedImgAltTextJA = append(embedImgAltTextJA, img.Alt) 158 } 159 } 160 } 161 } 162 163 if post.Embed != nil && 164 post.Embed.EmbedRecordWithMedia != nil && 165 post.Embed.EmbedRecordWithMedia.Media != nil && 166 post.Embed.EmbedRecordWithMedia.Media.EmbedImages != nil && 167 len(post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images) > 0 { 168 embedImgCount += len(post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images) 169 for _, img := range post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images { 170 if img.Alt != "" { 171 embedImgAltText = append(embedImgAltText, img.Alt) 172 if containsJapanese(img.Alt) { 173 embedImgAltTextJA = append(embedImgAltTextJA, img.Alt) 174 } 175 } 176 } 177 } 178 179 var selfLabels []string 180 if post.Labels != nil && post.Labels.LabelDefs_SelfLabels != nil { 181 for _, le := range post.Labels.LabelDefs_SelfLabels.Values { 182 selfLabels = append(selfLabels, le.Val) 183 } 184 } 185 186 var domains []string 187 for i, raw := range urls { 188 clean := NormalizeLossyURL(raw) 189 urls[i] = clean 190 u, err := url.Parse(clean) 191 if nil == err { 192 domains = append(domains, u.Hostname()) 193 } 194 } 195 196 doc := PostDoc{ 197 DocIndexTs: syntax.DatetimeNow().String(), 198 DID: did.String(), 199 RecordRkey: rkey, 200 RecordCID: cid, 201 Text: post.Text, 202 LangCode: post.Langs, 203 LangCodeIso2: langCodeIso2, 204 MentionDID: mentionDIDs, 205 EmbedATURI: embedATURI, 206 ReplyRootATURI: replyRootATURI, 207 EmbedImgCount: embedImgCount, 208 EmbedImgAltText: embedImgAltText, 209 EmbedImgAltTextJA: embedImgAltTextJA, 210 SelfLabel: selfLabels, 211 URL: urls, 212 Domain: domains, 213 Tag: parsePostTags(post), 214 Emoji: parseEmojis(post.Text), 215 } 216 217 if containsJapanese(post.Text) { 218 doc.TextJA = &post.Text 219 } 220 221 if post.CreatedAt != "" { 222 // there are some old bad timestamps out there! 223 dt, err := syntax.ParseDatetimeLenient(post.CreatedAt) 224 if nil == err { // *not* an error 225 // not more than a few minutes in the future 226 if time.Since(dt.Time()) >= -1*5*time.Minute { 227 s := dt.String() 228 doc.CreatedAt = &s 229 } else { 230 slog.Warn("rejecting future post CreatedAt", "datetime", dt.String(), "did", did.String(), "rkey", rkey) 231 s := syntax.DatetimeNow().String() 232 doc.CreatedAt = &s 233 } 234 } 235 } 236 237 return doc 238} 239 240func dedupeStrings(in []string) []string { 241 var out []string 242 seen := make(map[string]bool) 243 for _, v := range in { 244 if !seen[v] { 245 out = append(out, v) 246 seen[v] = true 247 } 248 } 249 return out 250} 251 252func parseProfileTags(p *appbsky.ActorProfile) []string { 253 // TODO: waiting for profile tag lexicon support 254 var ret []string = []string{} 255 if len(ret) == 0 { 256 return nil 257 } 258 return dedupeStrings(ret) 259} 260 261func parsePostTags(p *appbsky.FeedPost) []string { 262 var ret []string = []string{} 263 for _, facet := range p.Facets { 264 for _, feat := range facet.Features { 265 if feat.RichtextFacet_Tag != nil { 266 ret = append(ret, feat.RichtextFacet_Tag.Tag) 267 } 268 } 269 } 270 ret = append(ret, p.Tags...) 271 if len(ret) == 0 { 272 return nil 273 } 274 return dedupeStrings(ret) 275} 276 277func parseEmojis(s string) []string { 278 var ret []string = []string{} 279 seen := make(map[string]bool) 280 gr := uniseg.NewGraphemes(s) 281 for gr.Next() { 282 // check if this grapheme cluster starts with an emoji rune (Unicode codepoint, int32) 283 firstRune := gr.Runes()[0] 284 if (firstRune >= 0x1F000 && firstRune <= 0x1FFFF) || (firstRune >= 0x2600 && firstRune <= 0x26FF) { 285 emoji := gr.Str() 286 if seen[emoji] == false { 287 ret = append(ret, emoji) 288 seen[emoji] = true 289 } 290 } 291 } 292 if len(ret) == 0 { 293 return nil 294 } 295 return ret 296}