fork of indigo with slightly nicer lexgen

First pass at Japanese-specific post search

This is only for posts (not profiles).

+14
search/japanese.go
// japaneseRegex matches a single code point from either kana range:
//
//	U+3040 - U+30FF: hiragana and katakana (Japanese only)
//	U+FF66 - U+FF9F: half-width katakana (Japanese only)
var japaneseRegex = regexp.MustCompile(`[\x{3040}-\x{30ff}\x{ff66}-\x{ff9f}]`)

// containsJapanese reports whether text holds at least one Japanese-specific
// character (hiragana or katakana). CJK ideographs alone do not count, since
// they are not specific to Japanese.
func containsJapanese(text string) bool {
	if text == "" {
		// Nothing to scan; an empty string can never match the character class.
		return false
	}
	return japaneseRegex.MatchString(text)
}
+23
search/japanese_test.go
··· 1 + package search 2 + 3 + import ( 4 + "testing" 5 + 6 + "github.com/stretchr/testify/assert" 7 + ) 8 + 9 + func TestJapaneseDetection(t *testing.T) { 10 + assert := assert.New(t) 11 + 12 + assert.False(containsJapanese("")) 13 + assert.False(containsJapanese("basic english")) 14 + assert.False(containsJapanese("basic english")) 15 + 16 + assert.True(containsJapanese("学校から帰って熱いお風呂に入ったら力一杯がんばる")) 17 + assert.True(containsJapanese("パリ")) 18 + assert.True(containsJapanese("ハリー・ポッター")) 19 + assert.True(containsJapanese("some japanese パリ and some english")) 20 + 21 + // CJK, but not japanese-specific 22 + assert.False(containsJapanese("熱力学")) 23 + }
+5 -1
search/query.go
··· 64 64 return nil, err 65 65 } 66 66 queryStr, filters := ParseQuery(ctx, dir, q) 67 + idx := "everything" 68 + if containsJapanese(queryStr) { 69 + idx = "everything_ja" 70 + } 67 71 basic := map[string]interface{}{ 68 72 "simple_query_string": map[string]interface{}{ 69 73 "query": queryStr, 70 - "fields": []string{"everything"}, 74 + "fields": []string{idx}, 71 75 "flags": "AND|NOT|OR|PHRASE|PRECEDENCE|WHITESPACE", 72 76 "default_operator": "and", 73 77 "lenient": true,
+200
search/query_test.go
··· 1 + package search 2 + 3 + import ( 4 + "context" 5 + "crypto/tls" 6 + "io" 7 + "log/slog" 8 + "net/http" 9 + "testing" 10 + 11 + appbsky "github.com/bluesky-social/indigo/api/bsky" 12 + "github.com/bluesky-social/indigo/atproto/identity" 13 + "github.com/bluesky-social/indigo/atproto/syntax" 14 + 15 + "github.com/ipfs/go-cid" 16 + es "github.com/opensearch-project/opensearch-go/v2" 17 + "github.com/stretchr/testify/assert" 18 + "gorm.io/driver/sqlite" 19 + "gorm.io/gorm" 20 + ) 21 + 22 + var ( 23 + testPostIndex = "palomar_test_post" 24 + testProfileIndex = "palomar_test_profile" 25 + ) 26 + 27 + func testEsClient(t *testing.T) *es.Client { 28 + cfg := es.Config{ 29 + Addresses: []string{"http://localhost:9200"}, 30 + Username: "admin", 31 + Password: "0penSearch-Pal0mar", 32 + CACert: nil, 33 + Transport: &http.Transport{ 34 + MaxIdleConnsPerHost: 5, 35 + TLSClientConfig: &tls.Config{ 36 + InsecureSkipVerify: true, 37 + }, 38 + }, 39 + } 40 + escli, err := es.NewClient(cfg) 41 + if err != nil { 42 + t.Fatal(err) 43 + } 44 + info, err := escli.Info() 45 + if err != nil { 46 + t.Fatal(err) 47 + } 48 + info.Body.Close() 49 + return escli 50 + 51 + } 52 + 53 + func testServer(ctx context.Context, t *testing.T, escli *es.Client, dir identity.Directory) *Server { 54 + db, err := gorm.Open(sqlite.Open("file::memory:?cache=shared"), &gorm.Config{}) 55 + if err != nil { 56 + t.Fatal(err) 57 + } 58 + 59 + srv, err := NewServer( 60 + db, 61 + escli, 62 + dir, 63 + Config{ 64 + RelayHost: "wss://relay.invalid", 65 + PostIndex: testPostIndex, 66 + ProfileIndex: testProfileIndex, 67 + Logger: slog.Default(), 68 + RelaySyncRateLimit: 1, 69 + IndexMaxConcurrency: 1, 70 + }, 71 + ) 72 + if err != nil { 73 + t.Fatal(err) 74 + } 75 + 76 + // NOTE: skipping errors 77 + resp, _ := srv.escli.Indices.Delete([]string{testPostIndex, testProfileIndex}) 78 + defer resp.Body.Close() 79 + io.ReadAll(resp.Body) 80 + 81 + if err := srv.EnsureIndices(ctx); err != nil { 82 + t.Fatal(err) 83 
+ } 84 + 85 + return srv 86 + } 87 + 88 + func TestJapaneseRegressions(t *testing.T) { 89 + assert := assert.New(t) 90 + ctx := context.Background() 91 + escli := testEsClient(t) 92 + dir := identity.NewMockDirectory() 93 + srv := testServer(ctx, t, escli, &dir) 94 + ident := identity.Identity{ 95 + DID: syntax.DID("did:plc:abc111"), 96 + Handle: syntax.Handle("handle.example.com"), 97 + } 98 + 99 + res, err := DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20) 100 + if err != nil { 101 + t.Fatal(err) 102 + } 103 + assert.Equal(0, len(res.Hits.Hits)) 104 + 105 + p1 := appbsky.FeedPost{Text: "basic english post", CreatedAt: "2024-01-02T03:04:05.006Z"} 106 + assert.NoError(srv.indexPost(ctx, &ident, &p1, "app.bsky.feed.post/3kpnillluoh2y", cid.Undef)) 107 + 108 + // https://github.com/bluesky-social/indigo/issues/302 109 + p2 := appbsky.FeedPost{Text: "学校から帰って熱いお風呂に入ったら力一杯がんばる", CreatedAt: "2024-01-02T03:04:05.006Z"} 110 + assert.NoError(srv.indexPost(ctx, &ident, &p2, "app.bsky.feed.post/3kpnillluo222", cid.Undef)) 111 + p3 := appbsky.FeedPost{Text: "熱力学", CreatedAt: "2024-01-02T03:04:05.006Z"} 112 + assert.NoError(srv.indexPost(ctx, &ident, &p3, "app.bsky.feed.post/3kpnillluo333", cid.Undef)) 113 + p4 := appbsky.FeedPost{Text: "東京都", CreatedAt: "2024-01-02T03:04:05.006Z"} 114 + assert.NoError(srv.indexPost(ctx, &ident, &p4, "app.bsky.feed.post/3kpnillluo444", cid.Undef)) 115 + p5 := appbsky.FeedPost{Text: "京都", CreatedAt: "2024-01-02T03:04:05.006Z"} 116 + assert.NoError(srv.indexPost(ctx, &ident, &p5, "app.bsky.feed.post/3kpnillluo555", cid.Undef)) 117 + p6 := appbsky.FeedPost{Text: "パリ", CreatedAt: "2024-01-02T03:04:05.006Z"} 118 + assert.NoError(srv.indexPost(ctx, &ident, &p6, "app.bsky.feed.post/3kpnillluo666", cid.Undef)) 119 + p7 := appbsky.FeedPost{Text: "ハリー・ポッター", CreatedAt: "2024-01-02T03:04:05.006Z"} 120 + assert.NoError(srv.indexPost(ctx, &ident, &p7, "app.bsky.feed.post/3kpnillluo777", cid.Undef)) 121 + p8 := appbsky.FeedPost{Text: "ハリ", 
CreatedAt: "2024-01-02T03:04:05.006Z"} 122 + assert.NoError(srv.indexPost(ctx, &ident, &p8, "app.bsky.feed.post/3kpnillluo223", cid.Undef)) 123 + p9 := appbsky.FeedPost{Text: "multilingual 多言語", CreatedAt: "2024-01-02T03:04:05.006Z"} 124 + assert.NoError(srv.indexPost(ctx, &ident, &p9, "app.bsky.feed.post/3kpnillluo224", cid.Undef)) 125 + 126 + _, err = srv.escli.Indices.Refresh() 127 + assert.NoError(err) 128 + 129 + // expect all to be indexed 130 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "*", 0, 20) 131 + if err != nil { 132 + t.Fatal(err) 133 + } 134 + assert.Equal(9, len(res.Hits.Hits)) 135 + 136 + // check that english matches (single post) 137 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20) 138 + if err != nil { 139 + t.Fatal(err) 140 + } 141 + assert.Equal(1, len(res.Hits.Hits)) 142 + 143 + // "thermodynamics"; should return only one match 144 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "熱力学", 0, 20) 145 + if err != nil { 146 + t.Fatal(err) 147 + } 148 + assert.Equal(1, len(res.Hits.Hits)) 149 + 150 + // "Kyoto"; should return only one match 151 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "京都", 0, 20) 152 + if err != nil { 153 + t.Fatal(err) 154 + } 155 + assert.Equal(1, len(res.Hits.Hits)) 156 + 157 + // "Paris"; should return only one match 158 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "パリ", 0, 20) 159 + if err != nil { 160 + t.Fatal(err) 161 + } 162 + assert.Equal(1, len(res.Hits.Hits)) 163 + 164 + // should return only one match 165 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハリー", 0, 20) 166 + if err != nil { 167 + t.Fatal(err) 168 + } 169 + assert.Equal(1, len(res.Hits.Hits)) 170 + 171 + // part of a word; should match none 172 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハ", 0, 20) 173 + if err != nil { 174 + t.Fatal(err) 175 + } 176 + assert.Equal(0, len(res.Hits.Hits)) 177 + 178 + // should match both ways, and together 
179 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual", 0, 20) 180 + if err != nil { 181 + t.Fatal(err) 182 + } 183 + assert.Equal(1, len(res.Hits.Hits)) 184 + 185 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "多言語", 0, 20) 186 + if err != nil { 187 + t.Fatal(err) 188 + } 189 + assert.Equal(1, len(res.Hits.Hits)) 190 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual 多言語", 0, 20) 191 + if err != nil { 192 + t.Fatal(err) 193 + } 194 + assert.Equal(1, len(res.Hits.Hits)) 195 + res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "\"multilingual 多言語\"", 0, 20) 196 + if err != nil { 197 + t.Fatal(err) 198 + } 199 + assert.Equal(1, len(res.Hits.Hits)) 200 + }
+57
search/testdata/transform-post-fixtures.json
··· 186 186 ], 187 187 "embed_img_count": 2 188 188 } 189 + }, 190 + { 191 + "did": "did:plc:u5cwb2mwiv2bfq53cjufe6yn", 192 + "handle": "handle.example.com", 193 + "rkey": "3k4duaz5vfs2b", 194 + "cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm", 195 + "PostRecord": { 196 + "$type": "app.bsky.feed.post", 197 + "text": "学校から帰って熱いお風呂に入ったら力一杯がんばる", 198 + "createdAt": "2023-08-07T05:46:14.423045Z", 199 + "embed": { 200 + "$type": "app.bsky.embed.images", 201 + "images": [ 202 + { 203 + "alt": "brief alt text description of the first image ハリー・ポッター", 204 + "image": { 205 + "$type": "blob", 206 + "ref": { 207 + "$link": "bafkreibabalobzn6cd366ukcsjycp4yymjymgfxcv6xczmlgpemzkz3cfa" 208 + }, 209 + "mimeType": "image/webp", 210 + "size": 760898 211 + } 212 + }, 213 + { 214 + "alt": "brief alt text description of the second image", 215 + "image": { 216 + "$type": "blob", 217 + "ref": { 218 + "$link": "bafkreif3fouono2i3fmm5moqypwskh3yjtp7snd5hfq5pr453oggygyrte" 219 + }, 220 + "mimeType": "image/png", 221 + "size": 13208 222 + } 223 + } 224 + ] 225 + } 226 + }, 227 + "doc_id": "did:plc:u5cwb2mwiv2bfq53cjufe6yn_3k4duaz5vfs2b", 228 + "PostDoc": { 229 + "doc_index_ts": "2006-01-02T15:04:05.000Z", 230 + "did": "did:plc:u5cwb2mwiv2bfq53cjufe6yn", 231 + "handle": "handle.example.com", 232 + "record_rkey": "3k4duaz5vfs2b", 233 + "record_cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm", 234 + "created_at": "2023-08-07T05:46:14.423045Z", 235 + "text": "学校から帰って熱いお風呂に入ったら力一杯がんばる", 236 + "text_ja": "学校から帰って熱いお風呂に入ったら力一杯がんばる", 237 + "embed_img_alt_text": [ 238 + "brief alt text description of the first image ハリー・ポッター", 239 + "brief alt text description of the second image" 240 + ], 241 + "embed_img_alt_text_ja": [ 242 + "brief alt text description of the first image ハリー・ポッター" 243 + ], 244 + "embed_img_count": 2 245 + } 189 246 } 190 247 ]
+46 -35
search/transform.go
··· 28 28 } 29 29 30 30 type PostDoc struct { 31 - DocIndexTs string `json:"doc_index_ts"` 32 - DID string `json:"did"` 33 - RecordRkey string `json:"record_rkey"` 34 - RecordCID string `json:"record_cid"` 35 - CreatedAt *string `json:"created_at,omitempty"` 36 - Text string `json:"text"` 37 - LangCode []string `json:"lang_code,omitempty"` 38 - LangCodeIso2 []string `json:"lang_code_iso2,omitempty"` 39 - MentionDID []string `json:"mention_did,omitempty"` 40 - LinkURL []string `json:"link_url,omitempty"` 41 - EmbedURL *string `json:"embed_url,omitempty"` 42 - EmbedATURI *string `json:"embed_aturi,omitempty"` 43 - ReplyRootATURI *string `json:"reply_root_aturi,omitempty"` 44 - EmbedImgCount int `json:"embed_img_count"` 45 - EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"` 46 - SelfLabel []string `json:"self_label,omitempty"` 47 - Tag []string `json:"tag,omitempty"` 48 - Emoji []string `json:"emoji,omitempty"` 31 + DocIndexTs string `json:"doc_index_ts"` 32 + DID string `json:"did"` 33 + RecordRkey string `json:"record_rkey"` 34 + RecordCID string `json:"record_cid"` 35 + CreatedAt *string `json:"created_at,omitempty"` 36 + Text string `json:"text"` 37 + TextJA string `json:"text_ja,omitempty"` 38 + LangCode []string `json:"lang_code,omitempty"` 39 + LangCodeIso2 []string `json:"lang_code_iso2,omitempty"` 40 + MentionDID []string `json:"mention_did,omitempty"` 41 + LinkURL []string `json:"link_url,omitempty"` 42 + EmbedURL *string `json:"embed_url,omitempty"` 43 + EmbedATURI *string `json:"embed_aturi,omitempty"` 44 + ReplyRootATURI *string `json:"reply_root_aturi,omitempty"` 45 + EmbedImgCount int `json:"embed_img_count"` 46 + EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"` 47 + EmbedImgAltTextJA []string `json:"embed_img_alt_text_ja,omitempty"` 48 + SelfLabel []string `json:"self_label,omitempty"` 49 + Tag []string `json:"tag,omitempty"` 50 + Emoji []string `json:"emoji,omitempty"` 49 51 } 50 52 51 53 // Returns the search index document 
ID (`_id`) for this document. ··· 143 145 } 144 146 var embedImgCount int = 0 145 147 var embedImgAltText []string 148 + var embedImgAltTextJA []string 146 149 if post.Embed != nil && post.Embed.EmbedImages != nil { 147 150 embedImgCount = len(post.Embed.EmbedImages.Images) 148 151 for _, img := range post.Embed.EmbedImages.Images { 149 152 if img.Alt != "" { 150 153 embedImgAltText = append(embedImgAltText, img.Alt) 154 + if containsJapanese(img.Alt) { 155 + embedImgAltTextJA = append(embedImgAltTextJA, img.Alt) 156 + } 151 157 } 152 158 } 153 159 } ··· 159 165 } 160 166 161 167 doc := PostDoc{ 162 - DocIndexTs: syntax.DatetimeNow().String(), 163 - DID: ident.DID.String(), 164 - RecordRkey: rkey, 165 - RecordCID: cid, 166 - Text: post.Text, 167 - LangCode: post.Langs, 168 - LangCodeIso2: langCodeIso2, 169 - MentionDID: mentionDIDs, 170 - LinkURL: linkURLs, 171 - EmbedURL: embedURL, 172 - EmbedATURI: embedATURI, 173 - ReplyRootATURI: replyRootATURI, 174 - EmbedImgCount: embedImgCount, 175 - EmbedImgAltText: embedImgAltText, 176 - SelfLabel: selfLabels, 177 - Tag: parsePostTags(post), 178 - Emoji: parseEmojis(post.Text), 168 + DocIndexTs: syntax.DatetimeNow().String(), 169 + DID: ident.DID.String(), 170 + RecordRkey: rkey, 171 + RecordCID: cid, 172 + Text: post.Text, 173 + LangCode: post.Langs, 174 + LangCodeIso2: langCodeIso2, 175 + MentionDID: mentionDIDs, 176 + LinkURL: linkURLs, 177 + EmbedURL: embedURL, 178 + EmbedATURI: embedATURI, 179 + ReplyRootATURI: replyRootATURI, 180 + EmbedImgCount: embedImgCount, 181 + EmbedImgAltText: embedImgAltText, 182 + EmbedImgAltTextJA: embedImgAltTextJA, 183 + SelfLabel: selfLabels, 184 + Tag: parsePostTags(post), 185 + Emoji: parseEmojis(post.Text), 186 + } 187 + 188 + if containsJapanese(post.Text) { 189 + doc.TextJA = post.Text 179 190 } 180 191 181 192 if post.CreatedAt != "" {