+14
search/japanese.go
+14
search/japanese.go
···
1
+
package search
2
+
3
+
import (
4
+
"regexp"
5
+
)
6
+
7
+
// japaneseRegex matches a single character that is specific to written
// Japanese:
//
//   - U+3040 - U+30FA and U+30FC - U+30FF: hiragana and katakana
//   - U+FF66 - U+FF9F: half-width katakana
//
// U+30FB (katakana middle dot) is deliberately left out: Unicode lists it as
// shared with Han, Bopomofo, Hangul and Yi text, so on its own it says nothing
// about the language. Its half-width counterpart U+FF65 is likewise outside
// the half-width range above.
var japaneseRegex = regexp.MustCompile(`[\x{3040}-\x{30fa}\x{30fc}-\x{30ff}\x{ff66}-\x{ff9f}]`)

// containsJapanese reports whether text contains at least one
// Japanese-specific character (hiragana or katakana, including the half-width
// forms). It does not trigger on CJK characters that are shared with other
// languages, so kanji-only input returns false.
func containsJapanese(text string) bool {
	return japaneseRegex.MatchString(text)
}
+23
search/japanese_test.go
+23
search/japanese_test.go
···
1
+
package search
2
+
3
+
import (
4
+
"testing"
5
+
6
+
"github.com/stretchr/testify/assert"
7
+
)
8
+
9
+
func TestJapaneseDetection(t *testing.T) {
10
+
assert := assert.New(t)
11
+
12
+
assert.False(containsJapanese(""))
13
+
assert.False(containsJapanese("basic english"))
14
+
assert.False(containsJapanese("basic english"))
15
+
16
+
assert.True(containsJapanese("学校から帰って熱いお風呂に入ったら力一杯がんばる"))
17
+
assert.True(containsJapanese("パリ"))
18
+
assert.True(containsJapanese("ハリー・ポッター"))
19
+
assert.True(containsJapanese("some japanese パリ and some english"))
20
+
21
+
// CJK, but not japanese-specific
22
+
assert.False(containsJapanese("熱力学"))
23
+
}
+5
-1
search/query.go
+5
-1
search/query.go
···
64
64
return nil, err
65
65
}
66
66
queryStr, filters := ParseQuery(ctx, dir, q)
67
+
idx := "everything"
68
+
if containsJapanese(queryStr) {
69
+
idx = "everything_ja"
70
+
}
67
71
basic := map[string]interface{}{
68
72
"simple_query_string": map[string]interface{}{
69
73
"query": queryStr,
70
-
"fields": []string{"everything"},
74
+
"fields": []string{idx},
71
75
"flags": "AND|NOT|OR|PHRASE|PRECEDENCE|WHITESPACE",
72
76
"default_operator": "and",
73
77
"lenient": true,
+200
search/query_test.go
+200
search/query_test.go
···
1
+
package search
2
+
3
+
import (
4
+
"context"
5
+
"crypto/tls"
6
+
"io"
7
+
"log/slog"
8
+
"net/http"
9
+
"testing"
10
+
11
+
appbsky "github.com/bluesky-social/indigo/api/bsky"
12
+
"github.com/bluesky-social/indigo/atproto/identity"
13
+
"github.com/bluesky-social/indigo/atproto/syntax"
14
+
15
+
"github.com/ipfs/go-cid"
16
+
es "github.com/opensearch-project/opensearch-go/v2"
17
+
"github.com/stretchr/testify/assert"
18
+
"gorm.io/driver/sqlite"
19
+
"gorm.io/gorm"
20
+
)
21
+
22
+
// testPostIndex is the name of the post index used by the tests in this file.
var testPostIndex = "palomar_test_post"

// testProfileIndex is the name of the profile index used by the tests in this
// file.
var testProfileIndex = "palomar_test_profile"
26
+
27
+
func testEsClient(t *testing.T) *es.Client {
28
+
cfg := es.Config{
29
+
Addresses: []string{"http://localhost:9200"},
30
+
Username: "admin",
31
+
Password: "0penSearch-Pal0mar",
32
+
CACert: nil,
33
+
Transport: &http.Transport{
34
+
MaxIdleConnsPerHost: 5,
35
+
TLSClientConfig: &tls.Config{
36
+
InsecureSkipVerify: true,
37
+
},
38
+
},
39
+
}
40
+
escli, err := es.NewClient(cfg)
41
+
if err != nil {
42
+
t.Fatal(err)
43
+
}
44
+
info, err := escli.Info()
45
+
if err != nil {
46
+
t.Fatal(err)
47
+
}
48
+
info.Body.Close()
49
+
return escli
50
+
51
+
}
52
+
53
+
func testServer(ctx context.Context, t *testing.T, escli *es.Client, dir identity.Directory) *Server {
54
+
db, err := gorm.Open(sqlite.Open("file::memory:?cache=shared"), &gorm.Config{})
55
+
if err != nil {
56
+
t.Fatal(err)
57
+
}
58
+
59
+
srv, err := NewServer(
60
+
db,
61
+
escli,
62
+
dir,
63
+
Config{
64
+
RelayHost: "wss://relay.invalid",
65
+
PostIndex: testPostIndex,
66
+
ProfileIndex: testProfileIndex,
67
+
Logger: slog.Default(),
68
+
RelaySyncRateLimit: 1,
69
+
IndexMaxConcurrency: 1,
70
+
},
71
+
)
72
+
if err != nil {
73
+
t.Fatal(err)
74
+
}
75
+
76
+
// NOTE: skipping errors
77
+
resp, _ := srv.escli.Indices.Delete([]string{testPostIndex, testProfileIndex})
78
+
defer resp.Body.Close()
79
+
io.ReadAll(resp.Body)
80
+
81
+
if err := srv.EnsureIndices(ctx); err != nil {
82
+
t.Fatal(err)
83
+
}
84
+
85
+
return srv
86
+
}
87
+
88
+
func TestJapaneseRegressions(t *testing.T) {
89
+
assert := assert.New(t)
90
+
ctx := context.Background()
91
+
escli := testEsClient(t)
92
+
dir := identity.NewMockDirectory()
93
+
srv := testServer(ctx, t, escli, &dir)
94
+
ident := identity.Identity{
95
+
DID: syntax.DID("did:plc:abc111"),
96
+
Handle: syntax.Handle("handle.example.com"),
97
+
}
98
+
99
+
res, err := DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20)
100
+
if err != nil {
101
+
t.Fatal(err)
102
+
}
103
+
assert.Equal(0, len(res.Hits.Hits))
104
+
105
+
p1 := appbsky.FeedPost{Text: "basic english post", CreatedAt: "2024-01-02T03:04:05.006Z"}
106
+
assert.NoError(srv.indexPost(ctx, &ident, &p1, "app.bsky.feed.post/3kpnillluoh2y", cid.Undef))
107
+
108
+
// https://github.com/bluesky-social/indigo/issues/302
109
+
p2 := appbsky.FeedPost{Text: "学校から帰って熱いお風呂に入ったら力一杯がんばる", CreatedAt: "2024-01-02T03:04:05.006Z"}
110
+
assert.NoError(srv.indexPost(ctx, &ident, &p2, "app.bsky.feed.post/3kpnillluo222", cid.Undef))
111
+
p3 := appbsky.FeedPost{Text: "熱力学", CreatedAt: "2024-01-02T03:04:05.006Z"}
112
+
assert.NoError(srv.indexPost(ctx, &ident, &p3, "app.bsky.feed.post/3kpnillluo333", cid.Undef))
113
+
p4 := appbsky.FeedPost{Text: "東京都", CreatedAt: "2024-01-02T03:04:05.006Z"}
114
+
assert.NoError(srv.indexPost(ctx, &ident, &p4, "app.bsky.feed.post/3kpnillluo444", cid.Undef))
115
+
p5 := appbsky.FeedPost{Text: "京都", CreatedAt: "2024-01-02T03:04:05.006Z"}
116
+
assert.NoError(srv.indexPost(ctx, &ident, &p5, "app.bsky.feed.post/3kpnillluo555", cid.Undef))
117
+
p6 := appbsky.FeedPost{Text: "パリ", CreatedAt: "2024-01-02T03:04:05.006Z"}
118
+
assert.NoError(srv.indexPost(ctx, &ident, &p6, "app.bsky.feed.post/3kpnillluo666", cid.Undef))
119
+
p7 := appbsky.FeedPost{Text: "ハリー・ポッター", CreatedAt: "2024-01-02T03:04:05.006Z"}
120
+
assert.NoError(srv.indexPost(ctx, &ident, &p7, "app.bsky.feed.post/3kpnillluo777", cid.Undef))
121
+
p8 := appbsky.FeedPost{Text: "ハリ", CreatedAt: "2024-01-02T03:04:05.006Z"}
122
+
assert.NoError(srv.indexPost(ctx, &ident, &p8, "app.bsky.feed.post/3kpnillluo223", cid.Undef))
123
+
p9 := appbsky.FeedPost{Text: "multilingual 多言語", CreatedAt: "2024-01-02T03:04:05.006Z"}
124
+
assert.NoError(srv.indexPost(ctx, &ident, &p9, "app.bsky.feed.post/3kpnillluo224", cid.Undef))
125
+
126
+
_, err = srv.escli.Indices.Refresh()
127
+
assert.NoError(err)
128
+
129
+
// expect all to be indexed
130
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "*", 0, 20)
131
+
if err != nil {
132
+
t.Fatal(err)
133
+
}
134
+
assert.Equal(9, len(res.Hits.Hits))
135
+
136
+
// check that english matches (single post)
137
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20)
138
+
if err != nil {
139
+
t.Fatal(err)
140
+
}
141
+
assert.Equal(1, len(res.Hits.Hits))
142
+
143
+
// "thermodynamics"; should return only one match
144
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "熱力学", 0, 20)
145
+
if err != nil {
146
+
t.Fatal(err)
147
+
}
148
+
assert.Equal(1, len(res.Hits.Hits))
149
+
150
+
// "Kyoto"; should return only one match
151
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "京都", 0, 20)
152
+
if err != nil {
153
+
t.Fatal(err)
154
+
}
155
+
assert.Equal(1, len(res.Hits.Hits))
156
+
157
+
// "Paris"; should return only one match
158
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "パリ", 0, 20)
159
+
if err != nil {
160
+
t.Fatal(err)
161
+
}
162
+
assert.Equal(1, len(res.Hits.Hits))
163
+
164
+
// should return only one match
165
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハリー", 0, 20)
166
+
if err != nil {
167
+
t.Fatal(err)
168
+
}
169
+
assert.Equal(1, len(res.Hits.Hits))
170
+
171
+
// part of a word; should match none
172
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハ", 0, 20)
173
+
if err != nil {
174
+
t.Fatal(err)
175
+
}
176
+
assert.Equal(0, len(res.Hits.Hits))
177
+
178
+
// should match both ways, and together
179
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual", 0, 20)
180
+
if err != nil {
181
+
t.Fatal(err)
182
+
}
183
+
assert.Equal(1, len(res.Hits.Hits))
184
+
185
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "多言語", 0, 20)
186
+
if err != nil {
187
+
t.Fatal(err)
188
+
}
189
+
assert.Equal(1, len(res.Hits.Hits))
190
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual 多言語", 0, 20)
191
+
if err != nil {
192
+
t.Fatal(err)
193
+
}
194
+
assert.Equal(1, len(res.Hits.Hits))
195
+
res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "\"multilingual 多言語\"", 0, 20)
196
+
if err != nil {
197
+
t.Fatal(err)
198
+
}
199
+
assert.Equal(1, len(res.Hits.Hits))
200
+
}
+57
search/testdata/transform-post-fixtures.json
+57
search/testdata/transform-post-fixtures.json
···
186
186
],
187
187
"embed_img_count": 2
188
188
}
189
+
},
190
+
{
191
+
"did": "did:plc:u5cwb2mwiv2bfq53cjufe6yn",
192
+
"handle": "handle.example.com",
193
+
"rkey": "3k4duaz5vfs2b",
194
+
"cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm",
195
+
"PostRecord": {
196
+
"$type": "app.bsky.feed.post",
197
+
"text": "学校から帰って熱いお風呂に入ったら力一杯がんばる",
198
+
"createdAt": "2023-08-07T05:46:14.423045Z",
199
+
"embed": {
200
+
"$type": "app.bsky.embed.images",
201
+
"images": [
202
+
{
203
+
"alt": "brief alt text description of the first image ハリー・ポッター",
204
+
"image": {
205
+
"$type": "blob",
206
+
"ref": {
207
+
"$link": "bafkreibabalobzn6cd366ukcsjycp4yymjymgfxcv6xczmlgpemzkz3cfa"
208
+
},
209
+
"mimeType": "image/webp",
210
+
"size": 760898
211
+
}
212
+
},
213
+
{
214
+
"alt": "brief alt text description of the second image",
215
+
"image": {
216
+
"$type": "blob",
217
+
"ref": {
218
+
"$link": "bafkreif3fouono2i3fmm5moqypwskh3yjtp7snd5hfq5pr453oggygyrte"
219
+
},
220
+
"mimeType": "image/png",
221
+
"size": 13208
222
+
}
223
+
}
224
+
]
225
+
}
226
+
},
227
+
"doc_id": "did:plc:u5cwb2mwiv2bfq53cjufe6yn_3k4duaz5vfs2b",
228
+
"PostDoc": {
229
+
"doc_index_ts": "2006-01-02T15:04:05.000Z",
230
+
"did": "did:plc:u5cwb2mwiv2bfq53cjufe6yn",
231
+
"handle": "handle.example.com",
232
+
"record_rkey": "3k4duaz5vfs2b",
233
+
"record_cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm",
234
+
"created_at": "2023-08-07T05:46:14.423045Z",
235
+
"text": "学校から帰って熱いお風呂に入ったら力一杯がんばる",
236
+
"text_ja": "学校から帰って熱いお風呂に入ったら力一杯がんばる",
237
+
"embed_img_alt_text": [
238
+
"brief alt text description of the first image ハリー・ポッター",
239
+
"brief alt text description of the second image"
240
+
],
241
+
"embed_img_alt_text_ja": [
242
+
"brief alt text description of the first image ハリー・ポッター"
243
+
],
244
+
"embed_img_count": 2
245
+
}
189
246
}
190
247
]
+46
-35
search/transform.go
+46
-35
search/transform.go
···
28
28
}
29
29
30
30
// PostDoc is the JSON-serializable document built for a single post; the
// struct tags give the key each field is written under, and fields tagged
// omitempty are left out of the JSON when empty.
// NOTE(review): presumably this is the document stored in the post search
// index — confirm against the indexing code.
type PostDoc struct {
31
-
DocIndexTs string `json:"doc_index_ts"`
32
-
DID string `json:"did"`
33
-
RecordRkey string `json:"record_rkey"`
34
-
RecordCID string `json:"record_cid"`
35
-
CreatedAt *string `json:"created_at,omitempty"`
36
-
Text string `json:"text"`
37
-
LangCode []string `json:"lang_code,omitempty"`
38
-
LangCodeIso2 []string `json:"lang_code_iso2,omitempty"`
39
-
MentionDID []string `json:"mention_did,omitempty"`
40
-
LinkURL []string `json:"link_url,omitempty"`
41
-
EmbedURL *string `json:"embed_url,omitempty"`
42
-
EmbedATURI *string `json:"embed_aturi,omitempty"`
43
-
ReplyRootATURI *string `json:"reply_root_aturi,omitempty"`
44
-
EmbedImgCount int `json:"embed_img_count"`
45
-
EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"`
46
-
SelfLabel []string `json:"self_label,omitempty"`
47
-
Tag []string `json:"tag,omitempty"`
48
-
Emoji []string `json:"emoji,omitempty"`
31
+
DocIndexTs string `json:"doc_index_ts"`
32
+
DID string `json:"did"`
33
+
RecordRkey string `json:"record_rkey"`
34
+
RecordCID string `json:"record_cid"`
35
+
CreatedAt *string `json:"created_at,omitempty"`
36
+
Text string `json:"text"`
37
+
// TextJA is a copy of the post text, set only when that text contains
// Japanese-specific characters (per containsJapanese); otherwise it is empty
// and omitted from the JSON.
TextJA string `json:"text_ja,omitempty"`
38
+
LangCode []string `json:"lang_code,omitempty"`
39
+
LangCodeIso2 []string `json:"lang_code_iso2,omitempty"`
40
+
MentionDID []string `json:"mention_did,omitempty"`
41
+
LinkURL []string `json:"link_url,omitempty"`
42
+
EmbedURL *string `json:"embed_url,omitempty"`
43
+
EmbedATURI *string `json:"embed_aturi,omitempty"`
44
+
ReplyRootATURI *string `json:"reply_root_aturi,omitempty"`
45
+
EmbedImgCount int `json:"embed_img_count"`
46
+
EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"`
47
+
// EmbedImgAltTextJA holds the entries of EmbedImgAltText that contain
// Japanese-specific characters (per containsJapanese).
EmbedImgAltTextJA []string `json:"embed_img_alt_text_ja,omitempty"`
48
+
SelfLabel []string `json:"self_label,omitempty"`
49
+
Tag []string `json:"tag,omitempty"`
50
+
Emoji []string `json:"emoji,omitempty"`
49
51
}
50
52
51
53
// Returns the search index document ID (`_id`) for this document.
···
143
145
}
144
146
var embedImgCount int = 0
145
147
var embedImgAltText []string
148
+
var embedImgAltTextJA []string
146
149
if post.Embed != nil && post.Embed.EmbedImages != nil {
147
150
embedImgCount = len(post.Embed.EmbedImages.Images)
148
151
for _, img := range post.Embed.EmbedImages.Images {
149
152
if img.Alt != "" {
150
153
embedImgAltText = append(embedImgAltText, img.Alt)
154
+
if containsJapanese(img.Alt) {
155
+
embedImgAltTextJA = append(embedImgAltTextJA, img.Alt)
156
+
}
151
157
}
152
158
}
153
159
}
···
159
165
}
160
166
161
167
doc := PostDoc{
162
-
DocIndexTs: syntax.DatetimeNow().String(),
163
-
DID: ident.DID.String(),
164
-
RecordRkey: rkey,
165
-
RecordCID: cid,
166
-
Text: post.Text,
167
-
LangCode: post.Langs,
168
-
LangCodeIso2: langCodeIso2,
169
-
MentionDID: mentionDIDs,
170
-
LinkURL: linkURLs,
171
-
EmbedURL: embedURL,
172
-
EmbedATURI: embedATURI,
173
-
ReplyRootATURI: replyRootATURI,
174
-
EmbedImgCount: embedImgCount,
175
-
EmbedImgAltText: embedImgAltText,
176
-
SelfLabel: selfLabels,
177
-
Tag: parsePostTags(post),
178
-
Emoji: parseEmojis(post.Text),
168
+
DocIndexTs: syntax.DatetimeNow().String(),
169
+
DID: ident.DID.String(),
170
+
RecordRkey: rkey,
171
+
RecordCID: cid,
172
+
Text: post.Text,
173
+
LangCode: post.Langs,
174
+
LangCodeIso2: langCodeIso2,
175
+
MentionDID: mentionDIDs,
176
+
LinkURL: linkURLs,
177
+
EmbedURL: embedURL,
178
+
EmbedATURI: embedATURI,
179
+
ReplyRootATURI: replyRootATURI,
180
+
EmbedImgCount: embedImgCount,
181
+
EmbedImgAltText: embedImgAltText,
182
+
EmbedImgAltTextJA: embedImgAltTextJA,
183
+
SelfLabel: selfLabels,
184
+
Tag: parsePostTags(post),
185
+
Emoji: parseEmojis(post.Text),
186
+
}
187
+
188
+
if containsJapanese(post.Text) {
189
+
doc.TextJA = post.Text
179
190
}
180
191
181
192
if post.CreatedAt != "" {