cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package documents
2
3import (
4 "testing"
5 "time"
6)
7
8func TestTokenizer(t *testing.T) {
9 tokenizer := NewTokenizer()
10
11 t.Run("Basic tokenization", func(t *testing.T) {
12 t.Run("tokenizes simple text", func(t *testing.T) {
13 tokens := tokenizer.Tokenize("Hello World")
14 if len(tokens) != 2 {
15 t.Fatalf("expected 2 tokens, got %d", len(tokens))
16 }
17 if tokens[0] != "hello" || tokens[1] != "world" {
18 t.Errorf("expected [hello world], got %v", tokens)
19 }
20 })
21
22 t.Run("lowercases all tokens", func(t *testing.T) {
23 tokens := tokenizer.Tokenize("UPPERCASE MiXeD lowercase")
24 if len(tokens) != 3 {
25 t.Fatalf("expected 3 tokens, got %d", len(tokens))
26 }
27 for _, token := range tokens {
28 if token != "uppercase" && token != "mixed" && token != "lowercase" {
29 t.Errorf("unexpected token: %s", token)
30 }
31 }
32 })
33
34 t.Run("handles punctuation", func(t *testing.T) {
35 tokens := tokenizer.Tokenize("Hello, world! How are you?")
36 expected := []string{"hello", "world", "how", "are", "you"}
37 if len(tokens) != len(expected) {
38 t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens))
39 }
40 for i, token := range tokens {
41 if token != expected[i] {
42 t.Errorf("token %d: expected %s, got %s", i, expected[i], token)
43 }
44 }
45 })
46 })
47
48 t.Run("Unicode support", func(t *testing.T) {
49 t.Run("tokenizes unicode characters", func(t *testing.T) {
50 tokens := tokenizer.Tokenize("caf茅 r茅sum茅 na茂ve")
51 if len(tokens) != 3 {
52 t.Fatalf("expected 3 tokens, got %d", len(tokens))
53 }
54 })
55
56 t.Run("handles emoji and special characters", func(t *testing.T) {
57 tokens := tokenizer.Tokenize("hello 馃榾 world")
58 if len(tokens) != 2 {
59 t.Fatalf("expected 2 tokens (emoji excluded), got %d", len(tokens))
60 }
61 if tokens[0] != "hello" || tokens[1] != "world" {
62 t.Errorf("expected [hello world], got %v", tokens)
63 }
64 })
65
66 t.Run("tokenizes CJK characters", func(t *testing.T) {
67 tokens := tokenizer.Tokenize("浣犲ソ 涓栫晫")
68 if len(tokens) != 2 {
69 t.Fatalf("expected 2 tokens, got %d", len(tokens))
70 }
71 })
72 })
73
74 t.Run("Numbers", func(t *testing.T) {
75 t.Run("tokenizes numbers", func(t *testing.T) {
76 tokens := tokenizer.Tokenize("test 123 456")
77 if len(tokens) != 3 {
78 t.Fatalf("expected 3 tokens, got %d", len(tokens))
79 }
80 if tokens[1] != "123" || tokens[2] != "456" {
81 t.Errorf("expected numbers to be tokenized, got %v", tokens)
82 }
83 })
84
85 t.Run("handles mixed alphanumeric", func(t *testing.T) {
86 tokens := tokenizer.Tokenize("version 2 released")
87 if len(tokens) != 3 {
88 t.Fatalf("expected 3 tokens, got %d", len(tokens))
89 }
90 })
91 })
92
93 t.Run("Edge cases", func(t *testing.T) {
94 t.Run("handles empty string", func(t *testing.T) {
95 tokens := tokenizer.Tokenize("")
96 if len(tokens) != 0 {
97 t.Errorf("expected 0 tokens for empty string, got %d", len(tokens))
98 }
99 })
100
101 t.Run("handles whitespace only", func(t *testing.T) {
102 tokens := tokenizer.Tokenize(" \t\n ")
103 if len(tokens) != 0 {
104 t.Errorf("expected 0 tokens for whitespace, got %d", len(tokens))
105 }
106 })
107
108 t.Run("handles punctuation only", func(t *testing.T) {
109 tokens := tokenizer.Tokenize("!@#$%^&*()")
110 if len(tokens) != 0 {
111 t.Errorf("expected 0 tokens for punctuation only, got %d", len(tokens))
112 }
113 })
114 })
115}
116
117func TestTokenFrequency(t *testing.T) {
118 t.Run("counts term frequencies", func(t *testing.T) {
119 tokens := []string{"hello", "world", "hello", "test"}
120 freq := TokenFrequency(tokens)
121
122 if freq["hello"] != 2 {
123 t.Errorf("expected hello frequency 2, got %d", freq["hello"])
124 }
125 if freq["world"] != 1 {
126 t.Errorf("expected world frequency 1, got %d", freq["world"])
127 }
128 if freq["test"] != 1 {
129 t.Errorf("expected test frequency 1, got %d", freq["test"])
130 }
131 })
132
133 t.Run("handles empty token list", func(t *testing.T) {
134 freq := TokenFrequency([]string{})
135 if len(freq) != 0 {
136 t.Errorf("expected empty frequency map, got %d entries", len(freq))
137 }
138 })
139
140 t.Run("handles single token", func(t *testing.T) {
141 freq := TokenFrequency([]string{"single"})
142 if freq["single"] != 1 {
143 t.Errorf("expected frequency 1, got %d", freq["single"])
144 }
145 })
146}
147
148func TestBuildIndex(t *testing.T) {
149 now := time.Now()
150
151 t.Run("builds index from documents", func(t *testing.T) {
152 docs := []Document{
153 {ID: 1, Title: "Go Programming", Body: "Go is a great language", CreatedAt: now, DocKind: int64(NoteDoc)},
154 {ID: 2, Title: "Python Guide", Body: "Python is versatile", CreatedAt: now, DocKind: int64(ArticleDoc)},
155 }
156
157 idx := BuildIndex(docs)
158
159 if idx.NumDocs != 2 {
160 t.Errorf("expected NumDocs 2, got %d", idx.NumDocs)
161 }
162
163 if len(idx.DocLengths) != 2 {
164 t.Errorf("expected 2 document lengths, got %d", len(idx.DocLengths))
165 }
166
167 if idx.DocLengths[1] <= 0 || idx.DocLengths[2] <= 0 {
168 t.Error("document lengths should be positive")
169 }
170
171 if _, exists := idx.Postings["go"]; !exists {
172 t.Error("expected 'go' to be in postings")
173 }
174 if _, exists := idx.Postings["python"]; !exists {
175 t.Error("expected 'python' to be in postings")
176 }
177 })
178
179 t.Run("handles empty document list", func(t *testing.T) {
180 idx := BuildIndex([]Document{})
181 if idx.NumDocs != 0 {
182 t.Errorf("expected NumDocs 0, got %d", idx.NumDocs)
183 }
184 if len(idx.Postings) != 0 {
185 t.Errorf("expected empty postings, got %d entries", len(idx.Postings))
186 }
187 })
188
189 t.Run("calculates term frequencies correctly", func(t *testing.T) {
190 docs := []Document{
191 {ID: 1, Title: "test", Body: "test test test", CreatedAt: now, DocKind: int64(NoteDoc)},
192 }
193
194 idx := BuildIndex(docs)
195
196 postings := idx.Postings["test"]
197 if len(postings) != 1 {
198 t.Fatalf("expected 1 posting for 'test', got %d", len(postings))
199 }
200
201 if postings[0].TF != 4 {
202 t.Errorf("expected TF 4 (title + 3 in body), got %d", postings[0].TF)
203 }
204 })
205
206 t.Run("builds postings for multiple documents with same term", func(t *testing.T) {
207 docs := []Document{
208 {ID: 1, Title: "Go", Body: "Go is great", CreatedAt: now, DocKind: int64(NoteDoc)},
209 {ID: 2, Title: "Go Tutorial", Body: "Learn Go", CreatedAt: now, DocKind: int64(NoteDoc)},
210 }
211
212 idx := BuildIndex(docs)
213
214 postings := idx.Postings["go"]
215 if len(postings) != 2 {
216 t.Fatalf("expected 2 postings for 'go', got %d", len(postings))
217 }
218 })
219}
220
221func TestIndexSearch(t *testing.T) {
222 now := time.Now()
223
224 t.Run("Search functionality", func(t *testing.T) {
225 t.Run("returns empty results for empty query", func(t *testing.T) {
226 docs := []Document{
227 {ID: 1, Title: "Test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
228 }
229 idx := BuildIndex(docs)
230
231 results, err := idx.Search("", 10)
232 if err != nil {
233 t.Fatalf("unexpected error: %v", err)
234 }
235 if len(results) != 0 {
236 t.Errorf("expected 0 results for empty query, got %d", len(results))
237 }
238 })
239
240 t.Run("finds matching documents", func(t *testing.T) {
241 docs := []Document{
242 {ID: 1, Title: "Go Programming", Body: "Learn Go language", CreatedAt: now, DocKind: int64(NoteDoc)},
243 {ID: 2, Title: "Python Guide", Body: "Python is versatile", CreatedAt: now, DocKind: int64(ArticleDoc)},
244 }
245 idx := BuildIndex(docs)
246
247 results, err := idx.Search("go", 10)
248 if err != nil {
249 t.Fatalf("unexpected error: %v", err)
250 }
251
252 if len(results) != 1 {
253 t.Fatalf("expected 1 result, got %d", len(results))
254 }
255
256 if results[0].DocID != 1 {
257 t.Errorf("expected DocID 1, got %d", results[0].DocID)
258 }
259
260 if results[0].Score <= 0 {
261 t.Error("expected positive score")
262 }
263 })
264
265 t.Run("ranks documents by relevance", func(t *testing.T) {
266 docs := []Document{
267 {ID: 1, Title: "Go", Body: "tutorial python rust", CreatedAt: now, DocKind: int64(NoteDoc)},
268 {ID: 2, Title: "Go Programming", Body: "advanced go tutorial", CreatedAt: now, DocKind: int64(NoteDoc)},
269 {ID: 3, Title: "Python", Body: "different language", CreatedAt: now, DocKind: int64(NoteDoc)},
270 }
271 idx := BuildIndex(docs)
272
273 results, err := idx.Search("go", 10)
274 if err != nil {
275 t.Fatalf("unexpected error: %v", err)
276 }
277
278 if len(results) != 2 {
279 t.Fatalf("expected 2 results, got %d", len(results))
280 }
281
282 if results[0].DocID != 2 {
283 t.Errorf("expected document 2 to rank higher (has more 'go' terms)")
284 }
285
286 if results[0].Score <= results[1].Score {
287 t.Errorf("expected first result to have higher score, got %f <= %f", results[0].Score, results[1].Score)
288 }
289 })
290
291 t.Run("respects limit parameter", func(t *testing.T) {
292 docs := []Document{
293 {ID: 1, Title: "test one", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
294 {ID: 2, Title: "test two", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
295 {ID: 3, Title: "test three", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
296 }
297 idx := BuildIndex(docs)
298
299 results, err := idx.Search("test", 2)
300 if err != nil {
301 t.Fatalf("unexpected error: %v", err)
302 }
303
304 if len(results) != 2 {
305 t.Errorf("expected 2 results with limit=2, got %d", len(results))
306 }
307 })
308
309 t.Run("handles multi-term queries", func(t *testing.T) {
310 docs := []Document{
311 {ID: 1, Title: "Go Programming", Body: "advanced tutorial", CreatedAt: now, DocKind: int64(NoteDoc)},
312 {ID: 2, Title: "Go Basics", Body: "beginner tutorial", CreatedAt: now, DocKind: int64(NoteDoc)},
313 {ID: 3, Title: "Python", Body: "different language", CreatedAt: now, DocKind: int64(NoteDoc)},
314 }
315 idx := BuildIndex(docs)
316
317 results, err := idx.Search("go tutorial", 10)
318 if err != nil {
319 t.Fatalf("unexpected error: %v", err)
320 }
321
322 if len(results) != 2 {
323 t.Errorf("expected 2 results, got %d", len(results))
324 }
325 })
326
327 t.Run("returns no results for non-matching query", func(t *testing.T) {
328 docs := []Document{
329 {ID: 1, Title: "Go", Body: "programming", CreatedAt: now, DocKind: int64(NoteDoc)},
330 }
331 idx := BuildIndex(docs)
332
333 results, err := idx.Search("rust", 10)
334 if err != nil {
335 t.Fatalf("unexpected error: %v", err)
336 }
337
338 if len(results) != 0 {
339 t.Errorf("expected 0 results for non-matching query, got %d", len(results))
340 }
341 })
342
343 t.Run("handles zero limit", func(t *testing.T) {
344 docs := []Document{
345 {ID: 1, Title: "test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
346 }
347 idx := BuildIndex(docs)
348
349 results, err := idx.Search("test", 0)
350 if err != nil {
351 t.Fatalf("unexpected error: %v", err)
352 }
353
354 if len(results) != 1 {
355 t.Errorf("expected all results with limit=0, got %d", len(results))
356 }
357 })
358
359 t.Run("tie-breaking uses DocID", func(t *testing.T) {
360 docs := []Document{
361 {ID: 1, Title: "test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
362 {ID: 2, Title: "test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)},
363 }
364 idx := BuildIndex(docs)
365
366 results, err := idx.Search("test", 10)
367 if err != nil {
368 t.Fatalf("unexpected error: %v", err)
369 }
370
371 if len(results) != 2 {
372 t.Fatalf("expected 2 results, got %d", len(results))
373 }
374
375 if results[0].DocID <= results[1].DocID {
376 t.Error("expected higher DocID first when scores are equal")
377 }
378 })
379 })
380}