cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
at main 380 lines 11 kB view raw
1package documents 2 3import ( 4 "testing" 5 "time" 6) 7 8func TestTokenizer(t *testing.T) { 9 tokenizer := NewTokenizer() 10 11 t.Run("Basic tokenization", func(t *testing.T) { 12 t.Run("tokenizes simple text", func(t *testing.T) { 13 tokens := tokenizer.Tokenize("Hello World") 14 if len(tokens) != 2 { 15 t.Fatalf("expected 2 tokens, got %d", len(tokens)) 16 } 17 if tokens[0] != "hello" || tokens[1] != "world" { 18 t.Errorf("expected [hello world], got %v", tokens) 19 } 20 }) 21 22 t.Run("lowercases all tokens", func(t *testing.T) { 23 tokens := tokenizer.Tokenize("UPPERCASE MiXeD lowercase") 24 if len(tokens) != 3 { 25 t.Fatalf("expected 3 tokens, got %d", len(tokens)) 26 } 27 for _, token := range tokens { 28 if token != "uppercase" && token != "mixed" && token != "lowercase" { 29 t.Errorf("unexpected token: %s", token) 30 } 31 } 32 }) 33 34 t.Run("handles punctuation", func(t *testing.T) { 35 tokens := tokenizer.Tokenize("Hello, world! How are you?") 36 expected := []string{"hello", "world", "how", "are", "you"} 37 if len(tokens) != len(expected) { 38 t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens)) 39 } 40 for i, token := range tokens { 41 if token != expected[i] { 42 t.Errorf("token %d: expected %s, got %s", i, expected[i], token) 43 } 44 } 45 }) 46 }) 47 48 t.Run("Unicode support", func(t *testing.T) { 49 t.Run("tokenizes unicode characters", func(t *testing.T) { 50 tokens := tokenizer.Tokenize("caf茅 r茅sum茅 na茂ve") 51 if len(tokens) != 3 { 52 t.Fatalf("expected 3 tokens, got %d", len(tokens)) 53 } 54 }) 55 56 t.Run("handles emoji and special characters", func(t *testing.T) { 57 tokens := tokenizer.Tokenize("hello 馃榾 world") 58 if len(tokens) != 2 { 59 t.Fatalf("expected 2 tokens (emoji excluded), got %d", len(tokens)) 60 } 61 if tokens[0] != "hello" || tokens[1] != "world" { 62 t.Errorf("expected [hello world], got %v", tokens) 63 } 64 }) 65 66 t.Run("tokenizes CJK characters", func(t *testing.T) { 67 tokens := tokenizer.Tokenize("浣犲ソ 涓栫晫") 
68 if len(tokens) != 2 { 69 t.Fatalf("expected 2 tokens, got %d", len(tokens)) 70 } 71 }) 72 }) 73 74 t.Run("Numbers", func(t *testing.T) { 75 t.Run("tokenizes numbers", func(t *testing.T) { 76 tokens := tokenizer.Tokenize("test 123 456") 77 if len(tokens) != 3 { 78 t.Fatalf("expected 3 tokens, got %d", len(tokens)) 79 } 80 if tokens[1] != "123" || tokens[2] != "456" { 81 t.Errorf("expected numbers to be tokenized, got %v", tokens) 82 } 83 }) 84 85 t.Run("handles mixed alphanumeric", func(t *testing.T) { 86 tokens := tokenizer.Tokenize("version 2 released") 87 if len(tokens) != 3 { 88 t.Fatalf("expected 3 tokens, got %d", len(tokens)) 89 } 90 }) 91 }) 92 93 t.Run("Edge cases", func(t *testing.T) { 94 t.Run("handles empty string", func(t *testing.T) { 95 tokens := tokenizer.Tokenize("") 96 if len(tokens) != 0 { 97 t.Errorf("expected 0 tokens for empty string, got %d", len(tokens)) 98 } 99 }) 100 101 t.Run("handles whitespace only", func(t *testing.T) { 102 tokens := tokenizer.Tokenize(" \t\n ") 103 if len(tokens) != 0 { 104 t.Errorf("expected 0 tokens for whitespace, got %d", len(tokens)) 105 } 106 }) 107 108 t.Run("handles punctuation only", func(t *testing.T) { 109 tokens := tokenizer.Tokenize("!@#$%^&*()") 110 if len(tokens) != 0 { 111 t.Errorf("expected 0 tokens for punctuation only, got %d", len(tokens)) 112 } 113 }) 114 }) 115} 116 117func TestTokenFrequency(t *testing.T) { 118 t.Run("counts term frequencies", func(t *testing.T) { 119 tokens := []string{"hello", "world", "hello", "test"} 120 freq := TokenFrequency(tokens) 121 122 if freq["hello"] != 2 { 123 t.Errorf("expected hello frequency 2, got %d", freq["hello"]) 124 } 125 if freq["world"] != 1 { 126 t.Errorf("expected world frequency 1, got %d", freq["world"]) 127 } 128 if freq["test"] != 1 { 129 t.Errorf("expected test frequency 1, got %d", freq["test"]) 130 } 131 }) 132 133 t.Run("handles empty token list", func(t *testing.T) { 134 freq := TokenFrequency([]string{}) 135 if len(freq) != 0 { 136 
t.Errorf("expected empty frequency map, got %d entries", len(freq)) 137 } 138 }) 139 140 t.Run("handles single token", func(t *testing.T) { 141 freq := TokenFrequency([]string{"single"}) 142 if freq["single"] != 1 { 143 t.Errorf("expected frequency 1, got %d", freq["single"]) 144 } 145 }) 146} 147 148func TestBuildIndex(t *testing.T) { 149 now := time.Now() 150 151 t.Run("builds index from documents", func(t *testing.T) { 152 docs := []Document{ 153 {ID: 1, Title: "Go Programming", Body: "Go is a great language", CreatedAt: now, DocKind: int64(NoteDoc)}, 154 {ID: 2, Title: "Python Guide", Body: "Python is versatile", CreatedAt: now, DocKind: int64(ArticleDoc)}, 155 } 156 157 idx := BuildIndex(docs) 158 159 if idx.NumDocs != 2 { 160 t.Errorf("expected NumDocs 2, got %d", idx.NumDocs) 161 } 162 163 if len(idx.DocLengths) != 2 { 164 t.Errorf("expected 2 document lengths, got %d", len(idx.DocLengths)) 165 } 166 167 if idx.DocLengths[1] <= 0 || idx.DocLengths[2] <= 0 { 168 t.Error("document lengths should be positive") 169 } 170 171 if _, exists := idx.Postings["go"]; !exists { 172 t.Error("expected 'go' to be in postings") 173 } 174 if _, exists := idx.Postings["python"]; !exists { 175 t.Error("expected 'python' to be in postings") 176 } 177 }) 178 179 t.Run("handles empty document list", func(t *testing.T) { 180 idx := BuildIndex([]Document{}) 181 if idx.NumDocs != 0 { 182 t.Errorf("expected NumDocs 0, got %d", idx.NumDocs) 183 } 184 if len(idx.Postings) != 0 { 185 t.Errorf("expected empty postings, got %d entries", len(idx.Postings)) 186 } 187 }) 188 189 t.Run("calculates term frequencies correctly", func(t *testing.T) { 190 docs := []Document{ 191 {ID: 1, Title: "test", Body: "test test test", CreatedAt: now, DocKind: int64(NoteDoc)}, 192 } 193 194 idx := BuildIndex(docs) 195 196 postings := idx.Postings["test"] 197 if len(postings) != 1 { 198 t.Fatalf("expected 1 posting for 'test', got %d", len(postings)) 199 } 200 201 if postings[0].TF != 4 { 202 
t.Errorf("expected TF 4 (title + 3 in body), got %d", postings[0].TF) 203 } 204 }) 205 206 t.Run("builds postings for multiple documents with same term", func(t *testing.T) { 207 docs := []Document{ 208 {ID: 1, Title: "Go", Body: "Go is great", CreatedAt: now, DocKind: int64(NoteDoc)}, 209 {ID: 2, Title: "Go Tutorial", Body: "Learn Go", CreatedAt: now, DocKind: int64(NoteDoc)}, 210 } 211 212 idx := BuildIndex(docs) 213 214 postings := idx.Postings["go"] 215 if len(postings) != 2 { 216 t.Fatalf("expected 2 postings for 'go', got %d", len(postings)) 217 } 218 }) 219} 220 221func TestIndexSearch(t *testing.T) { 222 now := time.Now() 223 224 t.Run("Search functionality", func(t *testing.T) { 225 t.Run("returns empty results for empty query", func(t *testing.T) { 226 docs := []Document{ 227 {ID: 1, Title: "Test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 228 } 229 idx := BuildIndex(docs) 230 231 results, err := idx.Search("", 10) 232 if err != nil { 233 t.Fatalf("unexpected error: %v", err) 234 } 235 if len(results) != 0 { 236 t.Errorf("expected 0 results for empty query, got %d", len(results)) 237 } 238 }) 239 240 t.Run("finds matching documents", func(t *testing.T) { 241 docs := []Document{ 242 {ID: 1, Title: "Go Programming", Body: "Learn Go language", CreatedAt: now, DocKind: int64(NoteDoc)}, 243 {ID: 2, Title: "Python Guide", Body: "Python is versatile", CreatedAt: now, DocKind: int64(ArticleDoc)}, 244 } 245 idx := BuildIndex(docs) 246 247 results, err := idx.Search("go", 10) 248 if err != nil { 249 t.Fatalf("unexpected error: %v", err) 250 } 251 252 if len(results) != 1 { 253 t.Fatalf("expected 1 result, got %d", len(results)) 254 } 255 256 if results[0].DocID != 1 { 257 t.Errorf("expected DocID 1, got %d", results[0].DocID) 258 } 259 260 if results[0].Score <= 0 { 261 t.Error("expected positive score") 262 } 263 }) 264 265 t.Run("ranks documents by relevance", func(t *testing.T) { 266 docs := []Document{ 267 {ID: 1, Title: "Go", Body: "tutorial 
python rust", CreatedAt: now, DocKind: int64(NoteDoc)}, 268 {ID: 2, Title: "Go Programming", Body: "advanced go tutorial", CreatedAt: now, DocKind: int64(NoteDoc)}, 269 {ID: 3, Title: "Python", Body: "different language", CreatedAt: now, DocKind: int64(NoteDoc)}, 270 } 271 idx := BuildIndex(docs) 272 273 results, err := idx.Search("go", 10) 274 if err != nil { 275 t.Fatalf("unexpected error: %v", err) 276 } 277 278 if len(results) != 2 { 279 t.Fatalf("expected 2 results, got %d", len(results)) 280 } 281 282 if results[0].DocID != 2 { 283 t.Errorf("expected document 2 to rank higher (has more 'go' terms)") 284 } 285 286 if results[0].Score <= results[1].Score { 287 t.Errorf("expected first result to have higher score, got %f <= %f", results[0].Score, results[1].Score) 288 } 289 }) 290 291 t.Run("respects limit parameter", func(t *testing.T) { 292 docs := []Document{ 293 {ID: 1, Title: "test one", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 294 {ID: 2, Title: "test two", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 295 {ID: 3, Title: "test three", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 296 } 297 idx := BuildIndex(docs) 298 299 results, err := idx.Search("test", 2) 300 if err != nil { 301 t.Fatalf("unexpected error: %v", err) 302 } 303 304 if len(results) != 2 { 305 t.Errorf("expected 2 results with limit=2, got %d", len(results)) 306 } 307 }) 308 309 t.Run("handles multi-term queries", func(t *testing.T) { 310 docs := []Document{ 311 {ID: 1, Title: "Go Programming", Body: "advanced tutorial", CreatedAt: now, DocKind: int64(NoteDoc)}, 312 {ID: 2, Title: "Go Basics", Body: "beginner tutorial", CreatedAt: now, DocKind: int64(NoteDoc)}, 313 {ID: 3, Title: "Python", Body: "different language", CreatedAt: now, DocKind: int64(NoteDoc)}, 314 } 315 idx := BuildIndex(docs) 316 317 results, err := idx.Search("go tutorial", 10) 318 if err != nil { 319 t.Fatalf("unexpected error: %v", err) 320 } 321 322 if len(results) != 2 { 323 
t.Errorf("expected 2 results, got %d", len(results)) 324 } 325 }) 326 327 t.Run("returns no results for non-matching query", func(t *testing.T) { 328 docs := []Document{ 329 {ID: 1, Title: "Go", Body: "programming", CreatedAt: now, DocKind: int64(NoteDoc)}, 330 } 331 idx := BuildIndex(docs) 332 333 results, err := idx.Search("rust", 10) 334 if err != nil { 335 t.Fatalf("unexpected error: %v", err) 336 } 337 338 if len(results) != 0 { 339 t.Errorf("expected 0 results for non-matching query, got %d", len(results)) 340 } 341 }) 342 343 t.Run("handles zero limit", func(t *testing.T) { 344 docs := []Document{ 345 {ID: 1, Title: "test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 346 } 347 idx := BuildIndex(docs) 348 349 results, err := idx.Search("test", 0) 350 if err != nil { 351 t.Fatalf("unexpected error: %v", err) 352 } 353 354 if len(results) != 1 { 355 t.Errorf("expected all results with limit=0, got %d", len(results)) 356 } 357 }) 358 359 t.Run("tie-breaking uses DocID", func(t *testing.T) { 360 docs := []Document{ 361 {ID: 1, Title: "test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 362 {ID: 2, Title: "test", Body: "content", CreatedAt: now, DocKind: int64(NoteDoc)}, 363 } 364 idx := BuildIndex(docs) 365 366 results, err := idx.Search("test", 10) 367 if err != nil { 368 t.Fatalf("unexpected error: %v", err) 369 } 370 371 if len(results) != 2 { 372 t.Fatalf("expected 2 results, got %d", len(results)) 373 } 374 375 if results[0].DocID <= results[1].DocID { 376 t.Error("expected higher DocID first when scores are equal") 377 } 378 }) 379 }) 380}