cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
// Package documents implements a Term Frequency-Inverse Document Frequency
// (TF-IDF) search model for notes.
package documents

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"time"
)

// DocKind identifies the kind of entry a Document represents.
type DocKind int64

const (
	NoteDoc DocKind = iota
	ArticleDoc
	MovieDoc
	BookDoc
	TVDoc
)

// Document is a single searchable entry: a note, article, movie, book, or TV show.
type Document struct {
	ID        int64
	Title     string
	Body      string
	CreatedAt time.Time
	DocKind   DocKind
}

// Posting records that a term occurs TF times in the document DocID.
type Posting struct {
	DocID int64
	TF    int
}

// Index is an inverted index over documents: Postings maps each term to the
// documents containing it, DocLengths records token counts per document, and
// NumDocs is the total number of indexed documents.
type Index struct {
	Postings   map[string][]Posting
	DocLengths map[int64]int
	NumDocs    int
}

// Result pairs a document ID with its relevance score for a query.
type Result struct {
	DocID int64
	Score float64
}

// Searchable is implemented by anything that answers ranked queries.
type Searchable interface {
	Search(query string, limit int) ([]Result, error)
}

// Tokenizer handles text tokenization and normalization.
type Tokenizer struct {
	pattern *regexp.Regexp
}

// NewTokenizer creates a new tokenizer with Unicode-aware word/number matching.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{
		pattern: regexp.MustCompile(`\p{L}+\p{M}*|\p{N}+`),
	}
}

// Tokenize splits text into normalized tokens (lowercase words and numbers).
func (t *Tokenizer) Tokenize(text string) []string {
	lowered := strings.ToLower(text)
	return t.pattern.FindAllString(lowered, -1)
}

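// Illustrative sketch (not part of the original file): the pattern keeps runs
// of letters (plus trailing combining marks) and runs of digits, dropping
// punctuation, so:
//
//	NewTokenizer().Tokenize("Watch Dune (1984), sci-fi!")
//	// => ["watch", "dune", "1984", "sci", "fi"]
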
// TokenFrequency computes a term frequency map for tokens.
func TokenFrequency(tokens []string) map[string]int {
	freq := make(map[string]int)
	for _, token := range tokens {
		freq[token]++
	}
	return freq
}

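// For example (illustrative):
//
//	TokenFrequency([]string{"to", "be", "or", "not", "to", "be"})
//	// => map[string]int{"to": 2, "be": 2, "or": 1, "not": 1}
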
// BuildIndex constructs a TF-IDF index from a collection of documents.
func BuildIndex(docs []Document) *Index {
	idx := &Index{
		Postings:   make(map[string][]Posting),
		DocLengths: make(map[int64]int),
		NumDocs:    0,
	}

	tokenizer := NewTokenizer()

	for _, doc := range docs {
		// Index the title together with the body so both are searchable.
		text := doc.Title + " " + doc.Body
		tokens := tokenizer.Tokenize(text)

		idx.NumDocs++
		idx.DocLengths[doc.ID] = len(tokens)

		freq := TokenFrequency(tokens)

		// Append one posting per distinct term in this document.
		for term, tf := range freq {
			idx.Postings[term] = append(idx.Postings[term], Posting{
				DocID: doc.ID,
				TF:    tf,
			})
		}
	}

	return idx
}

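// Sketch of the resulting index shape (hypothetical documents): indexing
// {ID: 1, Body: "buy milk"} and {ID: 2, Body: "milk tea"} would yield
//
//	idx.Postings["milk"] == []Posting{{DocID: 1, TF: 1}, {DocID: 2, TF: 1}}
//	idx.Postings["buy"]  == []Posting{{DocID: 1, TF: 1}}
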
// Search performs TF-IDF ranked search on the index.
func (idx *Index) Search(query string, limit int) ([]Result, error) {
	tokenizer := NewTokenizer()
	queryTokens := tokenizer.Tokenize(query)

	if len(queryTokens) == 0 {
		return []Result{}, nil
	}

	scores := make(map[int64]float64)

	for _, term := range queryTokens {
		postings, exists := idx.Postings[term]
		if !exists {
			continue
		}

		// idf = ln(NumDocs/df): rare terms weigh more, and a term that
		// appears in every document contributes nothing.
		df := len(postings)
		idf := math.Log(float64(idx.NumDocs) / float64(df))

		for _, posting := range postings {
			tf := float64(posting.TF)
			scores[posting.DocID] += tf * idf
		}
	}

	results := make([]Result, 0, len(scores))
	for docID, score := range scores {
		results = append(results, Result{
			DocID: docID,
			Score: score,
		})
	}

	// Sort by score descending, breaking ties by higher DocID for a
	// deterministic ordering.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].DocID > results[j].DocID
	})

	if limit > 0 && limit < len(results) {
		results = results[:limit]
	}

	return results, nil
}
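
// exampleSearch is a minimal usage sketch; the sample documents and query are
// hypothetical and not part of the original package.
func exampleSearch() []Result {
	docs := []Document{
		{ID: 1, Title: "Groceries", Body: "buy milk and eggs", CreatedAt: time.Now(), DocKind: NoteDoc},
		{ID: 2, Title: "Dune", Body: "a science fiction novel about spice", CreatedAt: time.Now(), DocKind: BookDoc},
	}
	idx := BuildIndex(docs)

	// Only document 1 contains "milk", so it is the sole (top) result.
	results, err := idx.Search("milk", 5)
	if err != nil {
		return nil
	}
	return results
}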