cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍈
charm leaflet readability golang
// Package documents implements a Term Frequency-Inverse Document Frequency
// (TF-IDF) search model for notes.
package documents

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"time"
)

// DocKind identifies the type of content a Document holds.
type DocKind int64

const (
	NoteDoc DocKind = iota
	ArticleDoc
	MovieDoc
	BookDoc
	TVDoc
)

// Document is a single searchable item.
type Document struct {
	ID        int64
	Title     string
	Body      string
	CreatedAt time.Time
	Kind      DocKind
}

// Posting records how often a term occurs in one document.
type Posting struct {
	DocID int64
	TF    int
}

// Index is an inverted index over a document collection.
type Index struct {
	Postings   map[string][]Posting // term -> documents containing it
	DocLengths map[int64]int        // doc ID -> token count, for length normalization
	NumDocs    int
}

// Result is a single ranked search hit.
type Result struct {
	DocID int64
	Score float64
}

// Searchable is implemented by anything that can answer ranked queries.
type Searchable interface {
	Search(query string, limit int) ([]Result, error)
}

// Tokenizer handles text tokenization and normalization.
type Tokenizer struct {
	pattern *regexp.Regexp
}

// NewTokenizer creates a new tokenizer with Unicode-aware word/number
// matching. A word is a letter followed by any run of letters and combining
// marks, so scripts that interleave marks with letters tokenize as one word.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{
		pattern: regexp.MustCompile(`\p{L}[\p{L}\p{M}]*|\p{N}+`),
	}
}

// Tokenize splits text into normalized tokens (lowercase words and numbers).
func (t *Tokenizer) Tokenize(text string) []string {
	lowered := strings.ToLower(text)
	return t.pattern.FindAllString(lowered, -1)
}

// TokenFrequency computes a term frequency map for tokens.
func TokenFrequency(tokens []string) map[string]int {
	freq := make(map[string]int)
	for _, token := range tokens {
		freq[token]++
	}
	return freq
}

// BuildIndex constructs a TF-IDF index from a collection of documents.
func BuildIndex(docs []Document) *Index {
	idx := &Index{
		Postings:   make(map[string][]Posting),
		DocLengths: make(map[int64]int),
		NumDocs:    0,
	}

	tokenizer := NewTokenizer()

	for _, doc := range docs {
		text := doc.Title + " " + doc.Body
		tokens := tokenizer.Tokenize(text)

		idx.NumDocs++
		idx.DocLengths[doc.ID] = len(tokens)

		freq := TokenFrequency(tokens)

		for term, tf := range freq {
			idx.Postings[term] = append(idx.Postings[term], Posting{
				DocID: doc.ID,
				TF:    tf,
			})
		}
	}

	return idx
}

// Search performs TF-IDF ranked search on the index.
func (idx *Index) Search(query string, limit int) ([]Result, error) {
	tokenizer := NewTokenizer()
	queryTokens := tokenizer.Tokenize(query)

	if len(queryTokens) == 0 {
		return []Result{}, nil
	}

	scores := make(map[int64]float64)

	// A term repeated in the query is scored once per occurrence, which
	// weights it proportionally to its query frequency.
	for _, term := range queryTokens {
		postings, exists := idx.Postings[term]
		if !exists {
			continue
		}

		// IDF dampens common terms: a term that appears in every
		// document contributes nothing to the score.
		df := len(postings)
		idf := math.Log(float64(idx.NumDocs) / float64(df))

		for _, posting := range postings {
			// Normalize raw term frequency by document length so
			// long documents do not dominate the ranking. A posting
			// implies at least one token, so the length is nonzero.
			tf := float64(posting.TF) / float64(idx.DocLengths[posting.DocID])
			scores[posting.DocID] += tf * idf
		}
	}

	results := make([]Result, 0, len(scores))
	for docID, score := range scores {
		results = append(results, Result{
			DocID: docID,
			Score: score,
		})
	}

	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		// Break ties deterministically, newest (highest) ID first.
		return results[i].DocID > results[j].DocID
	})

	if limit > 0 && limit < len(results) {
		results = results[:limit]
	}

	return results, nil
}
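For reference, a minimal sketch of how the index might be used end to end. The sample documents, the `main` wrapper, and the import path are hypothetical; only `BuildIndex`, `Search`, and the types above come from the file itself.

package main

import (
	"fmt"
	"time"

	// Hypothetical import path for the documents package above.
	"example.com/leaflet/documents"
)

func main() {
	docs := []documents.Document{
		{ID: 1, Title: "Morning pages", Body: "notes about writing every morning", CreatedAt: time.Now(), Kind: documents.NoteDoc},
		{ID: 2, Title: "Reading list", Body: "books and articles to read this winter", CreatedAt: time.Now(), Kind: documents.BookDoc},
		{ID: 3, Title: "Writing tools", Body: "an article comparing note taking tools for writing", CreatedAt: time.Now(), Kind: documents.ArticleDoc},
	}

	idx := documents.BuildIndex(docs)

	// "writing" appears in docs 1 and 3; "tools" only in doc 3, so doc 3
	// should rank first thanks to the extra, rarer matching term.
	results, err := idx.Search("writing tools", 10)
	if err != nil {
		panic(err)
	}
	for _, r := range results {
		fmt.Printf("doc %d scored %.3f\n", r.DocID, r.Score)
	}
}

Exact scores depend on the corpus, but the ordering illustrates the model: rare terms carry a higher IDF weight, so documents matching them outrank documents that only match common terms.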