cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
// Package documents implements a Term Frequency-Inverse Document Frequency
// (TF-IDF) search model for notes.
package documents

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"time"
)

// DocKind identifies the kind of entry a Document represents.
type DocKind int64

const (
	NoteDoc DocKind = iota
	ArticleDoc
	MovieDoc
	BookDoc
	TVDoc
)

// Document is a single searchable entry: a note, article, movie, book, or TV show.
type Document struct {
	ID        int64
	Title     string
	Body      string
	CreatedAt time.Time
	DocKind   DocKind
}

// Posting records that a term occurs TF times in the document DocID.
type Posting struct {
	DocID int64
	TF    int
}

// Index is an inverted index over documents: Postings maps each term to the
// documents containing it, DocLengths records token counts per document, and
// NumDocs is the total number of indexed documents.
type Index struct {
	Postings   map[string][]Posting
	DocLengths map[int64]int
	NumDocs    int
}

// Result pairs a document ID with its relevance score for a query.
type Result struct {
	DocID int64
	Score float64
}

// Searchable is implemented by anything that answers ranked queries.
type Searchable interface {
	Search(query string, limit int) ([]Result, error)
}

// Tokenizer handles text tokenization and normalization.
type Tokenizer struct {
	pattern *regexp.Regexp
}

// NewTokenizer creates a new tokenizer with Unicode-aware word/number matching.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{
		pattern: regexp.MustCompile(`\p{L}+\p{M}*|\p{N}+`),
	}
}

// Tokenize splits text into normalized tokens (lowercase words and numbers).
func (t *Tokenizer) Tokenize(text string) []string {
	lowered := strings.ToLower(text)
	return t.pattern.FindAllString(lowered, -1)
}

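// Illustrative sketch (not part of the original file): the pattern keeps runs
// of letters (plus trailing combining marks) and runs of digits, dropping
// punctuation, so:
//
//	NewTokenizer().Tokenize("Watch Dune (1984), sci-fi!")
//	// => ["watch", "dune", "1984", "sci", "fi"]
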
// TokenFrequency computes a term frequency map for tokens.
func TokenFrequency(tokens []string) map[string]int {
	freq := make(map[string]int)
	for _, token := range tokens {
		freq[token]++
	}
	return freq
}

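// For example (illustrative):
//
//	TokenFrequency([]string{"to", "be", "or", "not", "to", "be"})
//	// => map[string]int{"to": 2, "be": 2, "or": 1, "not": 1}
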
// BuildIndex constructs a TF-IDF index from a collection of documents.
func BuildIndex(docs []Document) *Index {
	idx := &Index{
		Postings:   make(map[string][]Posting),
		DocLengths: make(map[int64]int),
		NumDocs:    0,
	}

	tokenizer := NewTokenizer()

	for _, doc := range docs {
		// Index the title together with the body so both are searchable.
		text := doc.Title + " " + doc.Body
		tokens := tokenizer.Tokenize(text)

		idx.NumDocs++
		idx.DocLengths[doc.ID] = len(tokens)

		freq := TokenFrequency(tokens)

		// Append one posting per distinct term in this document.
		for term, tf := range freq {
			idx.Postings[term] = append(idx.Postings[term], Posting{
				DocID: doc.ID,
				TF:    tf,
			})
		}
	}

	return idx
}

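// Sketch of the resulting index shape (hypothetical documents): indexing
// {ID: 1, Body: "buy milk"} and {ID: 2, Body: "milk tea"} would yield
//
//	idx.Postings["milk"] == []Posting{{DocID: 1, TF: 1}, {DocID: 2, TF: 1}}
//	idx.Postings["buy"]  == []Posting{{DocID: 1, TF: 1}}
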
// Search performs TF-IDF ranked search on the index.
func (idx *Index) Search(query string, limit int) ([]Result, error) {
	tokenizer := NewTokenizer()
	queryTokens := tokenizer.Tokenize(query)

	if len(queryTokens) == 0 {
		return []Result{}, nil
	}

	scores := make(map[int64]float64)

	for _, term := range queryTokens {
		postings, exists := idx.Postings[term]
		if !exists {
			continue
		}

		// idf = ln(NumDocs/df): rare terms weigh more, and a term that
		// appears in every document contributes nothing.
		df := len(postings)
		idf := math.Log(float64(idx.NumDocs) / float64(df))

		for _, posting := range postings {
			tf := float64(posting.TF)
			scores[posting.DocID] += tf * idf
		}
	}

	results := make([]Result, 0, len(scores))
	for docID, score := range scores {
		results = append(results, Result{
			DocID: docID,
			Score: score,
		})
	}

	// Sort by score descending, breaking ties by higher DocID for a
	// deterministic ordering.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].DocID > results[j].DocID
	})

	if limit > 0 && limit < len(results) {
		results = results[:limit]
	}

	return results, nil
}
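
// exampleSearch is a minimal usage sketch; the sample documents and query are
// hypothetical and not part of the original package.
func exampleSearch() []Result {
	docs := []Document{
		{ID: 1, Title: "Groceries", Body: "buy milk and eggs", CreatedAt: time.Now(), DocKind: NoteDoc},
		{ID: 2, Title: "Dune", Body: "a science fiction novel about spice", CreatedAt: time.Now(), DocKind: BookDoc},
	}
	idx := BuildIndex(docs)

	// Only document 1 contains "milk", so it is the sole (top) result.
	results, err := idx.Search("milk", 5)
	if err != nil {
		return nil
	}
	return results
}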