// Package documents implements a Term Frequency-Inverse Document Frequency
// (TF-IDF) search model for notes.
package documents

import (
	"math"
	"regexp"
	"sort"
	"strings"
	"time"
)

// DocKind identifies the kind of content a Document holds.
type DocKind int64

const (
	NoteDoc DocKind = iota
	ArticleDoc
	MovieDoc
	BookDoc
	TVDoc
)

// Document is a single indexable item.
type Document struct {
	ID        int64
	Title     string
	Body      string
	CreatedAt time.Time
	Kind      DocKind
}

// Posting records how often a term occurs in one document.
type Posting struct {
	DocID int64
	TF    int
}

// Index is an inverted index mapping terms to their postings.
type Index struct {
	Postings   map[string][]Posting
	DocLengths map[int64]int // token count per document (not used in scoring here)
	NumDocs    int
}

// Result is a single ranked hit.
type Result struct {
	DocID int64
	Score float64
}

// Searchable is implemented by anything that can answer ranked queries.
type Searchable interface {
	Search(query string, limit int) ([]Result, error)
}

// Tokenizer handles text tokenization and normalization.
type Tokenizer struct {
	pattern *regexp.Regexp
}

// NewTokenizer creates a new tokenizer with Unicode-aware word/number matching.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{
		pattern: regexp.MustCompile(`\p{L}+\p{M}*|\p{N}+`),
	}
}

// Tokenize splits text into normalized tokens (lowercase words and numbers).
func (t *Tokenizer) Tokenize(text string) []string {
	lowered := strings.ToLower(text)
	return t.pattern.FindAllString(lowered, -1)
}

// TokenFrequency computes the term frequency map for a slice of tokens.
func TokenFrequency(tokens []string) map[string]int {
	freq := make(map[string]int)
	for _, token := range tokens {
		freq[token]++
	}
	return freq
}

// BuildIndex constructs a TF-IDF index from a collection of documents.
func BuildIndex(docs []Document) *Index {
	idx := &Index{
		Postings:   make(map[string][]Posting),
		DocLengths: make(map[int64]int),
		NumDocs:    0,
	}
	tokenizer := NewTokenizer()
	for _, doc := range docs {
		text := doc.Title + " " + doc.Body
		tokens := tokenizer.Tokenize(text)
		idx.NumDocs++
		idx.DocLengths[doc.ID] = len(tokens)
		freq := TokenFrequency(tokens)
		for term, tf := range freq {
			idx.Postings[term] = append(idx.Postings[term], Posting{
				DocID: doc.ID,
				TF:    tf,
			})
		}
	}
	return idx
}

// Search performs TF-IDF ranked search on the index. Results are ordered by
// descending score, with ties broken by DocID for deterministic output.
func (idx *Index) Search(query string, limit int) ([]Result, error) {
	tokenizer := NewTokenizer()
	queryTokens := tokenizer.Tokenize(query)
	if len(queryTokens) == 0 {
		return []Result{}, nil
	}

	scores := make(map[int64]float64)
	for _, term := range queryTokens {
		postings, exists := idx.Postings[term]
		if !exists {
			continue
		}
		// idf = log(N / df): rarer terms contribute more to the score.
		df := len(postings)
		idf := math.Log(float64(idx.NumDocs) / float64(df))
		for _, posting := range postings {
			tf := float64(posting.TF)
			scores[posting.DocID] += tf * idf
		}
	}

	results := make([]Result, 0, len(scores))
	for docID, score := range scores {
		results = append(results, Result{
			DocID: docID,
			Score: score,
		})
	}
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].DocID > results[j].DocID
	})
	if limit > 0 && limit < len(results) {
		results = results[:limit]
	}
	return results, nil
}
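
// exampleSearchUsage is an illustrative sketch (not part of the original file):
// the function name, document IDs, titles, bodies, and query below are all
// hypothetical. It shows the intended flow: build an index with BuildIndex,
// then run a ranked query with Search. Note that a term appearing in every
// document gets idf = log(N/N) = 0 and contributes nothing to the score.
func exampleSearchUsage() {
	docs := []Document{
		{ID: 1, Title: "Green tea", Body: "brewing temperature and steep time"},
		{ID: 2, Title: "Espresso", Body: "grind size and extraction time"},
	}
	idx := BuildIndex(docs)

	// "tea" and "temperature" occur only in document 1, so it is the sole hit
	// and ranks first; document 2 matches neither query term.
	results, err := idx.Search("tea temperature", 10)
	if err != nil || len(results) == 0 {
		return
	}
	_ = results[0].DocID // expected: 1
}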