package keyword

import (
	"log/slog"
	"regexp"
	"strings"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

var (
	// runs of ASCII punctuation; these get stripped from within tokens
	puncChars = regexp.MustCompile(`[[:punct:]]+`)
	// runs of anything which is not a letter, number, or whitespace; these separate tokens
	nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`)
	// like nonTokenChars, but preserves a few characters commonly used to "self-censor" words
	nonTokenCharsSkipCensorChars = regexp.MustCompile(`[^\pL\pN\s#*_-]+`)
)

// TokenizeTextWithRegex splits free-form text into tokens: it lower-cases,
// applies unicode normalization (NFD, stripping combining marks, then NFC)
// and some unicode folding, and treats runs matching nonTokenCharsRegex as
// token separators.
//
// The intent is for this to work similarly to an NLP tokenizer, as might be
// used in a fulltext search engine, and to enable fast matching against a
// list of known tokens. It might eventually even do stemming, removing
// pluralization (trailing "s" for English), etc.
func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string {
	// the transform chain is stateful and not safe for concurrent use, so it
	// must be constructed fresh on every call, not shared as a package-level
	// variable, to prevent a data race
	normalizer := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

	// replace token separators with spaces, then strip any remaining
	// punctuation (eg, preserved "censor" characters) from within tokens
	split := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(text, " "))
	bare := puncChars.ReplaceAllString(split, "")
	normalized, _, err := transform.String(normalizer, bare)
	if err != nil {
		slog.Warn("unicode normalization error", "err", err)
		normalized = bare
	}
	return strings.Fields(normalized)
}
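
// TokenizeText splits free-form text into tokens using the default set of
// token-separator characters.
//
// For example:
//
//	TokenizeText("Hello, World!") // ["hello", "world"]
//	TokenizeText("Café")          // ["cafe"]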
func TokenizeText(text string) []string {
	return TokenizeTextWithRegex(text, nonTokenChars)
}
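
// TokenizeTextSkippingCensorChars splits free-form text into tokens like
// TokenizeText, but the splitting pass skips over a few characters
// (#, *, _, -) which are often used to "self-censor" words, so they don't
// separate tokens; the punctuation pass then strips them from within tokens.
//
// For example:
//
//	TokenizeTextSkippingCensorChars("f*ck") // ["fck"], not ["f", "ck"]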
func TokenizeTextSkippingCensorChars(text string) []string {
	return TokenizeTextWithRegex(text, nonTokenCharsSkipCensorChars)
}
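
// splitIdentRune reports whether c should split an identifier into fields:
// true for any rune which is neither a letter nor a number.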
func splitIdentRune(c rune) bool {
	return !unicode.IsLetter(c) && !unicode.IsNumber(c)
}

// TokenizeIdentifier splits an identifier into tokens, dropping any
// single-character tokens.
//
// For example, the-handle.bsky.social would be split into ["the", "handle",
// "bsky", "social"]
func TokenizeIdentifier(orig string) []string {
	fields := strings.FieldsFunc(orig, splitIdentRune)
	out := make([]string, 0, len(fields))
	for _, v := range fields {
		// normalize each field; Slugify is defined elsewhere in this package
		tok := Slugify(v)
		if len(tok) > 1 {
			out = append(out, tok)
		}
	}
	return out
}
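
// Usage sketch (hypothetical caller, not part of this package): matching
// tokens against a set of known keywords, per the intent described above.
// The keyword set and post.Text field are illustrative assumptions.
//
//	keywords := map[string]bool{"fck": true}
//	for _, tok := range TokenizeTextSkippingCensorChars(post.Text) {
//		if keywords[tok] {
//			// flag the post for review
//		}
//	}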