package keyword

import (
	"log/slog"
	"regexp"
	"strings"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

var (
	// runs of ASCII punctuation; these get stripped from within tokens
	puncChars = regexp.MustCompile(`[[:punct:]]+`)
	// runs of anything which is not a letter, number, or whitespace; these separate tokens
	nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`)
	// like nonTokenChars, but preserves a few characters commonly used to "self-censor" words
	nonTokenCharsSkipCensorChars = regexp.MustCompile(`[^\pL\pN\s#*_-]+`)
)

// TokenizeTextWithRegex splits free-form text into tokens: it lower-cases,
// applies unicode normalization (NFD, stripping combining marks, then NFC)
// and some unicode folding, and treats runs matching nonTokenCharsRegex as
// token separators.
//
// The intent is for this to work similarly to an NLP tokenizer, as might be
// used in a fulltext search engine, and to enable fast matching against a
// list of known tokens. It might eventually even do stemming, removing
// pluralization (trailing "s" for English), etc.
func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string {
	// the transform chain is stateful and not safe for concurrent use, so it
	// must be constructed fresh on every call, not shared as a package-level
	// variable, to prevent a data race
	normalizer := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

	// replace token separators with spaces, then strip any remaining
	// punctuation (eg, preserved "censor" characters) from within tokens
	split := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(text, " "))
	bare := puncChars.ReplaceAllString(split, "")
	normalized, _, err := transform.String(normalizer, bare)
	if err != nil {
		slog.Warn("unicode normalization error", "err", err)
		normalized = bare
	}
	return strings.Fields(normalized)
}
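
// TokenizeText splits free-form text into tokens using the default set of
// token-separator characters.
//
// For example:
//
//	TokenizeText("Hello, World!") // ["hello", "world"]
//	TokenizeText("Café")          // ["cafe"]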
func TokenizeText(text string) []string {
	return TokenizeTextWithRegex(text, nonTokenChars)
}
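
// TokenizeTextSkippingCensorChars splits free-form text into tokens like
// TokenizeText, but the splitting pass skips over a few characters
// (#, *, _, -) which are often used to "self-censor" words, so they don't
// separate tokens; the punctuation pass then strips them from within tokens.
//
// For example:
//
//	TokenizeTextSkippingCensorChars("f*ck") // ["fck"], not ["f", "ck"]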
func TokenizeTextSkippingCensorChars(text string) []string {
	return TokenizeTextWithRegex(text, nonTokenCharsSkipCensorChars)
}
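
// splitIdentRune reports whether c should split an identifier into fields:
// true for any rune which is neither a letter nor a number.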
func splitIdentRune(c rune) bool {
	return !unicode.IsLetter(c) && !unicode.IsNumber(c)
}

// TokenizeIdentifier splits an identifier into tokens, dropping any
// single-character tokens.
//
// For example, the-handle.bsky.social would be split into ["the", "handle",
// "bsky", "social"]
func TokenizeIdentifier(orig string) []string {
	fields := strings.FieldsFunc(orig, splitIdentRune)
	out := make([]string, 0, len(fields))
	for _, v := range fields {
		// normalize each field; Slugify is defined elsewhere in this package
		tok := Slugify(v)
		if len(tok) > 1 {
			out = append(out, tok)
		}
	}
	return out
}
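
// Usage sketch (hypothetical caller, not part of this package): matching
// tokens against a set of known keywords, per the intent described above.
// The keyword set and post.Text field are illustrative assumptions.
//
//	keywords := map[string]bool{"fck": true}
//	for _, tok := range TokenizeTextSkippingCensorChars(post.Text) {
//		if keywords[tok] {
//			// flag the post for review
//		}
//	}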