automod/keyword/tokenize.go (+3 -7)
@@ -20,7 +20,7 @@
 // Splits free-form text in to tokens, including lower-case, unicode normalization, and some unicode folding.
 //
 // The intent is for this to work similarly to an NLP tokenizer, as might be used in a fulltext search engine, and enable fast matching to a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc.
-func tokenizeText(text string, nonTokenCharsRegex *regexp.Regexp) []string {
+func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string {
 	// this function needs to be re-defined in every function call to prevent a race condition
 	normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
 	split := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(text, " "))
@@ -34,15 +34,11 @@
 }
 
 func TokenizeText(text string) []string {
-	return tokenizeText(text, nonTokenChars)
+	return TokenizeTextWithRegex(text, nonTokenChars)
 }
 
 func TokenizeTextSkippingCensorChars(text string) []string {
-	return tokenizeText(text, nonTokenCharsSkipCensorChars)
-}
-
-func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string {
-	return tokenizeText(text, nonTokenCharsRegex)
+	return TokenizeTextWithRegex(text, nonTokenCharsSkipCensorChars)
 }
 
 func splitIdentRune(c rune) bool {
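
For reference, a minimal self-contained sketch of the normalization chain the tokenizer builds on each call (the same golang.org/x/text chain shown in the hunk above): it decomposes characters (NFD), strips combining marks (unicode.Mn), and recomposes (NFC), which folds accented characters to their base form.

```go
package main

import (
	"fmt"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

func main() {
	// Decompose (NFD), drop combining marks (unicode.Mn), recompose (NFC).
	// This mirrors the chain the tokenizer re-creates per call.
	normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

	out, _, err := transform.String(normFunc, "café naïve")
	if err != nil {
		fmt.Println("transform error:", err)
		return
	}
	fmt.Println(out) // cafe naive
}
```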
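
And a usage sketch of the API after the rename. The import path is assumed to be github.com/bluesky-social/indigo/automod/keyword (inferred from the file path), and the digit-splitting regex is a hypothetical example, not something defined in this package:

```go
package main

import (
	"fmt"
	"regexp"

	// assumed import path, inferred from automod/keyword/tokenize.go
	"github.com/bluesky-social/indigo/automod/keyword"
)

func main() {
	// Existing helpers are unchanged for callers: lower-case, strip
	// combining marks, split on non-token characters.
	fmt.Println(keyword.TokenizeText("Héllo, Wörld!")) // likely: [hello world]

	// With the rename, the regex-parameterized variant is the exported
	// entry point, so callers can supply their own "non-token characters"
	// pattern. This one (also treating digits as separators) is illustrative.
	nonToken := regexp.MustCompile(`[^a-z]+`)
	fmt.Println(keyword.TokenizeTextWithRegex("abc123def", nonToken)) // [abc def]
}
```

Exporting the regex-parameterized function directly removes the one-line unexported wrapper, so external callers and the package's own helpers now share a single entry point.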