automod/keyword/tokenize.go (+3 -7)
@@ -20,7 +20,7 @@
 // Splits free-form text in to tokens, including lower-case, unicode normalization, and some unicode folding.
 //
 // The intent is for this to work similarly to an NLP tokenizer, as might be used in a fulltext search engine, and enable fast matching to a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc.
-func tokenizeText(text string, nonTokenCharsRegex *regexp.Regexp) []string {
+func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string {
 	// this function needs to be re-defined in every function call to prevent a race condition
 	normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
 	split := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(text, " "))
@@ -34,15 +34,11 @@
 }
 
 func TokenizeText(text string) []string {
-	return tokenizeText(text, nonTokenChars)
+	return TokenizeTextWithRegex(text, nonTokenChars)
 }
 
 func TokenizeTextSkippingCensorChars(text string) []string {
-	return tokenizeText(text, nonTokenCharsSkipCensorChars)
-}
-
-func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string {
-	return tokenizeText(text, nonTokenCharsRegex)
+	return TokenizeTextWithRegex(text, nonTokenCharsSkipCensorChars)
 }
 
 func splitIdentRune(c rune) bool {
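
For reference, a minimal self-contained sketch of the normalization chain the tokenizer builds on each call (the same golang.org/x/text chain shown in the hunk above): it decomposes characters (NFD), strips combining marks (unicode.Mn), and recomposes (NFC), which folds accented characters to their base form.

```go
package main

import (
	"fmt"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

func main() {
	// Decompose (NFD), drop combining marks (unicode.Mn), recompose (NFC).
	// This mirrors the chain the tokenizer re-creates per call.
	normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

	out, _, err := transform.String(normFunc, "café naïve")
	if err != nil {
		fmt.Println("transform error:", err)
		return
	}
	fmt.Println(out) // cafe naive
}
```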
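
And a usage sketch of the API after the rename. The import path is assumed to be github.com/bluesky-social/indigo/automod/keyword (inferred from the file path), and the digit-splitting regex is a hypothetical example, not something defined in this package:

```go
package main

import (
	"fmt"
	"regexp"

	// assumed import path, inferred from automod/keyword/tokenize.go
	"github.com/bluesky-social/indigo/automod/keyword"
)

func main() {
	// Existing helpers are unchanged for callers: lower-case, strip
	// combining marks, split on non-token characters.
	fmt.Println(keyword.TokenizeText("Héllo, Wörld!")) // likely: [hello world]

	// With the rename, the regex-parameterized variant is the exported
	// entry point, so callers can supply their own "non-token characters"
	// pattern. This one (also treating digits as separators) is illustrative.
	nonToken := regexp.MustCompile(`[^a-z]+`)
	fmt.Println(keyword.TokenizeTextWithRegex("abc123def", nonToken)) // [abc def]
}
```

Exporting the regex-parameterized function directly removes the one-line unexported wrapper, so external callers and the package's own helpers now share a single entry point.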