Porting all GitHub Actions workflows from bluesky-social/indigo to Tangled CI.
at main 2.1 kB view raw
1package keyword 2 3import ( 4 "log/slog" 5 "regexp" 6 "strings" 7 "unicode" 8 9 "golang.org/x/text/runes" 10 "golang.org/x/text/transform" 11 "golang.org/x/text/unicode/norm" 12) 13 14var ( 15 puncChars = regexp.MustCompile(`[[:punct:]]+`) 16 nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`) 17 nonTokenCharsSkipCensorChars = regexp.MustCompile(`[^\pL\pN\s#*_-]`) 18) 19 20// Splits free-form text in to tokens, including lower-case, unicode normalization, and some unicode folding. 21// 22// The intent is for this to work similarly to an NLP tokenizer, as might be used in a fulltext search engine, and enable fast matching to a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc. 23func TokenizeTextWithRegex(text string, nonTokenCharsRegex *regexp.Regexp) []string { 24 // this function needs to be re-defined in every function call to prevent a race condition 25 normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) 26 split := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(text, " ")) 27 bare := strings.ToLower(nonTokenCharsRegex.ReplaceAllString(split, "")) 28 norm, _, err := transform.String(normFunc, bare) 29 if err != nil { 30 slog.Warn("unicode normalization error", "err", err) 31 norm = bare 32 } 33 return strings.Fields(norm) 34} 35 36func TokenizeText(text string) []string { 37 return TokenizeTextWithRegex(text, nonTokenChars) 38} 39 40func TokenizeTextSkippingCensorChars(text string) []string { 41 return TokenizeTextWithRegex(text, nonTokenCharsSkipCensorChars) 42} 43 44func splitIdentRune(c rune) bool { 45 return !unicode.IsLetter(c) && !unicode.IsNumber(c) 46} 47 48// Splits an identifier in to tokens. Removes any single-character tokens. 
49// 50// For example, the-handle.bsky.social would be split in to ["the", "handle", "bsky", "social"] 51func TokenizeIdentifier(orig string) []string { 52 fields := strings.FieldsFunc(orig, splitIdentRune) 53 out := make([]string, 0, len(fields)) 54 for _, v := range fields { 55 tok := Slugify(v) 56 if len(tok) > 1 { 57 out = append(out, tok) 58 } 59 } 60 return out 61}