fork of indigo with slightly nicer lexgen

syntax: basic language code (BCP-47) validation

+41
atproto/syntax/language.go
// This file belongs to package syntax.

import (
	"fmt"
	"regexp"
)

// langRegex is the naive BCP-47 shape check used by ParseLanguage: a primary
// subtag that is either "i" or 2-3 lowercase ASCII letters, followed by zero
// or more hyphen-separated subtags of ASCII letters and digits.
//
// It is compiled once at package initialization rather than on every call.
var langRegex = regexp.MustCompile(`^(i|[a-z]{2,3})(-[a-zA-Z0-9]+)*$`)

// Language represents a language specifier in string format, as would pass
// Lexicon syntax validation.
//
// Always use [ParseLanguage] instead of wrapping strings directly, especially
// when working with network input.
//
// The syntax is BCP-47. This is a partial/naive parsing implementation,
// designed for fast validation and exact-string passthrough with no
// normalization. For actually working with BCP-47 language specifiers in
// atproto code bases, we recommend the golang.org/x/text/language package.
type Language string

// ParseLanguage validates raw and returns it unchanged as a Language.
//
// It returns an error when raw is longer than 128 bytes or does not match the
// naive BCP-47 pattern. The empty string does not match the pattern and is
// therefore rejected. On error the returned Language is the empty string.
func ParseLanguage(raw string) (Language, error) {
	if len(raw) > 128 {
		return "", fmt.Errorf("Language is too long (128 chars max)")
	}
	if !langRegex.MatchString(raw) {
		return "", fmt.Errorf("Language syntax didn't validate via regex")
	}
	return Language(raw), nil
}

// String returns the language specifier as a plain string.
func (l Language) String() string {
	return string(l)
}

// MarshalText returns the specifier's bytes; it never returns an error.
func (l Language) MarshalText() ([]byte, error) {
	return []byte(l.String()), nil
}

// UnmarshalText parses text with ParseLanguage and stores the result in l.
// If parsing fails, the error is returned and l is left unmodified.
func (l *Language) UnmarshalText(text []byte) error {
	lang, err := ParseLanguage(string(text))
	if err != nil {
		return err
	}
	*l = lang
	return nil
}
+50
atproto/syntax/language_test.go
··· 1 + package syntax 2 + 3 + import ( 4 + "bufio" 5 + "fmt" 6 + "os" 7 + "testing" 8 + 9 + "github.com/stretchr/testify/assert" 10 + ) 11 + 12 + func TestInteropLanguagesValid(t *testing.T) { 13 + assert := assert.New(t) 14 + file, err := os.Open("testdata/language_syntax_valid.txt") 15 + assert.NoError(err) 16 + defer file.Close() 17 + scanner := bufio.NewScanner(file) 18 + for scanner.Scan() { 19 + line := scanner.Text() 20 + if len(line) == 0 || line[0] == '#' { 21 + continue 22 + } 23 + _, err := ParseLanguage(line) 24 + if err != nil { 25 + fmt.Println("GOOD: " + line) 26 + } 27 + assert.NoError(err) 28 + } 29 + assert.NoError(scanner.Err()) 30 + } 31 + 32 + func TestInteropLanguagesInvalid(t *testing.T) { 33 + assert := assert.New(t) 34 + file, err := os.Open("testdata/language_syntax_invalid.txt") 35 + assert.NoError(err) 36 + defer file.Close() 37 + scanner := bufio.NewScanner(file) 38 + for scanner.Scan() { 39 + line := scanner.Text() 40 + if len(line) == 0 || line[0] == '#' { 41 + continue 42 + } 43 + _, err := ParseLanguage(line) 44 + if err == nil { 45 + fmt.Println("BAD: " + line) 46 + } 47 + assert.Error(err) 48 + } 49 + assert.NoError(scanner.Err()) 50 + }
+10
atproto/syntax/testdata/language_syntax_invalid.txt
··· 1 + jaja 2 + . 3 + 123 4 + JA 5 + j 6 + ja- 7 + a-DE 8 + 9 + # technically not valid, but allowing in naive parser 10 + #de-419-DE
+18
atproto/syntax/testdata/language_syntax_valid.txt
··· 1 + ja 2 + ban 3 + pt-BR 4 + hy-Latn-IT-arevela 5 + en-GB 6 + zh-Hant 7 + sgn-BE-NL 8 + es-419 9 + en-GB-boont-r-extended-sequence-x-private 10 + 11 + # grandfathered 12 + zh-hakka 13 + i-default 14 + i-navajo 15 + 16 + # https://github.com/sebinsua/ietf-language-tag-regex/blob/master/test.js 17 + de-CH-1901 18 + qaa-Qaaa-QM-x-southern