Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

unicode: Add utf8-data module

utf8data.h contains a large auto-generated database table: the
decoding trie used by the unicode normalization functions.

Allow building it into a separate module.

Based on a patch from Shreeya Patel <shreeya.patel@collabora.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>

authored by

Christoph Hellwig and committed by
Gabriel Krisman Bertazi
2b3d0478 6ca99ce7

+126 -91
+11 -2
fs/unicode/Kconfig
··· 8 8 Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding 9 9 support. 10 10 11 + config UNICODE_UTF8_DATA 12 + tristate "UTF-8 normalization and casefolding tables" 13 + depends on UNICODE 14 + default UNICODE 15 + help 16 + This contains a large table of case foldings, which can be loaded as 17 + a separate module if you say M here. To be on the safe side stick 18 + to the default of Y. Saying N here makes no sense, if you do not want 19 + utf8 casefolding support, disable CONFIG_UNICODE instead. 20 + 11 21 config UNICODE_NORMALIZATION_SELFTEST 12 22 tristate "Test UTF-8 normalization support" 13 - depends on UNICODE 14 - default n 23 + depends on UNICODE_UTF8_DATA
+7 -6
fs/unicode/Makefile
··· 2 2 3 3 obj-$(CONFIG_UNICODE) += unicode.o 4 4 obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o 5 + obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o 5 6 6 7 unicode-y := utf8-norm.o utf8-core.o 7 8 8 - $(obj)/utf8-norm.o: $(obj)/utf8data.h 9 + $(obj)/utf8-data.o: $(obj)/utf8data.c 9 10 10 - # In the normal build, the checked-in utf8data.h is just shipped. 11 + # In the normal build, the checked-in utf8data.c is just shipped. 11 12 # 12 - # To generate utf8data.h from UCD, put *.txt files in this directory 13 + # To generate utf8data.c from UCD, put *.txt files in this directory 13 14 # and pass REGENERATE_UTF8DATA=1 from the command line. 14 15 ifdef REGENERATE_UTF8DATA 15 16 ··· 25 24 -t $(srctree)/$(src)/NormalizationTest.txt \ 26 25 -o $@ 27 26 28 - $(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE 27 + $(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE 29 28 $(call if_changed,utf8data) 30 29 31 30 else 32 31 33 - $(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE 32 + $(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE 34 33 $(call if_changed,shipped) 35 34 36 35 endif 37 36 38 - targets += utf8data.h 37 + targets += utf8data.c 39 38 hostprogs += mkutf8data
+19 -5
fs/unicode/mkutf8data.c
··· 3287 3287 open_fail(utf8_name, errno); 3288 3288 3289 3289 fprintf(file, "/* This file is generated code, do not edit. */\n"); 3290 - fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); 3291 - fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); 3292 - fprintf(file, "#endif\n"); 3293 3290 fprintf(file, "\n"); 3294 - fprintf(file, "static const unsigned int utf8vers = %#x;\n", 3295 - unicode_maxage); 3291 + fprintf(file, "#include <linux/module.h>\n"); 3292 + fprintf(file, "#include <linux/kernel.h>\n"); 3293 + fprintf(file, "#include \"utf8n.h\"\n"); 3296 3294 fprintf(file, "\n"); 3297 3295 fprintf(file, "static const unsigned int utf8agetab[] = {\n"); 3298 3296 for (i = 0; i != ages_count; i++) ··· 3337 3339 fprintf(file, "\n"); 3338 3340 } 3339 3341 fprintf(file, "};\n"); 3342 + fprintf(file, "\n"); 3343 + fprintf(file, "struct utf8data_table utf8_data_table = {\n"); 3344 + fprintf(file, "\t.utf8agetab = utf8agetab,\n"); 3345 + fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); 3346 + fprintf(file, "\n"); 3347 + fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n"); 3348 + fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n"); 3349 + fprintf(file, "\n"); 3350 + fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n"); 3351 + fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n"); 3352 + fprintf(file, "\n"); 3353 + fprintf(file, "\t.utf8data = utf8data,\n"); 3354 + fprintf(file, "};\n"); 3355 + fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); 3356 + fprintf(file, "\n"); 3357 + fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); 3340 3358 fclose(file); 3341 3359 } 3342 3360
+31 -8
fs/unicode/utf8-core.c
··· 160 160 } 161 161 EXPORT_SYMBOL(utf8_normalize); 162 162 163 + static const struct utf8data *find_table_version(const struct utf8data *table, 164 + size_t nr_entries, unsigned int version) 165 + { 166 + size_t i = nr_entries - 1; 167 + 168 + while (version < table[i].maxage) 169 + i--; 170 + if (version > table[i].maxage) 171 + return NULL; 172 + return &table[i]; 173 + } 174 + 163 175 struct unicode_map *utf8_load(unsigned int version) 164 176 { 165 177 struct unicode_map *um; 166 - 167 - if (!utf8version_is_supported(version)) 168 - return ERR_PTR(-EINVAL); 169 178 170 179 um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); 171 180 if (!um) 172 181 return ERR_PTR(-ENOMEM); 173 182 um->version = version; 174 - um->ntab[UTF8_NFDI] = utf8nfdi(version); 183 + 184 + um->tables = symbol_request(utf8_data_table); 185 + if (!um->tables) 186 + goto out_free_um; 187 + 188 + if (!utf8version_is_supported(um, version)) 189 + goto out_symbol_put; 190 + um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata, 191 + um->tables->utf8nfdidata_size, um->version); 175 192 if (!um->ntab[UTF8_NFDI]) 176 - goto out_free_um; 177 - um->ntab[UTF8_NFDICF] = utf8nfdicf(version); 193 + goto out_symbol_put; 194 + um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata, 195 + um->tables->utf8nfdicfdata_size, um->version); 178 196 if (!um->ntab[UTF8_NFDICF]) 179 - goto out_free_um; 197 + goto out_symbol_put; 180 198 return um; 181 199 200 + out_symbol_put: 201 + symbol_put(um->tables); 182 202 out_free_um: 183 203 kfree(um); 184 204 return ERR_PTR(-EINVAL); ··· 207 187 208 188 void utf8_unload(struct unicode_map *um) 209 189 { 210 - kfree(um); 190 + if (um) { 191 + symbol_put(utf8_data_table); 192 + kfree(um); 193 + } 211 194 } 212 195 EXPORT_SYMBOL(utf8_unload); 213 196
+9 -39
fs/unicode/utf8-norm.c
··· 6 6 7 7 #include "utf8n.h" 8 8 9 - struct utf8data { 10 - unsigned int maxage; 11 - unsigned int offset; 12 - }; 13 - 14 - #define __INCLUDED_FROM_UTF8NORM_C__ 15 - #include "utf8data.h" 16 - #undef __INCLUDED_FROM_UTF8NORM_C__ 17 - 18 - int utf8version_is_supported(unsigned int version) 9 + int utf8version_is_supported(const struct unicode_map *um, unsigned int version) 19 10 { 20 - int i = ARRAY_SIZE(utf8agetab) - 1; 11 + int i = um->tables->utf8agetab_size - 1; 21 12 22 - while (i >= 0 && utf8agetab[i] != 0) { 23 - if (version == utf8agetab[i]) 13 + while (i >= 0 && um->tables->utf8agetab[i] != 0) { 14 + if (version == um->tables->utf8agetab[i]) 24 15 return 1; 25 16 i--; 26 17 } ··· 152 161 * underlying datatype: unsigned char. 153 162 * 154 163 * leaf[0]: The unicode version, stored as a generation number that is 155 - * an index into utf8agetab[]. With this we can filter code 164 + * an index into ->utf8agetab[]. With this we can filter code 156 165 * points based on the unicode version in which they were 157 166 * defined. The CCC of a non-defined code point is 0. 158 167 * leaf[1]: Canonical Combining Class. During normalization, we need ··· 304 313 enum utf8_normalization n, unsigned char *hangul, const char *s, 305 314 size_t len) 306 315 { 307 - utf8trie_t *trie = utf8data + um->ntab[n]->offset; 316 + utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; 308 317 int offlen; 309 318 int offset; 310 319 int mask; ··· 395 404 leaf = utf8nlookup(um, n, hangul, s, len); 396 405 if (!leaf) 397 406 return -1; 398 - if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage) 407 + if (um->tables->utf8agetab[LEAF_GEN(leaf)] > 408 + um->ntab[n]->maxage) 399 409 ret += utf8clen(s); 400 410 else if (LEAF_CCC(leaf) == DECOMPOSE) 401 411 ret += strlen(LEAF_STR(leaf)); ··· 512 520 513 521 ccc = LEAF_CCC(leaf); 514 522 /* Characters that are too new have CCC 0. 
*/ 515 - if (utf8agetab[LEAF_GEN(leaf)] > 523 + if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > 516 524 u8c->um->ntab[u8c->n]->maxage) { 517 525 ccc = STOPPER; 518 526 } else if (ccc == DECOMPOSE) { ··· 589 597 } 590 598 } 591 599 EXPORT_SYMBOL(utf8byte); 592 - 593 - const struct utf8data *utf8nfdi(unsigned int maxage) 594 - { 595 - int i = ARRAY_SIZE(utf8nfdidata) - 1; 596 - 597 - while (maxage < utf8nfdidata[i].maxage) 598 - i--; 599 - if (maxage > utf8nfdidata[i].maxage) 600 - return NULL; 601 - return &utf8nfdidata[i]; 602 - } 603 - 604 - const struct utf8data *utf8nfdicf(unsigned int maxage) 605 - { 606 - int i = ARRAY_SIZE(utf8nfdicfdata) - 1; 607 - 608 - while (maxage < utf8nfdicfdata[i].maxage) 609 - i--; 610 - if (maxage > utf8nfdicfdata[i].maxage) 611 - return NULL; 612 - return &utf8nfdicfdata[i]; 613 - }
+8 -8
fs/unicode/utf8-selftest.c
··· 255 255 } 256 256 } 257 257 258 - static void check_supported_versions(void) 258 + static void check_supported_versions(struct unicode_map *um) 259 259 { 260 260 /* Unicode 7.0.0 should be supported. */ 261 - test(utf8version_is_supported(UNICODE_AGE(7, 0, 0))); 261 + test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); 262 262 263 263 /* Unicode 9.0.0 should be supported. */ 264 - test(utf8version_is_supported(UNICODE_AGE(9, 0, 0))); 264 + test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); 265 265 266 266 /* Unicode 1x.0.0 (the latest version) should be supported. */ 267 - test(utf8version_is_supported(UTF8_LATEST)); 267 + test(utf8version_is_supported(um, UTF8_LATEST)); 268 268 269 269 /* Next versions don't exist. */ 270 - test(!utf8version_is_supported(UNICODE_AGE(13, 0, 0))); 271 - test(!utf8version_is_supported(UNICODE_AGE(0, 0, 0))); 272 - test(!utf8version_is_supported(UNICODE_AGE(-1, -1, -1))); 270 + test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); 271 + test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); 272 + test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); 273 273 } 274 274 275 275 static int __init init_test_ucd(void) ··· 285 285 return PTR_ERR(um); 286 286 } 287 287 288 - check_supported_versions(); 288 + check_supported_versions(um); 289 289 check_utf8_nfdi(um); 290 290 check_utf8_nfdicf(um); 291 291 check_utf8_comparisons(um);
+18 -4
fs/unicode/utf8data.h_shipped fs/unicode/utf8data.c_shipped
··· 1 1 /* This file is generated code, do not edit. */ 2 - #ifndef __INCLUDED_FROM_UTF8NORM_C__ 3 - #error Only nls_utf8-norm.c should include this file. 4 - #endif 5 2 6 - static const unsigned int utf8vers = 0xc0100; 3 + #include <linux/module.h> 4 + #include <linux/kernel.h> 5 + #include "utf8n.h" 7 6 8 7 static const unsigned int utf8agetab[] = { 9 8 0, ··· 4106 4107 0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00, 4107 4108 0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00 4108 4109 }; 4110 + 4111 + struct utf8data_table utf8_data_table = { 4112 + .utf8agetab = utf8agetab, 4113 + .utf8agetab_size = ARRAY_SIZE(utf8agetab), 4114 + 4115 + .utf8nfdicfdata = utf8nfdicfdata, 4116 + .utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata), 4117 + 4118 + .utf8nfdidata = utf8nfdidata, 4119 + .utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata), 4120 + 4121 + .utf8data = utf8data, 4122 + }; 4123 + EXPORT_SYMBOL_GPL(utf8_data_table); 4124 + MODULE_LICENSE("GPL v2");
+21 -19
fs/unicode/utf8n.h
··· 13 13 #include <linux/module.h> 14 14 #include <linux/unicode.h> 15 15 16 - int utf8version_is_supported(unsigned int version); 17 - 18 - /* 19 - * Look for the correct const struct utf8data for a unicode version. 20 - * Returns NULL if the version requested is too new. 21 - * 22 - * Two normalization forms are supported: nfdi and nfdicf. 23 - * 24 - * nfdi: 25 - * - Apply unicode normalization form NFD. 26 - * - Remove any Default_Ignorable_Code_Point. 27 - * 28 - * nfdicf: 29 - * - Apply unicode normalization form NFD. 30 - * - Remove any Default_Ignorable_Code_Point. 31 - * - Apply a full casefold (C + F). 32 - */ 33 - extern const struct utf8data *utf8nfdi(unsigned int maxage); 34 - extern const struct utf8data *utf8nfdicf(unsigned int maxage); 16 + int utf8version_is_supported(const struct unicode_map *um, unsigned int version); 35 17 36 18 /* 37 19 * Determine the length of the normalized from of the string, ··· 59 77 * Returns -1 if the string being normalized is not valid UTF-8. 60 78 */ 61 79 extern int utf8byte(struct utf8cursor *u8c); 80 + 81 + struct utf8data { 82 + unsigned int maxage; 83 + unsigned int offset; 84 + }; 85 + 86 + struct utf8data_table { 87 + const unsigned int *utf8agetab; 88 + int utf8agetab_size; 89 + 90 + const struct utf8data *utf8nfdicfdata; 91 + int utf8nfdicfdata_size; 92 + 93 + const struct utf8data *utf8nfdidata; 94 + int utf8nfdidata_size; 95 + 96 + const unsigned char *utf8data; 97 + }; 98 + 99 + extern struct utf8data_table utf8_data_table; 62 100 63 101 #endif /* UTF8NORM_H */
+2
include/linux/unicode.h
··· 6 6 #include <linux/dcache.h> 7 7 8 8 struct utf8data; 9 + struct utf8data_table; 9 10 10 11 #define UNICODE_MAJ_SHIFT 16 11 12 #define UNICODE_MIN_SHIFT 8 ··· 50 49 struct unicode_map { 51 50 unsigned int version; 52 51 const struct utf8data *ntab[UTF8_NMAX]; 52 + const struct utf8data_table *tables; 53 53 }; 54 54 55 55 int utf8_validate(const struct unicode_map *um, const struct qstr *str);