[LIB]: Naive finite state machine based textsearch

A finite state machine consists of n states (struct ts_fsm_token)
representing the pattern as a finite automaton. The data is read
sequentially on an octet basis. Every state token specifies the number
of recurrences and the type of value accepted which can be either a
specific character or ctype based set of characters. The available
type of recurrences include 1, (0|1), [0 n], and [1 n].

The algorithm differs between strict/non-strict mode specifying
whether the pattern has to start at the first octet. Strict mode
is enabled by default and can be disabled by inserting
TS_FSM_HEAD_IGNORE as the first token in the chain.

The runtime performance of the algorithm should be around O(n),
however while in strict mode the average runtime can be better.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by Thomas Graf and committed by David S. Miller 6408f79c df3fb93a

+398
+48
include/linux/textsearch_fsm.h
#ifndef __LINUX_TEXTSEARCH_FSM_H
#define __LINUX_TEXTSEARCH_FSM_H

#include <linux/types.h>

/*
 * Token types: the class of octet a token accepts. The numeric values
 * are spelled out because they are what callers store in
 * ts_fsm_token.type.
 */
enum {
	TS_FSM_SPECIFIC	= 0,	/* specific character */
	TS_FSM_WILDCARD	= 1,	/* any character */
	TS_FSM_DIGIT	= 2,	/* isdigit() */
	TS_FSM_XDIGIT	= 3,	/* isxdigit() */
	TS_FSM_PRINT	= 4,	/* isprint() */
	TS_FSM_ALPHA	= 5,	/* isalpha() */
	TS_FSM_ALNUM	= 6,	/* isalnum() */
	TS_FSM_ASCII	= 7,	/* isascii() */
	TS_FSM_CNTRL	= 8,	/* iscntrl() */
	TS_FSM_GRAPH	= 9,	/* isgraph() */
	TS_FSM_LOWER	= 10,	/* islower() */
	TS_FSM_UPPER	= 11,	/* isupper() */
	TS_FSM_PUNCT	= 12,	/* ispunct() */
	TS_FSM_SPACE	= 13,	/* isspace() */
	__TS_FSM_TYPE_MAX,	/* sentinel, one past the last valid type */
};
#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1)

/*
 * Recurrence kinds: how many times a token may match in a row.
 */
enum {
	TS_FSM_SINGLE		= 0,	/* 1 occurrence */
	TS_FSM_PERHAPS		= 1,	/* 1 or 0 occurrence */
	TS_FSM_ANY		= 2,	/* 0..n occurrences */
	TS_FSM_MULTI		= 3,	/* 1..n occurrences */
	TS_FSM_HEAD_IGNORE	= 4,	/* 0..n ignored occurrences at head */
	__TS_FSM_RECUR_MAX,		/* sentinel, one past the last valid kind */
};
#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1)

/**
 * struct ts_fsm_token - state machine token (state)
 * @type: type of token, one of the TS_FSM_SPECIFIC..TS_FSM_SPACE values
 * @recur: number of recurrences, one of TS_FSM_SINGLE..TS_FSM_HEAD_IGNORE
 * @value: character value for TS_FSM_SPECIFIC
 */
struct ts_fsm_token {
	__u16	type;
	__u8	recur;
	__u8	value;
};

#endif
+11
lib/Kconfig
··· 80 80 To compile this code as a module, choose M here: the 81 81 module will be called ts_kmp. 82 82 83 + config TEXTSEARCH_FSM 84 + depends on TEXTSEARCH 85 + tristate "Finite state machine" 86 + help 87 + Say Y here if you want to be able to search text using a 88 + naive finite state machine approach implementing a subset 89 + of regular expressions. 90 + 91 + To compile this code as a module, choose M here: the 92 + module will be called ts_fsm. 93 + 83 94 endmenu
+1
lib/Makefile
··· 38 38 39 39 lib-$(CONFIG_TEXTSEARCH) += textsearch.o 40 40 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o 41 + obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o 41 42 42 43 hostprogs-y := gen_crc32table 43 44 clean-files := crc32table.h
+338
lib/ts_fsm.c
··· 1 + /* 2 + * lib/ts_fsm.c A naive finite state machine text search approach 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + * 9 + * Authors: Thomas Graf <tgraf@suug.ch> 10 + * 11 + * ========================================================================== 12 + * 13 + * A finite state machine consists of n states (struct ts_fsm_token) 14 + * representing the pattern as a finite automation. The data is read 15 + * sequentially on a octet basis. Every state token specifies the number 16 + * of recurrences and the type of value accepted which can be either a 17 + * specific character or ctype based set of characters. The available 18 + * type of recurrences include 1, (0|1), [0 n], and [1 n]. 19 + * 20 + * The algorithm differs between strict/non-strict mode specyfing 21 + * whether the pattern has to start at the first octect. Strict mode 22 + * is enabled by default and can be disabled by inserting 23 + * TS_FSM_HEAD_IGNORE as the first token in the chain. 24 + * 25 + * The runtime performance of the algorithm should be around O(n), 26 + * however while in strict mode the average runtime can be better. 
27 + */ 28 + 29 + #include <linux/config.h> 30 + #include <linux/module.h> 31 + #include <linux/types.h> 32 + #include <linux/string.h> 33 + #include <linux/ctype.h> 34 + #include <linux/textsearch.h> 35 + #include <linux/textsearch_fsm.h> 36 + 37 + struct ts_fsm 38 + { 39 + unsigned int ntokens; 40 + struct ts_fsm_token tokens[0]; 41 + }; 42 + 43 + /* other values derived from ctype.h */ 44 + #define _A 0x100 /* ascii */ 45 + #define _W 0x200 /* wildcard */ 46 + 47 + /* Map to _ctype flags and some magic numbers */ 48 + static u16 token_map[TS_FSM_TYPE_MAX+1] = { 49 + [TS_FSM_SPECIFIC] = 0, 50 + [TS_FSM_WILDCARD] = _W, 51 + [TS_FSM_CNTRL] = _C, 52 + [TS_FSM_LOWER] = _L, 53 + [TS_FSM_UPPER] = _U, 54 + [TS_FSM_PUNCT] = _P, 55 + [TS_FSM_SPACE] = _S, 56 + [TS_FSM_DIGIT] = _D, 57 + [TS_FSM_XDIGIT] = _D | _X, 58 + [TS_FSM_ALPHA] = _U | _L, 59 + [TS_FSM_ALNUM] = _U | _L | _D, 60 + [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, 61 + [TS_FSM_GRAPH] = _P | _U | _L | _D, 62 + [TS_FSM_ASCII] = _A, 63 + }; 64 + 65 + static u16 token_lookup_tbl[256] = { 66 + _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ 67 + _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ 68 + _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ 69 + _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ 70 + _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ 71 + _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ 72 + _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ 73 + _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ 74 + _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ 75 + _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ 76 + _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ 77 + _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ 78 + _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ 79 + _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ 80 + _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ 81 + _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ 82 + 
_W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ 83 + _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ 84 + _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ 85 + _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ 86 + _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ 87 + _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ 88 + _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ 89 + _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ 90 + _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ 91 + _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ 92 + _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ 93 + _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ 94 + _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ 95 + _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ 96 + _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ 97 + _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ 98 + _W, _W, _W, _W, /* 128-131 */ 99 + _W, _W, _W, _W, /* 132-135 */ 100 + _W, _W, _W, _W, /* 136-139 */ 101 + _W, _W, _W, _W, /* 140-143 */ 102 + _W, _W, _W, _W, /* 144-147 */ 103 + _W, _W, _W, _W, /* 148-151 */ 104 + _W, _W, _W, _W, /* 152-155 */ 105 + _W, _W, _W, _W, /* 156-159 */ 106 + _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ 107 + _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ 108 + _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ 109 + _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ 110 + _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ 111 + _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ 112 + _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ 113 + _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ 114 + _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ 115 + _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ 116 + _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ 117 + _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ 118 + _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ 119 + _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ 120 + _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ 121 + _W|_U, _W|_U, 
_W|_U, _W|_L, /* 220-223 */ 122 + _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ 123 + _W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ 124 + _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ 125 + _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ 126 + _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ 127 + _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ 128 + _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ 129 + _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ 130 + 131 + static inline int match_token(struct ts_fsm_token *t, u8 d) 132 + { 133 + if (t->type) 134 + return (token_lookup_tbl[d] & t->type) != 0; 135 + else 136 + return t->value == d; 137 + } 138 + 139 + static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) 140 + { 141 + struct ts_fsm *fsm = ts_config_priv(conf); 142 + struct ts_fsm_token *cur = NULL, *next; 143 + unsigned int match_start, block_idx = 0, tok_idx; 144 + unsigned block_len = 0, strict, consumed = state->offset; 145 + const u8 *data; 146 + 147 + #define GET_NEXT_BLOCK() \ 148 + ({ consumed += block_idx; \ 149 + block_idx = 0; \ 150 + block_len = conf->get_next_block(consumed, &data, conf, state); }) 151 + 152 + #define TOKEN_MISMATCH() \ 153 + do { \ 154 + if (strict) \ 155 + goto no_match; \ 156 + block_idx++; \ 157 + goto startover; \ 158 + } while(0) 159 + 160 + #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) 161 + 162 + if (end_of_data()) 163 + goto no_match; 164 + 165 + strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; 166 + 167 + startover: 168 + match_start = consumed + block_idx; 169 + 170 + for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { 171 + cur = &fsm->tokens[tok_idx]; 172 + 173 + if (likely(tok_idx < (fsm->ntokens - 1))) 174 + next = &fsm->tokens[tok_idx + 1]; 175 + else 176 + next = NULL; 177 + 178 + switch (cur->recur) { 179 + case TS_FSM_SINGLE: 180 + if (end_of_data()) 181 + goto no_match; 182 + 183 + if (!match_token(cur, data[block_idx])) 184 + TOKEN_MISMATCH(); 185 + break; 186 + 187 + case TS_FSM_PERHAPS: 188 + 
if (end_of_data() || 189 + !match_token(cur, data[block_idx])) 190 + continue; 191 + break; 192 + 193 + case TS_FSM_MULTI: 194 + if (end_of_data()) 195 + goto no_match; 196 + 197 + if (!match_token(cur, data[block_idx])) 198 + TOKEN_MISMATCH(); 199 + 200 + block_idx++; 201 + /* fall through */ 202 + 203 + case TS_FSM_ANY: 204 + if (next == NULL) 205 + goto found_match; 206 + 207 + if (end_of_data()) 208 + continue; 209 + 210 + while (!match_token(next, data[block_idx])) { 211 + if (!match_token(cur, data[block_idx])) 212 + TOKEN_MISMATCH(); 213 + block_idx++; 214 + if (end_of_data()) 215 + goto no_match; 216 + } 217 + continue; 218 + 219 + /* 220 + * Optimization: Prefer small local loop over jumping 221 + * back and forth until garbage at head is munched. 222 + */ 223 + case TS_FSM_HEAD_IGNORE: 224 + if (end_of_data()) 225 + continue; 226 + 227 + while (!match_token(next, data[block_idx])) { 228 + /* 229 + * Special case, don't start over upon 230 + * a mismatch, give the user the 231 + * chance to specify the type of data 232 + * allowed to be ignored. 
233 + */ 234 + if (!match_token(cur, data[block_idx])) 235 + goto no_match; 236 + 237 + block_idx++; 238 + if (end_of_data()) 239 + goto no_match; 240 + } 241 + 242 + match_start = consumed + block_idx; 243 + continue; 244 + } 245 + 246 + block_idx++; 247 + } 248 + 249 + if (end_of_data()) 250 + goto found_match; 251 + 252 + no_match: 253 + return UINT_MAX; 254 + 255 + found_match: 256 + state->offset = consumed + block_idx; 257 + return match_start; 258 + } 259 + 260 + static struct ts_config *fsm_init(const void *pattern, unsigned int len, 261 + int gfp_mask) 262 + { 263 + int i, err = -EINVAL; 264 + struct ts_config *conf; 265 + struct ts_fsm *fsm; 266 + struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; 267 + unsigned int ntokens = len / sizeof(*tokens); 268 + size_t priv_size = sizeof(*fsm) + len; 269 + 270 + if (len % sizeof(struct ts_fsm_token) || ntokens < 1) 271 + goto errout; 272 + 273 + for (i = 0; i < ntokens; i++) { 274 + struct ts_fsm_token *t = &tokens[i]; 275 + 276 + if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) 277 + goto errout; 278 + 279 + if (t->recur == TS_FSM_HEAD_IGNORE && 280 + (i != 0 || i == (ntokens - 1))) 281 + goto errout; 282 + } 283 + 284 + conf = alloc_ts_config(priv_size, gfp_mask); 285 + if (IS_ERR(conf)) 286 + return conf; 287 + 288 + fsm = ts_config_priv(conf); 289 + fsm->ntokens = ntokens; 290 + memcpy(fsm->tokens, pattern, len); 291 + 292 + for (i = 0; i < fsm->ntokens; i++) { 293 + struct ts_fsm_token *t = &fsm->tokens[i]; 294 + t->type = token_map[t->type]; 295 + } 296 + 297 + return conf; 298 + 299 + errout: 300 + return ERR_PTR(err); 301 + } 302 + 303 + static void *fsm_get_pattern(struct ts_config *conf) 304 + { 305 + struct ts_fsm *fsm = ts_config_priv(conf); 306 + return fsm->tokens; 307 + } 308 + 309 + static unsigned int fsm_get_pattern_len(struct ts_config *conf) 310 + { 311 + struct ts_fsm *fsm = ts_config_priv(conf); 312 + return fsm->ntokens * sizeof(struct ts_fsm_token); 313 + } 314 + 
315 + static struct ts_ops fsm_ops = { 316 + .name = "fsm", 317 + .find = fsm_find, 318 + .init = fsm_init, 319 + .get_pattern = fsm_get_pattern, 320 + .get_pattern_len = fsm_get_pattern_len, 321 + .owner = THIS_MODULE, 322 + .list = LIST_HEAD_INIT(fsm_ops.list) 323 + }; 324 + 325 + static int __init init_fsm(void) 326 + { 327 + return textsearch_register(&fsm_ops); 328 + } 329 + 330 + static void __exit exit_fsm(void) 331 + { 332 + textsearch_unregister(&fsm_ops); 333 + } 334 + 335 + MODULE_LICENSE("GPL"); 336 + 337 + module_init(init_fsm); 338 + module_exit(exit_fsm);