Template repo for tiny cross-platform apps that can be modified on phone, tablet or computer.
1local utf8 = require 'utf8'
2
3local my_utf8 = {}
4
-- Translate a code point position into a byte offset within s.
--   s:   string expected to hold UTF-8 text
--   pos: position in code points
-- Returns the byte offset reported by utf8.offset(s, pos).
-- Raises an error (a plain string, with no source-position prefix) when
-- utf8.offset cannot find that position in s.
function my_utf8.offset(s, pos)
  -- Position 1 is answered directly, without consulting the utf8 library.
  if pos == 1 then return 1 end
  local result = utf8.offset(s, pos)
  if result == nil then
    -- utf8.len yields nil plus a byte position when s is not valid UTF-8.
    -- Feeding that nil to a %d placeholder would make string.format raise
    -- its own error and hide this diagnostic, so the length is rendered
    -- as text first.
    local char_count, bad_byte = utf8.len(s)
    local length_text
    if char_count then
      length_text = tostring(char_count)
    else
      length_text = ('unknown; invalid UTF-8 at byte %d'):format(bad_byte)
    end
    -- Level 0 keeps the raised value identical to what assert(false, msg)
    -- would produce: the message alone, with no position information added.
    error(('my_utf8.offset(%d) called on a string of length %s (byte size %d); this is likely a failure to handle utf8\n\n^%s$\n'):format(pos, length_text, #s, s), 0)
  end
  return result
end
13
-- Return the piece of s that holds the single code point at position pos
-- (in code points). Despite the name, the result is a string slice produced
-- by my_utf8.sub over the half-open range [pos, pos+1), not a number.
function my_utf8.codepoint(s, pos)
  local following_pos = pos + 1
  local char = my_utf8.sub(s, pos, following_pos)
  return char
end
17
-- Slice s by code point positions rather than by bytes.
--   start_pos: first code point to include
--   end_pos:   first code point to leave out (exclusive bound)
-- Both positions are converted to byte offsets with my_utf8.offset, which
-- raises if a position cannot be found in s.
function my_utf8.sub(s, start_pos, end_pos)
  local first_byte = my_utf8.offset(s, start_pos)
  local stop_byte = my_utf8.offset(s, end_pos)
  -- stop_byte is where the excluded code point begins, so the slice ends
  -- one byte before it.
  local last_byte = stop_byte - 1
  return s:sub(first_byte, last_byte)
end
24
-- Apply the Lua pattern pat to the single character found at code point
-- position pos of s. Returns whatever string.match returns for that
-- one-character string.
function my_utf8.match_at(s, pos, pat)
  local char = my_utf8.codepoint(s, pos)
  return char:match(pat)
end
28
-- Build an iterator over the code points of s, beginning at code point
-- position startpos (1 when omitted).
-- Each call of the iterator yields three values:
--   the position of the code point (counted in code points),
--   its numeric value as reported by utf8.codepoint,
--   the bytes of s that encode it, as a string.
-- Once the byte cursor has moved past the end of s, the iterator returns
-- nothing.
function my_utf8.chars(s, startpos)
  local char_index = startpos or 1               -- cursor, in code points
  local byte_index = utf8.offset(s, char_index)  -- cursor, in bytes
  local function step()
    -- The byte cursor is nil when utf8.offset could not find a position.
    assert(byte_index)
    local index, first_byte = char_index, byte_index
    char_index = char_index + 1
    if first_byte > #s then
      return
    end
    local code = utf8.codepoint(s, first_byte)
    -- Move the byte cursor to where the following code point starts.
    byte_index = utf8.offset(s, 2, byte_index)
    assert(byte_index)
    return index, code, s:sub(first_byte, byte_index - 1)
  end
  return step
end
46
47return my_utf8