-- Helpers for addressing a UTF-8 encoded string by code point position
-- rather than by byte offset. Built on the standard utf8 library.
local utf8 = require 'utf8'

local my_utf8 = {}

-- Return the byte offset at which code point number `pos` of `s` starts.
--   s:   UTF-8 encoded string
--   pos: code point position (1-based)
-- Raises an error describing `s` when utf8.offset reports no such position.
function my_utf8.offset(s, pos)
  -- the first code point always starts at byte 1
  if pos == 1 then return 1 end
  local result = utf8.offset(s, pos)
  if result == nil then
    -- utf8.len returns nil when s is not valid UTF-8; formatting that nil
    -- with %d would raise its own error and hide this diagnostic, so the
    -- length is rendered as text with an explicit fallback.
    local len = utf8.len(s)
    local len_text = len and tostring(len) or 'unknown; invalid UTF-8'
    -- level 0: raise the message as is, without a position prefix
    error(('my_utf8.offset(%d) called on a string of length %s (byte size %d); this is likely a failure to handle utf8\n\n^%s$\n'):format(pos, len_text, #s, s), 0)
  end
  return result
end

-- Return the bytes of the single character at code point position `pos`
-- of `s`. Despite the name, the result is a string, not a number.
function my_utf8.codepoint(s, pos)
  return my_utf8.sub(s, pos, pos+1)
end

-- Return the substring of `s` from code point position `start_pos` up to,
-- but not including, code point position `end_pos` (end_pos is exclusive).
function my_utf8.sub(s, start_pos, end_pos)
  local start_offset = my_utf8.offset(s, start_pos)
  local end_offset = my_utf8.offset(s, end_pos)
  -- end_offset is the first byte of the excluded character
  return s:sub(start_offset, end_offset-1)
end

-- Match the character at code point position `pos` of `s` against the Lua
-- pattern `pat`; returns whatever string.match returns for that character.
function my_utf8.match_at(s, pos, pat)
  return my_utf8.codepoint(s, pos):match(pat)
end

-- Create a new iterator over the characters of `s`, starting at code point
-- position `startpos` (default 1). Each step yields three values:
--   the code point position, the numeric code point, and the UTF-8 bytes
--   of that character.
function my_utf8.chars(s, startpos)
  local next_pos = startpos or 1  -- in code points
  local next_offset = utf8.offset(s, next_pos)  -- in bytes
  return function()
    -- next_offset is nil only if startpos did not correspond to a position in s
    assert(next_offset)
    local curr_pos = next_pos
    next_pos = next_pos+1
    local curr_offset = next_offset
    -- past the last byte: iteration is finished
    if curr_offset > #s then return end
    local codepoint = utf8.codepoint(s, curr_offset)
    -- offset of the character following the one at next_offset
    next_offset = utf8.offset(s, 2, next_offset)
    assert(next_offset)
    local curr_char = s:sub(curr_offset, next_offset-1)
    return curr_pos, codepoint, curr_char
  end
end

return my_utf8