string.subutf8(string, start[,end]) substrings, UTF-8 aware
pos, char = string.nextutf8(string, orig_pos) returns the char at orig_pos and the next char's position in pos.
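for pos, char in string.nextutf8, str, orig_pos do ... end iterates through the string, starting at orig_pos; on each step char is the current character and pos is the position of the character that follows it.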
pos = string.seekutf8(string, orig_pos, n) returns the position orig_pos, N characters forward (or backwards, if N negative).
char = string.utf8char(code) returns the char the code of which is code.
code = string.utf8code(char) returns the code of char (UTF-8 character).
len = string.utf8len(string) returns the length of string in UTF-8 characters.
UTF-8 BOM has by convention a code of 0. Valid code ranges are: 0-0xD7FF, 0xE000-0x10FFFF.
Unicode is a universal character set, widely used in XML documents.
The point of this is that Unicode codepoints are 0-21 bits in length. With UTF-8, ASCII characters are stored as one byte, others use from 2 to 4 bytes. See [RFC 3629].
The previous paragraph formerly linked to [RFC 2279], which has been obsoleted by RFC-3629 to bring it into alignment with the Unicode Standard [1]. A reasonably fast standards-compliant pure Lua library can be found at [2]. (link broken)
StephaneArnold 2007年11月13日 - I deleted the posted code because it was not compliant with the latest UTF-8 standard. I have converted some functions of the 'pure Lua library' to C functions:
lua_utf8.c
/*==================================================================*/
/* C program by sarnold@free.fr 2007, MIT license
based on the work of Rici Lake rici@ricilake.net */
/*==================================================================*/
#include <memory.h>
#include <stdlib.h>
#include <string.h>
#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"
/* Error message raised when a string is not well-formed UTF-8. */
#define INVALID_UTF8 "invalid utf-8 string"
/* True when the byte p points at is a 7-bit (single-byte) character. */
#define POINTS_ASCII(p) (*((const unsigned char*)(p)) < 128)
/* True when min <= x <= max.  NOTE: x is evaluated twice, so do not pass an
   expression with side effects. */
#define RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
/* True when x is a UTF-8 continuation byte (10xxxxxx, i.e. 128..191). */
#define RANGE_SND(x) RANGE(x, 128, 191)
/* True when p points at the three bytes EF BB BF (the UTF-8 byte-order mark).
   Bytes are compared as unsigned so that plain char pointers work as well;
   the && chain stops at the first mismatch, so a NUL-terminated string is
   never read past its terminator. */
#define UTF8_BOM(p) ((unsigned char)(p)[0] == 0xEF \
                  && (unsigned char)(p)[1] == 0xBB \
                  && (unsigned char)(p)[2] == 0xBF)
/* Measure the UTF-8 character that starts at str.
   Returns its length in bytes (1 to 4), or 0 when the bytes at str do not
   form a well-formed UTF-8 sequence.  Trailing bytes are only inspected after
   the preceding ones were accepted, so a NUL terminator stops the scan. */
int sarn_utf8_next(const unsigned char* str)
{
    const unsigned char lead = str[0];
    /* Accepted range for the first trailing byte; narrowed below for the
       lead bytes that would otherwise allow overlong or out-of-range codes. */
    unsigned char lo = 128;
    unsigned char hi = 191;

    /* 7-bit characters (NUL included) are one byte long. */
    if (lead <= 127)
        return 1;
    /* The byte-order mark is recognised explicitly. */
    if (UTF8_BOM(str))
        return 3;
    /* 128..193 and 245..255 can never start a character. */
    if (lead <= 193 || lead >= 245)
        return 0;

    /* 194..223: two-byte sequence. */
    if (lead <= 223)
        return RANGE_SND(str[1]) ? 2 : 0;

    /* 224..239: three-byte sequence. */
    if (lead <= 239) {
        if (lead == 224)
            lo = 160;          /* lower second bytes would be overlong */
        else if (lead == 237)
            hi = 159;          /* higher second bytes are surrogates */
        if (RANGE(str[1], lo, hi) && RANGE_SND(str[2]))
            return 3;
        return 0;
    }

    /* 240..244: four-byte sequence. */
    if (lead == 240)
        lo = 144;              /* lower second bytes would be overlong */
    else if (lead == 244)
        hi = 143;              /* higher second bytes exceed 0x10FFFF */
    if (RANGE(str[1], lo, hi) && RANGE_SND(str[2]) && RANGE_SND(str[3]))
        return 4;
    return 0;
}
/* Step str one byte backwards, or make the enclosing function return 0 when
   no byte is left in front of it.  remain is decremented first; reaching 0
   means the start of the buffer was hit.  Wrapped in do/while so the macro
   behaves as a single statement. */
#define BACK(str, remain) do { if (--(remain) == 0) return 0; (str)--; } while (0)

/* Measure the UTF-8 character that ends immediately before str.
   str    - points just past the character to measure.
   remain - number of bytes from the start of the buffer up to and including
            *str; at most remain-1 bytes in front of str are examined.
            Expected to be >= 1.
   Returns the length in bytes (1 to 4) of the preceding character, or 0 when
   the start of the buffer is reached or the bytes are not well-formed UTF-8.
   The accepted byte ranges mirror those of sarn_utf8_next. */
int sarn_utf8_prev(unsigned char* str, int remain)
{
    BACK(str, remain);
    if (*str < 128)
        return 1;
    BACK(str, remain);
    /* Two-byte lead bytes are 194..223 (0xC2..0xDF); 192 and 193 would be
       overlong and 224 starts a three-byte sequence. */
    if (RANGE(*str, 194, 223) && RANGE_SND(str[1]))
        return 2;
    BACK(str, remain);
    if (UTF8_BOM(str))
        return 3;
    if (RANGE(*str, 225, 239) && *str != 237
        && RANGE_SND(str[1]) && RANGE_SND(str[2]))
        return 3;
    /* Lead 224: second byte below 160 would be an overlong encoding. */
    if (*str == 224 && RANGE(str[1], 160, 191) && RANGE_SND(str[2]))
        return 3;
    /* Lead 237: second byte above 159 would encode a surrogate
       (U+D800..U+DFFF), which is not valid UTF-8. */
    if (*str == 237 && RANGE(str[1], 128, 159) && RANGE_SND(str[2]))
        return 3;
    BACK(str, remain);
    if (RANGE(*str, 241, 243) && RANGE_SND(str[1])
        && RANGE_SND(str[2]) && RANGE_SND(str[3]))
        return 4;
    /* Lead 240: second byte below 144 would be an overlong encoding. */
    if (*str == 240 && RANGE(str[1], 144, 191)
        && RANGE_SND(str[2]) && RANGE_SND(str[3]))
        return 4;
    /* Lead 244: second byte above 143 would exceed U+10FFFF. */
    if (*str == 244 && RANGE(str[1], 128, 143)
        && RANGE_SND(str[2]) && RANGE_SND(str[3]))
        return 4;
    /* No well-formed character ends here. */
    return 0;
}
/** Realign a position on a UTF-8 character boundary.
    str   - points at the byte to realign.  NOTE(review): the loop only ever
            reads at str, str-1, str-2 and str-3, so str is presumably the
            buffer already advanced by index bytes - confirm with callers.
    index - number of bytes available in front of str; the search never steps
            back further than this.
    Returns the offset (0 to 3) to be seeked backwards to reach the first byte
    of the character covering *str, or -1 if no such character is found.
    A character starting at str-i is accepted only when it is longer than i
    bytes, i.e. when it actually spans *str; this rejects, for instance, a
    stray continuation byte that merely follows a complete character.
*/
int sarn_utf8_realign(unsigned char* str, size_t index)
{
    size_t back;
    for (back = 0; back < 4 && back <= index; back++) {
        int len = sarn_utf8_next(str - back);
        if (len > 0 && (size_t)len > back)
            return (int)back;
    }
    return -1;
}
/* Lua: pos, char = string.nextutf8(str, orig_pos)
   Returns the 1-based byte position that follows the character starting at
   orig_pos, and that character itself.
   Returns a single nil when orig_pos is negative or lies beyond the string
   (measured up to its first NUL byte).
   Raises an error when orig_pos is 0 or when the bytes at orig_pos are not
   well-formed UTF-8. */
int sarn_utf8_next_func(lua_State* L)
{
    const char *str = luaL_checkstring(L, 1);
    long pos = luaL_checklong(L, 2);
    int clen;

    /* Out of range: signal the end of the iteration with nil. */
    if (pos < 0 || strlen(str) < (size_t)pos) {
        lua_pushnil(L);
        return 1;
    }
    if (pos == 0)
        return luaL_error(L, "bad index value : 0");
    clen = sarn_utf8_next((const unsigned char *)str + pos - 1);
    if (!clen)
        return luaL_error(L, INVALID_UTF8);
    lua_pushnumber(L, (lua_Number)(pos + clen));
    /* Push exactly clen bytes; no intermediate NUL-terminated copy needed. */
    lua_pushlstring(L, str + pos - 1, (size_t)clen);
    return 2;
}
/* Lua: len = string.utf8len(str)
   Counts the UTF-8 characters of str up to its first NUL byte.
   Raises an error when a malformed sequence is met. */
int sarn_utf8_len_func(lua_State *L)
{
    const unsigned char *cursor = (const unsigned char*) luaL_checkstring(L, 1);
    size_t count = 0;

    for (; *cursor != 0; count++) {
        /* 7-bit bytes are one character each; anything else is measured. */
        int step = POINTS_ASCII(cursor) ? 1 : sarn_utf8_next(cursor);
        if (step == 0)
            return luaL_error(L, INVALID_UTF8);
        cursor += step;
    }
    lua_pushnumber(L, count);
    return 1;
}
/* Lua: pos = string.seekutf8(str, orig_pos, n)
   Returns the 1-based byte position reached by moving n characters forward
   from orig_pos (backwards when n is negative).
   - n == 0 returns orig_pos unchanged, without validating it.
   - Raises an error when orig_pos is outside 1..len, len being the length of
     str up to its first NUL byte.
   - Returns nil when the move leaves the string or meets malformed UTF-8.
   - Moving forward over the last character yields len+1 (one past the end),
     whether that character is ASCII or multi-byte. */
int sarn_utf8_seek_func(lua_State *L)
{
    unsigned char* str;
    int pos, shift;
    int clen, len;
    str = (unsigned char*)luaL_checkstring(L, 1);
    pos = (int)luaL_checklong(L, 2);
    shift = (int)luaL_checklong(L, 3);
    len = (int)strlen((const char*)str);
    if (shift == 0) {
        lua_pushinteger(L, pos);
        return 1;
    }
    if (pos > len || pos < 1)
        return luaL_error(L, "invalid index (arg #2)");
    /* then, pos is 0-based */
    pos--;
    /* A string of len bytes holds at most len characters. */
    if (shift > len || shift < -len) {
        /* out of range */
        lua_pushnil(L);
        return 1;
    }
    if (shift < 0) {
        while ((shift++) != 0) {
            clen = sarn_utf8_prev(str+pos, pos+1);
            if (clen == 0 || pos+1 < clen) {
                lua_pushnil(L);
                return 1;
            }
            pos -= clen;
        }
    } else {
        while ((shift--) != 0) {
            /* Already one past the end: there is no character left to skip.
               Without this check the NUL terminator would be treated as an
               ASCII character and the scan would run past the buffer. */
            if (pos >= len) {
                lua_pushnil(L);
                return 1;
            }
            clen = POINTS_ASCII(str+pos) ? 1 : sarn_utf8_next(str+pos);
            if (clen == 0 || pos+clen > len) {
                lua_pushnil(L);
                return 1;
            }
            pos += clen;
        }
    }
    lua_pushinteger(L, pos+1);
    return 1;
}
/* Lua: char = string.utf8char(code)
   Returns the UTF-8 encoding of the code point `code`.
   By the convention used in this file, code 0 yields the UTF-8 byte-order
   mark (EF BB BF).
   Raises "invalid utf-8 code" for negative values, surrogates
   (0xD800..0xDFFF) and values above 0x10FFFF. */
int sarn_utf8_char_func(lua_State *L)
{
    long int i = luaL_checklong(L, 1);
    unsigned long int code;
    unsigned char result[4];
    size_t n;

    if (i < 0 || i > 0x10FFFFL || (i >= 0xD800 && i <= 0xDFFF))
        return luaL_error(L, "invalid utf-8 code");
    code = (unsigned long int)i;

    if (code == 0) {
        /* UTF8 BOM */
        lua_pushstring(L, "\xEF\xBB\xBF");
        return 1;
    }
    if (code < 0x80) {
        /* 0xxxxxxx */
        result[0] = (unsigned char)code;
        n = 1;
    } else if (code < 0x800) {
        /* 110xxxxx 10xxxxxx */
        result[0] = (unsigned char)(0xC0 + (code >> 6));
        result[1] = (unsigned char)(0x80 + (code & 0x3F));
        n = 2;
    } else if (code < 0x10000UL) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        result[0] = (unsigned char)(0xE0 + (code >> 12));
        result[1] = (unsigned char)(0x80 + ((code >> 6) & 0x3F));
        result[2] = (unsigned char)(0x80 + (code & 0x3F));
        n = 3;
    } else {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        result[0] = (unsigned char)(0xF0 + (code >> 18));
        result[1] = (unsigned char)(0x80 + ((code >> 12) & 0x3F));
        result[2] = (unsigned char)(0x80 + ((code >> 6) & 0x3F));
        result[3] = (unsigned char)(0x80 + (code & 0x3F));
        n = 4;
    }
    /* Push with an explicit length so no NUL terminator is required. */
    lua_pushlstring(L, (const char*)result, n);
    return 1;
}
/* Lua: code = string.utf8code(char)
   Returns the code point of char, which must consist of exactly one
   well-formed UTF-8 character; otherwise an error is raised.
   The UTF-8 byte-order mark maps to 0. */
int sarn_utf8_code_func(lua_State *L)
{
    /* Payload bits of the lead byte, indexed by sequence length - 1. */
    static const unsigned char lead_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
    size_t nbytes, k;
    unsigned long int value;
    const unsigned char *bytes =
        (const unsigned char*)luaL_checklstring(L, 1, &nbytes);

    /* The whole argument must be one single character. */
    if (nbytes != sarn_utf8_next(bytes))
        return luaL_error(L, INVALID_UTF8);
    if (UTF8_BOM(bytes)) {
        lua_pushinteger(L, 0);
        return 1;
    }
    /* Keep the payload of the lead byte, then append 6 bits per trailing byte. */
    value = bytes[0] & lead_mask[nbytes - 1];
    for (k = 1; k < nbytes; k++)
        value = (value << 6) | (bytes[k] & 0x3F);
    lua_pushinteger(L, value);
    return 1;
}
/* Module entry point: installs the UTF-8 helpers as fields of the global
   `string` table (nextutf8, utf8len, seekutf8, utf8code, utf8char).
   Returns no value to Lua. */
int luaopen_libluautf8 (lua_State *L)
{
    static const struct {
        const char *field;
        lua_CFunction fn;
    } exported[] = {
        { "nextutf8", sarn_utf8_next_func },
        { "utf8len",  sarn_utf8_len_func  },
        { "seekutf8", sarn_utf8_seek_func },
        { "utf8code", sarn_utf8_code_func },
        { "utf8char", sarn_utf8_char_func }
    };
    size_t k;

    lua_getglobal(L, "string");
    for (k = 0; k < sizeof(exported) / sizeof(exported[0]); k++) {
        lua_pushcfunction(L, exported[k].fn);
        lua_setfield(L, -2, exported[k].field);
    }
    return 0;
}
Makefile
# Builds libluautf8.so from lua_utf8.c.
all: compile

# Flags for the compile step and for the shared-library link step.
LUA_CFLAGS=-O2 -fpic
LUA_LDFLAGS=-O -shared -fpic

compile: lua_utf8

# Compile the object file, then link it into the loadable module.
lua_utf8: lua_utf8.c
	$(CC) $(CFLAGS) $(LUA_CFLAGS) -c lua_utf8.c
	$(CC) $(CFLAGS) $(LUA_LDFLAGS) -o libluautf8.so lua_utf8.o
-- UTF-8 aware string wrapper built on the C functions that libluautf8
-- installs into the `string` table (utf8len, seekutf8, nextutf8).
-- module() is kept on purpose: the module table and its `unicodize` field
-- are part of the existing interface.
module(..., package.seeall)
require 'libluautf8'

local mt = {}
local unistr = {}

-- Wraps the byte string `str` (default: empty string) into a unicode string.
function unistr:new(str)
  return setmetatable({ value = str or '' }, mt)
end

-- redirects methods to unistr
-- `length` is computed on demand; `value` is read raw so that a missing
-- field yields nil instead of re-entering this function forever.
mt.__index = function(t, key)
  if key == 'length' then
    return string.utf8len(t.value)
  end
  if key == 'value' then
    return rawget(t, 'value')
  end
  return unistr[key]
end

-- Byte position of the first byte of character number k of s.
-- Expects a non-empty s and 1 <= k <= (character count + 1); the position
-- one past the last character is #s + 1.  seekutf8 may answer nil when the
-- end of the string is reached, hence the fallback.
local function char_start(s, k)
  if k == 1 then
    return 1
  end
  return s:seekutf8(1, k - 1) or #s + 1
end

-- substrings, utf8 ready
-- it might be very expensive
-- isn't every encoding function expensive compared to raw access
-- to bytes
-- first, last: character indices with the same meaning as in string.sub
-- (negative values count from the end, last defaults to the end).
-- Returns a plain Lua string holding whole characters only.
-- Raises an error when the wrapped string is not well-formed UTF-8, because
-- the character count is needed to resolve and clamp the indices.
function unistr:sub(first, last)
  local s = self.value
  local count = s:utf8len()
  if last == nil then
    last = count
  end
  -- negative indices are counted backwards from the end
  if first < 0 then
    first = count + first + 1
  end
  if last < 0 then
    last = count + last + 1
  end
  -- clamp to the existing characters, like string.sub does for bytes
  if first < 1 then
    first = 1
  end
  if last > count then
    last = count
  end
  if first > last then
    return ''
  end
  -- The slice ends on the byte just before the character that follows
  -- `last`, so a multi-byte final character is kept whole.
  return s:sub(char_start(s, first), char_start(s, last + 1) - 1)
end

-- Returns the wrapped byte string of a unicode string; any other value
-- (string, number, nil, ...) is passed through untouched.
local function u2s(str)
  if type(str) == 'table' and getmetatable(str) == mt then
    return str.value
  end
  return str
end

-- unicode strings concat
function mt.__concat(a, b)
  return unistr:new(u2s(a) .. u2s(b))
end

-- encoded string length with a metatable is not possible
-- so let's stick with a len() method
function unistr:len()
  return self.value:utf8len()
end

-- iterator
-- Meant for a generic for: `for pos, char in ustr:each() do ... end`.
function unistr:each(pos)
  return string.nextutf8, self.value, pos or 1
end

-- creates a global "u" function to be used like that:
-- str = u"Hello" (it feels Python-like but is really a Lua function)
-- then, thanks to the metatable mechanism, concatenation and other funcs
-- can be invoked as if it was a simple scalar of type string
_G.u = function(str)
  return unistr:new(str)
end

-- Returns a wrapper around f that unwraps every unicode string argument
-- before calling f; all arguments (including nils) are forwarded.
function unicodize(f)
  return function(...)
    local n = select('#', ...)
    local args = { ... }
    for i = 1, n do
      args[i] = u2s(args[i])
    end
    return f(unpack(args, 1, n))
  end
end

_G.print = unicodize(print)

-- return this function
return _G.u
Test code
-- Smoke test for the utf8 module: compares unicode substrings of an ASCII
-- string against string.sub, checks character counts, and prints a
-- concatenation.  Only mismatches are reported.
require 'utf8'

local a = u'hello'
local b = "hello"

-- Prints "<name>[<got>|<expected>]" when the two values differ.
local function assertEqual(name, got, expected)
  if got ~= expected then
    print(name.."["..got..'|'..expected..']')
  else
    --print(name.."...OK")
  end
end

for i = 0, 10 do
  assertEqual("sub1."..i, a:sub(i), b:sub(i))
end

for i = 0, 5 do
  for j = i, 10 do
    assertEqual("sub2."..i.."-"..j, a:sub(i, j), b:sub(i, j))
  end
end

-- Each entry: { text, expected number of characters }.
local lentest = {{"h",1},{"",0},{"hel",3},{"hi Stéphane",11}}
for idx, val in ipairs(lentest) do
  local str = u(val[1])
  assertEqual("len1."..idx, str.length, val[2])
end

local firstName = u"Stéphane"
local lastName = u"Arnold"
print("hello "..firstName.." "..lastName)