UTF-8字符串在Lua中每个字符所占的字节数并不相同:中文、英文以及其他字符各有不同的字节长度,因此UTF-8被称为变长编码。规则如下:
1. utf8字符是变长字符
2. 字符长度有规律
UTF-8字符规律
每个字符的首个byte表示了该UTF-8字符的长度(字节数)
0xxxxxxx - 1 byte
110yxxxx - 192, 2 byte
1110yyyy - 224, 3 byte
11110zzz - 240, 4 byte
-- Module table that collects the UTF-8 helper functions defined below.
local UTF8 = {}

--- Return the number of bytes occupied by a UTF-8 character, judged from
-- the numeric value of its lead byte.
--
-- Lead-byte ranges:
--   0xxxxxxx (0x00-0x7F) -> 1 byte
--   110xxxxx (0xC0-0xDF) -> 2 bytes
--   1110xxxx (0xE0-0xEF) -> 3 bytes
--   11110xxx (0xF0-0xF7) -> 4 bytes
--
-- The boundaries are inclusive: 0xC0 (192), 0xE0 (224) and 0xF0 (240) are
-- themselves the first lead byte of their class, so the comparisons must
-- use >= rather than >.
--
-- @param char numeric byte value (as returned by string.byte), or nil
-- @return 0 when char is nil/false; otherwise 1, 2, 3 or 4. Any byte below
--         192 (ASCII or a stray continuation byte) counts as 1.
function UTF8.chSize(char)
    if not char then
        return 0
    elseif char >= 240 then
        return 4
    elseif char >= 224 then
        return 3
    elseif char >= 192 then
        return 2
    else
        return 1
    end
end
--- Extract a run of UTF-8 characters from a string, appending "..." when
-- the string continues past the extracted run.
--
-- Positions and lengths are measured in characters, not bytes; the width
-- of each character is obtained from UTF8.chSize applied to its lead byte.
--
-- @param str       source string; nil yields ""
-- @param startChar 1-based index of the first character to keep (default 1)
-- @param numChars  maximum number of characters to keep (default 15)
-- @return the extracted text, followed by "..." if bytes remain after it
function UTF8.sub(str, startChar, numChars)
    if str == nil then
        return ""
    end
    if startChar == nil then
        startChar = 1
    end
    if numChars == nil then
        numChars = 15
    end

    local byteLen = #str

    -- Skip the first (startChar - 1) characters to find the byte offset of
    -- the first wanted character; stop early if the string runs out.
    local startIndex = 1
    while startChar > 1 and startIndex <= byteLen do
        startIndex = startIndex + UTF8.chSize(string.byte(str, startIndex))
        startChar = startChar - 1
    end

    -- Advance over at most numChars characters. The limit is a character
    -- count, so it is tracked separately from the byte cursor.
    local currentIndex = startIndex
    local taken = 0
    while taken < numChars and currentIndex <= byteLen do
        currentIndex = currentIndex + UTF8.chSize(string.byte(str, currentIndex))
        taken = taken + 1
    end

    local piece = str:sub(startIndex, currentIndex - 1)
    -- Only mark truncation when text actually remains after the piece.
    if currentIndex <= byteLen then
        return piece .. "..."
    end
    return piece
end
--- Count the UTF-8 characters in a string.
--
-- Walks the string one character at a time, using UTF8.chSize on each lead
-- byte to decide how many bytes to step over.
--
-- @param str string to measure; nil is treated as empty, matching the nil
--            tolerance of UTF8.sub
-- @return number of characters (0 for nil or the empty string)
function UTF8.length(str)
    if str == nil then
        return 0
    end

    local count = 0
    local index = 1
    local byteLen = #str
    while index <= byteLen do
        index = index + UTF8.chSize(string.byte(str, index))
        count = count + 1
    end
    return count
end
--- Convert a value to a string.
--
-- * A string is returned unchanged.
-- * A table is treated as a sequence of pieces that are joined in index
--   order (1, 2, 3, ...); joining stops at the first "\0" entry, which is
--   not included in the result.
-- * Any other type yields "".
--
-- ipairs is used instead of pairs because pairs does not guarantee any
-- traversal order, which could scramble the joined text.
--
-- @param str string, table of pieces, or any other value
-- @return the resulting string
function UTF8.toString(str)
    if type(str) == "string" then
        return str
    end
    if type(str) ~= "table" then
        return ""
    end

    -- Collect pieces and join once, avoiding repeated string concatenation.
    local parts = {}
    for i, piece in ipairs(str) do
        if piece == "\0" then
            break
        end
        parts[i] = piece
    end
    return table.concat(parts)
end
-- Hand the UTF8 table back to whoever loaded this chunk.
return UTF8;