在VB中用FileSystemObject去读的话肯定不行,数据被破坏了,你只能得到乱码。
只能用open+get用二进制读到byte数组里,现在开始就讨厌了。
用strconv函数没有办法把这个字节数组正确地转成字符串。strconv(str/byte_array, format, localeId)。format是指望不上了,想想LocaleId是不是能用一下呀——也TNND的不行。其实确省的localeID的结果无非就是用936简体中文代码页进行转换,那微软为什么就不能给个参数用65001(utf-8)代码页转换就完事了?据说VB.NET可以,但是不是为了干这个俺还得全新地把.NET学一番?
谁知道有现成的代码可以用的?
根据VB糟糕的处理byte数组的能力,我能想到的只能是FOR NEXT循环去处理那个数组里的每一个字节了,那效率可真的快哦。。。——跟蜗牛比的话!
而且VB里好象还不支持位运算符,感谢M$,我折腾了一天,最后就想骂人了。。。
6 个解决方案
#1
up
#2
给你段老外的代码,用二进制读的时候,要注意前两个字节是格式标志
Option Explicit
Public m_bIsNt As Boolean
Public Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpDefaultChar As String, ByVal lpUsedDefaultChar As Long) As Long
Public Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
Public Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" (lpvDest As Any, lpvSource As Any, ByVal cbCopy As Long)
Public Const CP_UTF8 = 65001
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
Else
Dim i As Long
Dim TopIndex As Long
Dim TwoBytes(1) As Byte
Dim ThreeBytes(2) As Byte
Dim AByte As Byte
Dim TStr As String
Dim BArray() As Byte
'Resume on error in case someone inputs text with accents
'that should have been encoded as UTF-8
On Error Resume Next
TopIndex = Len(sUTF8) ' Number of bytes equal TopIndex+1
If TopIndex = 0 Then Exit Function ' get out if there's nothing to convert
BArray = StrConv(sUTF8, vbFromUnicode)
i = 0 ' Initialise pointer
TopIndex = TopIndex - 1
' Iterate through the Byte Array
Do While i <= TopIndex
AByte = BArray(i)
If AByte < &H80 Then
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
ElseIf AByte >= &HE0 Then 'was = &HE1 Then
' Start of 3 byte UTF-8 group for a character
' Copy 3 byte to ThreeBytes
ThreeBytes(0) = BArray(i): i = i + 1
ThreeBytes(1) = BArray(i): i = i + 1
ThreeBytes(2) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((ThreeBytes(0) And &HF) * &H1000 + (ThreeBytes(1) And &H3F) * &H40 + (ThreeBytes(2) And &H3F))
ElseIf (AByte >= &HC2) And (AByte <= &HDB) Then
' Start of 2 byte UTF-8 group for a character
TwoBytes(0) = BArray(i): i = i + 1
TwoBytes(1) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((TwoBytes(0) And &H1F) * &H40 + (TwoBytes(1) And &H3F))
Else
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
End If
Loop
UTF8_Decode = TStr ' Return the resultant string
Erase BArray
End If
EndFunction:
End Function
'Purpose:Convert Unicode string to UTF-8.
Public Function UTF8_Encode(ByVal strUnicode As String, Optional ByVal bHTML As Boolean = True) As String
Dim i As Long
Dim TLen As Long
Dim lPtr As Long
Dim UTF16 As Long
Dim UTF8_EncodeLong As String
TLen = Len(strUnicode)
If TLen = 0 Then Exit Function
If m_bIsNt Then
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
'Set buffer for longest possible string.
lngBufferSize = TLen * 3 + 1
ReDim bytUtf8(lngBufferSize - 1)
'Translate using code page 65001(UTF-8).
lngResult = WideCharToMultiByte(CP_UTF8, 0, StrPtr(strUnicode), _
TLen, bytUtf8(0), lngBufferSize, vbNullString, 0)
'Trim result to actual length.
If lngResult Then
lngResult = lngResult - 1
ReDim Preserve bytUtf8(lngResult)
'CopyMemory StrPtr(UTF8_Encode), bytUtf8(0&), lngResult
UTF8_Encode = StrConv(bytUtf8, vbUnicode)
' For i = 0 To lngResult
' UTF8_Encode = UTF8_Encode & Chr$(bytUtf8(i))
' Next
End If
Else
For i = 1 To TLen
' Get UTF-16 value of Unicode character
lPtr = StrPtr(strUnicode) + ((i - 1) * 2)
CopyMemory UTF16, ByVal lPtr, 2
'Convert to UTF-8
If UTF16 < &H80 Then ' 1 UTF-8 byte
UTF8_EncodeLong = Chr$(UTF16)
ElseIf UTF16 < &H800 Then ' 2 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HC0 + (UTF16 And &H1F)) & UTF8_EncodeLong ' Use 5 remaining bits
Else ' 3 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) & UTF8_EncodeLong ' Use next 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HE0 + (UTF16 And &HF)) & UTF8_EncodeLong ' Use 4 remaining bits
End If
UTF8_Encode = UTF8_Encode & UTF8_EncodeLong
Next
End If
'Substitute vbCrLf with HTML line breaks if requested.
If bHTML Then
UTF8_Encode = Replace$(UTF8_Encode, vbCrLf, "<br/>")
End If
End Function
Option Explicit
Public m_bIsNt As Boolean
Public Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpDefaultChar As String, ByVal lpUsedDefaultChar As Long) As Long
Public Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
Public Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" (lpvDest As Any, lpvSource As Any, ByVal cbCopy As Long)
Public Const CP_UTF8 = 65001
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
Else
Dim i As Long
Dim TopIndex As Long
Dim TwoBytes(1) As Byte
Dim ThreeBytes(2) As Byte
Dim AByte As Byte
Dim TStr As String
Dim BArray() As Byte
'Resume on error in case someone inputs text with accents
'that should have been encoded as UTF-8
On Error Resume Next
TopIndex = Len(sUTF8) ' Number of bytes equal TopIndex+1
If TopIndex = 0 Then Exit Function ' get out if there's nothing to convert
BArray = StrConv(sUTF8, vbFromUnicode)
i = 0 ' Initialise pointer
TopIndex = TopIndex - 1
' Iterate through the Byte Array
Do While i <= TopIndex
AByte = BArray(i)
If AByte < &H80 Then
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
ElseIf AByte >= &HE0 Then 'was = &HE1 Then
' Start of 3 byte UTF-8 group for a character
' Copy 3 byte to ThreeBytes
ThreeBytes(0) = BArray(i): i = i + 1
ThreeBytes(1) = BArray(i): i = i + 1
ThreeBytes(2) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((ThreeBytes(0) And &HF) * &H1000 + (ThreeBytes(1) And &H3F) * &H40 + (ThreeBytes(2) And &H3F))
ElseIf (AByte >= &HC2) And (AByte <= &HDB) Then
' Start of 2 byte UTF-8 group for a character
TwoBytes(0) = BArray(i): i = i + 1
TwoBytes(1) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((TwoBytes(0) And &H1F) * &H40 + (TwoBytes(1) And &H3F))
Else
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
End If
Loop
UTF8_Decode = TStr ' Return the resultant string
Erase BArray
End If
EndFunction:
End Function
'Purpose:Convert Unicode string to UTF-8.
Public Function UTF8_Encode(ByVal strUnicode As String, Optional ByVal bHTML As Boolean = True) As String
Dim i As Long
Dim TLen As Long
Dim lPtr As Long
Dim UTF16 As Long
Dim UTF8_EncodeLong As String
TLen = Len(strUnicode)
If TLen = 0 Then Exit Function
If m_bIsNt Then
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
'Set buffer for longest possible string.
lngBufferSize = TLen * 3 + 1
ReDim bytUtf8(lngBufferSize - 1)
'Translate using code page 65001(UTF-8).
lngResult = WideCharToMultiByte(CP_UTF8, 0, StrPtr(strUnicode), _
TLen, bytUtf8(0), lngBufferSize, vbNullString, 0)
'Trim result to actual length.
If lngResult Then
lngResult = lngResult - 1
ReDim Preserve bytUtf8(lngResult)
'CopyMemory StrPtr(UTF8_Encode), bytUtf8(0&), lngResult
UTF8_Encode = StrConv(bytUtf8, vbUnicode)
' For i = 0 To lngResult
' UTF8_Encode = UTF8_Encode & Chr$(bytUtf8(i))
' Next
End If
Else
For i = 1 To TLen
' Get UTF-16 value of Unicode character
lPtr = StrPtr(strUnicode) + ((i - 1) * 2)
CopyMemory UTF16, ByVal lPtr, 2
'Convert to UTF-8
If UTF16 < &H80 Then ' 1 UTF-8 byte
UTF8_EncodeLong = Chr$(UTF16)
ElseIf UTF16 < &H800 Then ' 2 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HC0 + (UTF16 And &H1F)) & UTF8_EncodeLong ' Use 5 remaining bits
Else ' 3 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) & UTF8_EncodeLong ' Use next 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HE0 + (UTF16 And &HF)) & UTF8_EncodeLong ' Use 4 remaining bits
End If
UTF8_Encode = UTF8_Encode & UTF8_EncodeLong
Next
End If
'Substitute vbCrLf with HTML line breaks if requested.
If bHTML Then
UTF8_Encode = Replace$(UTF8_Encode, vbCrLf, "<br/>")
End If
End Function
#3
把这一段放在一个模块里,
用UTF8_Decode函数来解码,用UTF8_Encode函数把Unicode转换为UTF-8
如果是win9x系统,m_bIsNt设为False,否则设为True
用UTF8_Decode函数来解码,用UTF8_Encode函数把Unicode转换为UTF-8
如果是win9x系统,m_bIsNt设为False,否则设为True
#4
Dim str_UTF8 As String
Dim str_Unicode As String
Open "C:\tmp_utf8.txt" For Input As 1
str_UTF8 = StrConv(InputB$(LOF(1), 1), vbUnicode)'读到字符串
str_UTF8 = Right(str_UTF8, Len(str_UTF8) - 1) '第一个字符是格式标志,无实际意义,去掉
Close 1
Module_UTF8.m_bIsNt = True
str_Unicode = UTF8_Decode(str_UTF8)'转换为Unicode
msgbox str_Unicode
Dim str_Unicode As String
Open "C:\tmp_utf8.txt" For Input As 1
str_UTF8 = StrConv(InputB$(LOF(1), 1), vbUnicode)'读到字符串
str_UTF8 = Right(str_UTF8, Len(str_UTF8) - 1) '第一个字符是格式标志,无实际意义,去掉
Close 1
Module_UTF8.m_bIsNt = True
str_Unicode = UTF8_Decode(str_UTF8)'转换为Unicode
msgbox str_Unicode
#5
楼上厉害!100cents r urs!
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
这句是不是可以拿掉哦。俺直接就传进一个Variant (VBByteArray)。
他原先是传进字符串,再转成字节数组的。
=============================================================
如果是win9x系统,m_bIsNt设为False,否则设为True
NT/XP/2000是true吧? 有点奇怪,这应该是说平台,好象应该是win16, win32的区别,怎么是isNT?
=============================================================
看了看,后面半截(m_bIsNt==false时),他的程序很有问题。例如他就在那里弄循环,而且循环的次数设定基于一个错误的长度(不过用这个数字来循环不一定会出错就是了)。
若是能用这样的pattern多好?(当然可以写得简单些,我这么写只不过看得清楚点,反正比循环快 ^_^ )
/[\xC0-\xDF][\x80-\xFF]|[\xE0-\xEF][\x80-\xFF]{2}|[\xF0-\xF7][\x80-\xFF]{3}|[\xF8-\xFB][\x80-\xFF]{4}|[\xFC-\xFD][\x80-\xFF]{5}/gi
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
这句是不是可以拿掉哦。俺直接就传进一个Variant (VBByteArray)。
他原先是传进字符串,再转成字节数组的。
=============================================================
如果是win9x系统,m_bIsNt设为False,否则设为True
NT/XP/2000是true吧? 有点奇怪,这应该是说平台,好象应该是win16, win32的区别,怎么是isNT?
=============================================================
看了看,后面半截(m_bIsNt==false时),他的程序很有问题。例如他就在那里弄循环,而且循环的次数设定基于一个错误的长度(不过用这个数字来循环不一定会出错就是了)。
若是能用这样的pattern多好?(当然可以写得简单些,我这么写只不过看得清楚点,反正比循环快 ^_^ )
/[\xC0-\xDF][\x80-\xFF]|[\xE0-\xEF][\x80-\xFF]{2}|[\xF0-\xF7][\x80-\xFF]{3}|[\xF8-\xFB][\x80-\xFF]{4}|[\xFC-\xFD][\x80-\xFF]{5}/gi
#6
plus, 俺还没有试过你给的程序,今天没时间,明天再说。估计俺中意这一段,单独拿出来用:
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
End If
End sub
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
End If
End sub
#1
up
#2
给你段老外的代码,用二进制读的时候,要注意前两个字节是格式标志
Option Explicit
Public m_bIsNt As Boolean
Public Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpDefaultChar As String, ByVal lpUsedDefaultChar As Long) As Long
Public Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
Public Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" (lpvDest As Any, lpvSource As Any, ByVal cbCopy As Long)
Public Const CP_UTF8 = 65001
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
Else
Dim i As Long
Dim TopIndex As Long
Dim TwoBytes(1) As Byte
Dim ThreeBytes(2) As Byte
Dim AByte As Byte
Dim TStr As String
Dim BArray() As Byte
'Resume on error in case someone inputs text with accents
'that should have been encoded as UTF-8
On Error Resume Next
TopIndex = Len(sUTF8) ' Number of bytes equal TopIndex+1
If TopIndex = 0 Then Exit Function ' get out if there's nothing to convert
BArray = StrConv(sUTF8, vbFromUnicode)
i = 0 ' Initialise pointer
TopIndex = TopIndex - 1
' Iterate through the Byte Array
Do While i <= TopIndex
AByte = BArray(i)
If AByte < &H80 Then
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
ElseIf AByte >= &HE0 Then 'was = &HE1 Then
' Start of 3 byte UTF-8 group for a character
' Copy 3 byte to ThreeBytes
ThreeBytes(0) = BArray(i): i = i + 1
ThreeBytes(1) = BArray(i): i = i + 1
ThreeBytes(2) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((ThreeBytes(0) And &HF) * &H1000 + (ThreeBytes(1) And &H3F) * &H40 + (ThreeBytes(2) And &H3F))
ElseIf (AByte >= &HC2) And (AByte <= &HDB) Then
' Start of 2 byte UTF-8 group for a character
TwoBytes(0) = BArray(i): i = i + 1
TwoBytes(1) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((TwoBytes(0) And &H1F) * &H40 + (TwoBytes(1) And &H3F))
Else
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
End If
Loop
UTF8_Decode = TStr ' Return the resultant string
Erase BArray
End If
EndFunction:
End Function
'Purpose:Convert Unicode string to UTF-8.
Public Function UTF8_Encode(ByVal strUnicode As String, Optional ByVal bHTML As Boolean = True) As String
Dim i As Long
Dim TLen As Long
Dim lPtr As Long
Dim UTF16 As Long
Dim UTF8_EncodeLong As String
TLen = Len(strUnicode)
If TLen = 0 Then Exit Function
If m_bIsNt Then
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
'Set buffer for longest possible string.
lngBufferSize = TLen * 3 + 1
ReDim bytUtf8(lngBufferSize - 1)
'Translate using code page 65001(UTF-8).
lngResult = WideCharToMultiByte(CP_UTF8, 0, StrPtr(strUnicode), _
TLen, bytUtf8(0), lngBufferSize, vbNullString, 0)
'Trim result to actual length.
If lngResult Then
lngResult = lngResult - 1
ReDim Preserve bytUtf8(lngResult)
'CopyMemory StrPtr(UTF8_Encode), bytUtf8(0&), lngResult
UTF8_Encode = StrConv(bytUtf8, vbUnicode)
' For i = 0 To lngResult
' UTF8_Encode = UTF8_Encode & Chr$(bytUtf8(i))
' Next
End If
Else
For i = 1 To TLen
' Get UTF-16 value of Unicode character
lPtr = StrPtr(strUnicode) + ((i - 1) * 2)
CopyMemory UTF16, ByVal lPtr, 2
'Convert to UTF-8
If UTF16 < &H80 Then ' 1 UTF-8 byte
UTF8_EncodeLong = Chr$(UTF16)
ElseIf UTF16 < &H800 Then ' 2 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HC0 + (UTF16 And &H1F)) & UTF8_EncodeLong ' Use 5 remaining bits
Else ' 3 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) & UTF8_EncodeLong ' Use next 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HE0 + (UTF16 And &HF)) & UTF8_EncodeLong ' Use 4 remaining bits
End If
UTF8_Encode = UTF8_Encode & UTF8_EncodeLong
Next
End If
'Substitute vbCrLf with HTML line breaks if requested.
If bHTML Then
UTF8_Encode = Replace$(UTF8_Encode, vbCrLf, "<br/>")
End If
End Function
Option Explicit
Public m_bIsNt As Boolean
Public Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpDefaultChar As String, ByVal lpUsedDefaultChar As Long) As Long
Public Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
Public Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" (lpvDest As Any, lpvSource As Any, ByVal cbCopy As Long)
Public Const CP_UTF8 = 65001
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
Else
Dim i As Long
Dim TopIndex As Long
Dim TwoBytes(1) As Byte
Dim ThreeBytes(2) As Byte
Dim AByte As Byte
Dim TStr As String
Dim BArray() As Byte
'Resume on error in case someone inputs text with accents
'that should have been encoded as UTF-8
On Error Resume Next
TopIndex = Len(sUTF8) ' Number of bytes equal TopIndex+1
If TopIndex = 0 Then Exit Function ' get out if there's nothing to convert
BArray = StrConv(sUTF8, vbFromUnicode)
i = 0 ' Initialise pointer
TopIndex = TopIndex - 1
' Iterate through the Byte Array
Do While i <= TopIndex
AByte = BArray(i)
If AByte < &H80 Then
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
ElseIf AByte >= &HE0 Then 'was = &HE1 Then
' Start of 3 byte UTF-8 group for a character
' Copy 3 byte to ThreeBytes
ThreeBytes(0) = BArray(i): i = i + 1
ThreeBytes(1) = BArray(i): i = i + 1
ThreeBytes(2) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((ThreeBytes(0) And &HF) * &H1000 + (ThreeBytes(1) And &H3F) * &H40 + (ThreeBytes(2) And &H3F))
ElseIf (AByte >= &HC2) And (AByte <= &HDB) Then
' Start of 2 byte UTF-8 group for a character
TwoBytes(0) = BArray(i): i = i + 1
TwoBytes(1) = BArray(i): i = i + 1
' Convert Byte array to UTF-16 then Unicode
TStr = TStr & ChrW$((TwoBytes(0) And &H1F) * &H40 + (TwoBytes(1) And &H3F))
Else
' Normal ANSI character - use it as is
TStr = TStr & Chr$(AByte): i = i + 1 ' Increment byte array index
End If
Loop
UTF8_Decode = TStr ' Return the resultant string
Erase BArray
End If
EndFunction:
End Function
'Purpose:Convert Unicode string to UTF-8.
Public Function UTF8_Encode(ByVal strUnicode As String, Optional ByVal bHTML As Boolean = True) As String
Dim i As Long
Dim TLen As Long
Dim lPtr As Long
Dim UTF16 As Long
Dim UTF8_EncodeLong As String
TLen = Len(strUnicode)
If TLen = 0 Then Exit Function
If m_bIsNt Then
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
'Set buffer for longest possible string.
lngBufferSize = TLen * 3 + 1
ReDim bytUtf8(lngBufferSize - 1)
'Translate using code page 65001(UTF-8).
lngResult = WideCharToMultiByte(CP_UTF8, 0, StrPtr(strUnicode), _
TLen, bytUtf8(0), lngBufferSize, vbNullString, 0)
'Trim result to actual length.
If lngResult Then
lngResult = lngResult - 1
ReDim Preserve bytUtf8(lngResult)
'CopyMemory StrPtr(UTF8_Encode), bytUtf8(0&), lngResult
UTF8_Encode = StrConv(bytUtf8, vbUnicode)
' For i = 0 To lngResult
' UTF8_Encode = UTF8_Encode & Chr$(bytUtf8(i))
' Next
End If
Else
For i = 1 To TLen
' Get UTF-16 value of Unicode character
lPtr = StrPtr(strUnicode) + ((i - 1) * 2)
CopyMemory UTF16, ByVal lPtr, 2
'Convert to UTF-8
If UTF16 < &H80 Then ' 1 UTF-8 byte
UTF8_EncodeLong = Chr$(UTF16)
ElseIf UTF16 < &H800 Then ' 2 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HC0 + (UTF16 And &H1F)) & UTF8_EncodeLong ' Use 5 remaining bits
Else ' 3 UTF-8 bytes
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) ' Least Significant 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&H80 + (UTF16 And &H3F)) & UTF8_EncodeLong ' Use next 6 bits
UTF16 = UTF16 \ &H40 ' Shift right 6 bits
UTF8_EncodeLong = Chr$(&HE0 + (UTF16 And &HF)) & UTF8_EncodeLong ' Use 4 remaining bits
End If
UTF8_Encode = UTF8_Encode & UTF8_EncodeLong
Next
End If
'Substitute vbCrLf with HTML line breaks if requested.
If bHTML Then
UTF8_Encode = Replace$(UTF8_Encode, vbCrLf, "<br/>")
End If
End Function
#3
把这一段放在一个模块里,
用UTF8_Decode函数来解码,用UTF8_Encode函数把Unicode转换为UTF-8
如果是win9x系统,m_bIsNt设为False,否则设为True
用UTF8_Decode函数来解码,用UTF8_Encode函数把Unicode转换为UTF-8
如果是win9x系统,m_bIsNt设为False,否则设为True
#4
Dim str_UTF8 As String
Dim str_Unicode As String
Open "C:\tmp_utf8.txt" For Input As 1
str_UTF8 = StrConv(InputB$(LOF(1), 1), vbUnicode)'读到字符串
str_UTF8 = Right(str_UTF8, Len(str_UTF8) - 1) '第一个字符是格式标志,无实际意义,去掉
Close 1
Module_UTF8.m_bIsNt = True
str_Unicode = UTF8_Decode(str_UTF8)'转换为Unicode
msgbox str_Unicode
Dim str_Unicode As String
Open "C:\tmp_utf8.txt" For Input As 1
str_UTF8 = StrConv(InputB$(LOF(1), 1), vbUnicode)'读到字符串
str_UTF8 = Right(str_UTF8, Len(str_UTF8) - 1) '第一个字符是格式标志,无实际意义,去掉
Close 1
Module_UTF8.m_bIsNt = True
str_Unicode = UTF8_Decode(str_UTF8)'转换为Unicode
msgbox str_Unicode
#5
楼上厉害!100cents r urs!
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
这句是不是可以拿掉哦。俺直接就传进一个Variant (VBByteArray)。
他原先是传进字符串,再转成字节数组的。
=============================================================
如果是win9x系统,m_bIsNt设为False,否则设为True
NT/XP/2000是true吧? 有点奇怪,这应该是说平台,好象应该是win16, win32的区别,怎么是isNT?
=============================================================
看了看,后面半截(m_bIsNt==false时),他的程序很有问题。例如他就在那里弄循环,而且循环的次数设定基于一个错误的长度(不过用这个数字来循环不一定会出错就是了)。
若是能用这样的pattern多好?(当然可以写得简单些,我这么写只不过看得清楚点,反正比循环快 ^_^ )
/[\xC0-\xDF][\x80-\xFF]|[\xE0-\xEF][\x80-\xFF]{2}|[\xF0-\xF7][\x80-\xFF]{3}|[\xF8-\xFB][\x80-\xFF]{4}|[\xFC-\xFD][\x80-\xFF]{5}/gi
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
这句是不是可以拿掉哦。俺直接就传进一个Variant (VBByteArray)。
他原先是传进字符串,再转成字节数组的。
=============================================================
如果是win9x系统,m_bIsNt设为False,否则设为True
NT/XP/2000是true吧? 有点奇怪,这应该是说平台,好象应该是win16, win32的区别,怎么是isNT?
=============================================================
看了看,后面半截(m_bIsNt==false时),他的程序很有问题。例如他就在那里弄循环,而且循环的次数设定基于一个错误的长度(不过用这个数字来循环不一定会出错就是了)。
若是能用这样的pattern多好?(当然可以写得简单些,我这么写只不过看得清楚点,反正比循环快 ^_^ )
/[\xC0-\xDF][\x80-\xFF]|[\xE0-\xEF][\x80-\xFF]{2}|[\xF0-\xF7][\x80-\xFF]{3}|[\xF8-\xFB][\x80-\xFF]{4}|[\xFC-\xFD][\x80-\xFF]{5}/gi
#6
plus, 俺还没有试过你给的程序,今天没时间,明天再说。估计俺中意这一段,单独拿出来用:
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
End If
End sub
'Purpose:Convert Utf8 to Unicode
Public Function UTF8_Decode(ByVal sUTF8 As String) As String
Dim lngUtf8Size As Long
Dim strBuffer As String
Dim lngBufferSize As Long
Dim lngResult As Long
Dim bytUtf8() As Byte
Dim n As Long
If LenB(sUTF8) = 0 Then Exit Function
If m_bIsNt Then
On Error GoTo EndFunction
bytUtf8 = StrConv(sUTF8, vbFromUnicode)
lngUtf8Size = UBound(bytUtf8) + 1
On Error GoTo 0
'Set buffer for longest possible string i.e. each byte is
'ANSI, thus 1 unicode(2 bytes)for every utf-8 character.
lngBufferSize = lngUtf8Size * 2
strBuffer = String$(lngBufferSize, vbNullChar)
'Translate using code page 65001(UTF-8)
lngResult = MultiByteToWideChar(CP_UTF8, 0, bytUtf8(0), _
lngUtf8Size, StrPtr(strBuffer), lngBufferSize)
'Trim result to actual length
If lngResult Then
UTF8_Decode = Left$(strBuffer, lngResult)
End If
End If
End sub