UTF-8、UTF-16、UTF-32编码的相互转换

Published by orzz.org(). (https://orzz.org/utf_transform/)

最近在考虑写一个可以跨平台的通用字符串类，首先需要搞定的就是编码转换问题。

Visual Studio默认保存代码文件时，使用的是本地代码页（code page，中文即GBK，日文即Shift-JIS），也可以使用带BOM的UTF-8。
gcc则是UTF-8，有无BOM均可（源代码的字符集可以由参数-finput-charset指定）。
那么源代码可以采用带BOM的UTF-8来保存。而windows下的unicode是UTF-16编码；linux则使用UTF-8或UTF-32。因此不论在哪种系统里，程序在处理字符串时都需要考虑UTF编码之间的相互转换。

下面直接贴出算法代码。算法上我借鉴了秦建辉（http://blog.csdn.net/jhqin）的UnicodeConverter，只是在外面增加了一些泛型处理，让使用相对简单。

核心算法（来自UnicodeConverter）：

namespace transform
{
    /*
        UTF-32 to UTF-8

        Encodes a single code point as a UTF-8 byte sequence of 1..6 bytes
        (the table below accepts values up to 0x7FFFFFFF).

        src: the code point to encode; 0 is treated as "nothing to convert".
        des: output buffer, or null to only measure. No terminator is written.

        Returns the length of the sequence in bytes, or 0 when src is 0 or
        does not fit any row of the table.
    */

    inline static size_t utf(uint32 src, uint8* des)
    {
        // Marker bits of the lead byte, indexed by (sequence length - 1).
        static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
        // Exclusive upper bound of the code points each length can hold.
        static const uint32 CODE_UP[] =
        {
            0x80,           // U+00000000 - U+0000007F
            0x800,          // U+00000080 - U+000007FF
            0x10000,        // U+00000800 - U+0000FFFF
            0x200000,       // U+00010000 - U+001FFFFF
            0x4000000,      // U+00200000 - U+03FFFFFF
            0x80000000      // U+04000000 - U+7FFFFFFF
        };
        const size_t kinds = sizeof(CODE_UP) / sizeof(CODE_UP[0]);

        if (src == 0) return 0;

        // Number of continuation bytes == index of the first row that fits.
        size_t tail = 0;
        while (tail < kinds && !(src < CODE_UP[tail])) ++tail;
        if (tail == kinds) return 0; // the src is invalid

        const size_t total = tail + 1;
        if (!des) return total;

        // Fill from the last byte backwards, six payload bits at a time.
        for (size_t pos = tail; pos > 0; --pos)
        {
            des[pos] = static_cast<uint8>((src & 0x3F) | 0x80);
            src >>= 6;
        }
        // Whatever bits remain go into the lead byte under its marker.
        des[0] = static_cast<uint8>(src | PREFIX[tail]);
        return total;
    }

    /*
        UTF-8 to UTF-32
    */

    inline static size_t utf(const uint8* src, uint32& des)
    {
        if (!src || (*src) == 0) return 0;

        uint8 b = *(src++);

        if (b < 0x80)
        {
            des = b;
            return 1;
        }

        if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid

        size_t len;

        if (b < 0xE0)
        {
            des = b & 0x1F;
            len = 2;
        }
        else
        if (b < 0xF0)
        {
            des = b & 0x0F;
            len = 3;
        }
        else
        if (b < 0xF8)
        {
            des = b & 0x07;
            len = 4;
        }
        else
        if (b < 0xFC)
        {
            des = b & 0x03;
            len = 5;
        }
        else
        {
            des = b & 0x01;
            len = 6;
        }

        size_t i = 1;
        for (; i < len; ++i)
        {
            b = *(src++);
            if (b < 0x80 || b > 0xBF) return 0; // the src is invalid
            des = (des << 6) + (b & 0x3F);
        }
        return len;
    }

    /*
        UTF-32 to UTF-16

        Encodes a single code point as one or two UTF-16 code units.

        src: the code point to encode; 0 is treated as "nothing to convert".
        des: output buffer, or null to only measure. No terminator is written.

        Returns the number of code units (1 or 2), or 0 when src is 0 or
        lies above U+10FFFF.

        Values up to 0xFFFF are stored unchanged, including values inside
        the surrogate range 0xD800..0xDFFF.
    */

    inline static size_t utf(uint32 src, uint16* des)
    {
        if (src == 0) return 0;

        if (src <= 0xFFFF)
        {
            if (des) (*des) = static_cast<uint16>(src);
            return 1;
        }
        else
        // Surrogate pairs cover U+10000..U+10FFFF; the previous bound of
        // 0xEFFFF wrongly rejected planes 15 and 16.
        if (src <= 0x10FFFF)
        {
            if (des)
            {
                // (src >> 10) - 0x40 is the top ten bits of (src - 0x10000).
                des[0] = static_cast<uint16>(0xD800 + (src >> 10) - 0x40);  // high
                des[1] = static_cast<uint16>(0xDC00 + (src & 0x03FF));      // low
            }
            return 2;
        }
        return 0;
    }

    /*
        UTF-16 to UTF-32
    */

    inline static size_t utf(const uint16* src, uint32& des)
    {
        if (!src || (*src) == 0) return 0;

        uint16 w1 = src[0];
        if (w1 >= 0xD800 && w1 <= 0xDFFF)
        {
            if (w1 < 0xDC00)
            {
                uint16 w2 = src[1];
                if (w2 >= 0xDC00 && w2 <= 0xDFFF)
                {
                    des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);
                    return 2;
                }
            }
            return 0; // the src is invalid
        }
        else
        {
            des = w1;
            return 1;
        }
    }
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

namespace transform

{

UTF-32 to UTF-8

inline static size_t utf(uint32 src, uint8* des)

{

if (src == 0) return 0;

static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

static const uint32 CODE_UP[] =

{

0x80, // U+00000000 - U+0000007F

0x800, // U+00000080 - U+000007FF

0x10000, // U+00000800 - U+0000FFFF

0x200000, // U+00010000 - U+001FFFFF

0x4000000, // U+00200000 - U+03FFFFFF

0x80000000 // U+04000000 - U+7FFFFFFF

};

size_t i, len = sizeof(CODE_UP) / sizeof(uint32);

for(i = 0; i < len; ++i)

if (src < CODE_UP[i]) break;

if (i == len) return 0; // the src is invalid

len = i + 1;

if (des)

{

for(; i > 0; --i)

{

des[i] = static_cast<uint8>((src & 0x3F) | 0x80);

src >>= 6;

}

des[0] = static_cast<uint8>(src | PREFIX[len - 1]);

}

return len;

}

UTF-8 to UTF-32

inline static size_t utf(const uint8* src, uint32& des)

{

if (!src || (*src) == 0) return 0;

uint8 b = *(src++);

if (b < 0x80)

{

des = b;

return 1;

}

if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid

size_t len;

if (b < 0xE0)

{

des = b & 0x1F;

len = 2;

}

else

if (b < 0xF0)

{

des = b & 0x0F;

len = 3;

}

else

if (b < 0xF8)

{

des = b & 0x07;

len = 4;

}

else

if (b < 0xFC)

{

des = b & 0x03;

len = 5;

}

else

{

des = b & 0x01;

len = 6;

}

size_t i = 1;

for (; i < len; ++i)

{

b = *(src++);

if (b < 0x80 || b > 0xBF) return 0; // the src is invalid

des = (des << 6) + (b & 0x3F);

}

return len;

}

UTF-32 to UTF-16

inline static size_t utf(uint32 src, uint16* des)

{

if (src == 0) return 0;

if (src <= 0xFFFF)

{

if (des) (*des) = static_cast<uint16>(src);

return 1;

}

else

if (src <= 0xEFFFF)

{

if (des)

{

des[0] = static_cast<uint16>(0xD800 + (src >> 10) - 0x40); // high

des[1] = static_cast<uint16>(0xDC00 + (src & 0x03FF)); // low

}

return 2;

}

return 0;

}

UTF-16 to UTF-32

inline static size_t utf(const uint16* src, uint32& des)

{

if (!src || (*src) == 0) return 0;

uint16 w1 = src[0];

if (w1 >= 0xD800 && w1 <= 0xDFFF)

{

if (w1 < 0xDC00)

{

uint16 w2 = src[1];

if (w2 >= 0xDC00 && w2 <= 0xDFFF)

{

des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);

return 2;

}

return 0; // the src is invalid

}

else

{

des = w1;

return 1;

}

上面这些算法都是针对单个字符的，并且是UTF-32和UTF-16/8之间的互转。
通过上面的算法，可以得到UTF-16和UTF-8之间的单字符转换算法：

namespace transform
{
    /*
        UTF-16 to UTF-8

        Converts one stand-alone UTF-16 code unit to a UTF-8 byte sequence.

        src: the code unit; it must decode on its own (no surrogates).
        des: output buffer, or null to only measure.

        Returns the number of UTF-8 bytes, or 0 when src is 0, is a
        surrogate, or cannot be encoded.
    */

    inline static size_t utf(uint16 src, uint8* des)
    {
        // The decoder reads a second unit after a high surrogate. Passing
        // &src would make it read past the single parameter object, so hand
        // it a two-unit buffer whose second unit is 0 instead.
        const uint16 unit[2] = { src, 0 };
        // make utf-16 to utf-32
        uint32 tmp;
        if (utf(unit, tmp) != 1) return 0;
        // make utf-32 to utf-8
        return utf(tmp, des);
    }

    /*
        UTF-8 to UTF-16
    */

    inline static size_t utf(const uint8* src, uint16& des)
    {
        // make utf-8 to utf-32
        uint32 tmp;
        size_t len = utf(src, tmp);
        if (len == 0) return 0;
        // make utf-32 to utf-16
        if (utf(tmp, &des) != 1) return 0;
        return len;
    }
}

namespace transform

{

UTF-16 to UTF-8

inline static size_t utf(uint16 src, uint8* des)

{

// make utf-16 to utf-32

uint32 tmp;

if (utf(&src, tmp) != 1) return 0;

// make utf-32 to utf-8

return utf(tmp, des);

}

UTF-8 to UTF-16

inline static size_t utf(const uint8* src, uint16& des)

{

// make utf-8 to utf-32

uint32 tmp;

size_t len = utf(src, tmp);

if (len == 0) return 0;

// make utf-32 to utf-16

if (utf(tmp, &des) != 1) return 0;

return len;

}

同样，通过上面的单字符转换算法，可以得到整个字符串的转换算法：

namespace transform
{
    /*
        UTF-X: string to string
    */

    // UTF-32 string to UTF-8/UTF-16 string.
    //
    // src: 0-terminated UTF-32 string (may be null).
    // des: output buffer, or null to only measure. When non-null it must
    //      have room for the returned count plus one terminating 0.
    //
    // Returns the number of output units produced, not counting the
    // terminator. Conversion stops at the first code point that cannot be
    // encoded; the output is still terminated at that point.
    template <typename T>
    size_t utf(const uint32* src, T* des)   // UTF-32 to UTF-X(8/16)
    {
        if (!src || (*src) == 0)
        {
            // Terminate the destination even when there is nothing to convert.
            if (des) (*des) = 0;
            return 0;
        }

        size_t num = 0;
        for(; *src; ++src)
        {
            size_t len = utf(*src, des);
            if (len == 0) break;
            if (des) des += len;
            num += len;
        }
        if (des) (*des) = 0;
        return num;
    }

    // UTF-8/UTF-16 string to UTF-32 string.
    //
    // src: 0-terminated input string (may be null).
    // des: output buffer, or null to only measure. When non-null it must
    //      have room for the returned count plus one terminating 0.
    //
    // Returns the number of code points produced, not counting the
    // terminator. Conversion stops at the first malformed sequence; the
    // output is still terminated at that point.
    template <typename T>
    size_t utf(const T* src, uint32* des)   // UTF-X(8/16) to UTF-32
    {
        if (!src || (*src) == 0)
        {
            // Terminate the destination even when there is nothing to convert.
            if (des) (*des) = 0;
            return 0;
        }

        size_t num = 0;
        while(*src)
        {
            uint32 tmp;
            size_t len = utf(src, tmp);
            if (len == 0) break;
            if (des)
            {
                (*des) = tmp;
                ++des;
            }
            src += len;
            num += 1;
        }
        if (des) (*des) = 0;
        return num;
    }

    // UTF-8 string to UTF-16 string, or the reverse, via UTF-32.
    //
    // src: 0-terminated input string (may be null).
    // des: output buffer, or null to only measure. When non-null it must
    //      have room for the returned count plus one terminating 0.
    //
    // Returns the number of output units produced, not counting the
    // terminator. Conversion stops at the first sequence that cannot be
    // decoded or re-encoded; the output is still terminated at that point.
    template <typename T, typename U>
    size_t utf(const T* src, U* des)    // UTF-X(8/16) to UTF-Y(16/8)
    {
        if (!src || (*src) == 0)
        {
            // Terminate the destination even when there is nothing to convert.
            if (des) (*des) = 0;
            return 0;
        }

        size_t num = 0;
        while(*src)
        {
            // make utf-x to ucs4
            uint32 tmp;
            size_t len = utf(src, tmp);
            if (len == 0) break;
            src += len;
            // make ucs4 to utf-y
            len = utf(tmp, des);
            if (len == 0) break;
            if (des) des += len;
            num += len;
        }
        if (des) (*des) = 0;
        return num;
    }
}

namespace transform

{

UTF-X: string to string

template <typename T>

size_t utf(const uint32* src, T* des) // UTF-32 to UTF-X(8/16)

{

if (!src || (*src) == 0) return 0;

size_t num = 0;

for(; *src; ++src)

{

size_t len = utf(*src, des);

if (len == 0) break;

if (des) des += len;

num += len;

}

if (des) (*des) = 0;

return num;

}

template <typename T>

size_t utf(const T* src, uint32* des) // UTF-X(8/16) to UTF-32

{

if (!src || (*src) == 0) return 0;

size_t num = 0;

while(*src)

{

uint32 tmp;

size_t len = utf(src, tmp);

if (len == 0) break;

if (des)

{

(*des) = tmp;

++des;

}

src += len;

num += 1;

}

if (des) (*des) = 0;

return num;

}

template <typename T, typename U>

size_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-Y(16/8)

{

if (!src || (*src) == 0) return 0;

size_t num = 0;

while(*src)

{

// make utf-x to ucs4

uint32 tmp;

size_t len = utf(src, tmp);

if (len == 0) break;

src += len;

// make ucs4 to utf-y

len = utf(tmp, des);

if (len == 0) break;

if (des) des += len;

num += len;

}

if (des) (*des) = 0;

return num;

}

有了这些之后，我们已经可以完整的做UTF-8/16/32之间的相互转换了，但是这些函数的使用仍然不是很方便。
比如我现在想把一个UTF-8字符串转换成一个wchar_t*字符串，我得这样写：

const uint8* c = (uint8*)"こんにちわ、世界";
// First pass: a null destination only measures. The returned count does
// not include the terminating 0 that the converter appends.
size_t n = (sizeof(wchar_t) == 2) ?
    transform::utf(c, (uint16*)0) :
    transform::utf(c, (uint32*)0);
// Reserve one extra element for that terminator; allocating exactly n
// would let the second pass write one element past the buffer.
// The caller owns s and must release it with delete[].
wchar_t* s = new wchar_t[n + 1];
if (sizeof(wchar_t) == 2)
    transform::utf(c, (uint16*)s);
else
    transform::utf(c, (uint32*)s);

const uint8* c = (uint8*)"こんにちわ、世界";

size_t n = (sizeof(wchar_t) == 2) ?

transform::utf(c, (uint16*)0) :

transform::utf(c, (uint32*)0);

wchar_t* s = new wchar_t[n];

if (sizeof(wchar_t) == 2)

transform::utf(c, (uint16*)s);

else

transform::utf(c, (uint32*)s);

这显然是一件很抽搐的事情，因为wchar_t在不同的操作系统（windows/linux）里有不同的sizeof长度。
上面的类型强制转换只是为了去适配合适的函数重载，当然我们也可以通过函数名来区分这些函数：比如分别叫utf8_to_utf32之类的。但是这改变不了写if-else来适配长度的问题。

显然这里可以通过泛型来让算法更好用。
首先，需要被抽离出来的就是参数的类型大小和类型本身的依赖关系：

// Maps a code-unit size in bytes to the unsigned integer type used for it.
// Only sizes 1, 2 and 4 are defined; any other size is an incomplete type.
template <size_t X>
struct utf_type;

// 1 byte: UTF-8 code unit.
template <>
struct utf_type<1>
{
    typedef uint8 type_t;
};

// 2 bytes: UTF-16 code unit.
template <>
struct utf_type<2>
{
    typedef uint16 type_t;
};

// 4 bytes: UTF-32 code unit.
template <>
struct utf_type<4>
{
    typedef uint32 type_t;
};

template <size_t X> struct utf_type;

template <> struct utf_type<1> { typedef uint8 type_t; };

template <> struct utf_type<2> { typedef uint16 type_t; };

template <> struct utf_type<4> { typedef uint32 type_t; };

然后，实现一个简单的check算法，这样后面就可以利用SFINAE的技巧筛选出合适的算法函数：

template <size_t X, typename T>
struct check
{
    static const bool value =
        ((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);
};

template <size_t X, typename T>

struct check

{

static const bool value =

((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);

};

下面我们需要一个detail，即泛型适配的细节。从上面的算法函数参数中，我们可以很容易的观察出一些规律：
只要是由大向小转换（比如32->16，或16->8）的，其对外接口可以抽象成这两种形式：

type_t utf(T src, U* des)
type_t utf(const T* src, U* des)

1 2	type_t utf(T src, U* des) type_t utf(const T* src, U* des)

而由小向大的转换，则是下面这两种形式：

type_t utf(const T* src, U& des)
type_t utf(const T* src, U* des)

1 2	type_t utf(const T* src, U& des) type_t utf(const T* src, U* des)

再加上第二个指针参数是可以给一个默认值（空指针）的，因此适配的泛型类就可以写成这样：

// Adapter selected by the source unit size X and destination unit size Y
// (both in bytes). The third parameter is (X > Y), the fourth is (X != Y);
// only the X != Y cases are specialized.
template <size_t X, size_t Y, bool = (X > Y), bool = (X != Y)>
struct detail;

/*
    UTF-X(32/16) to UTF-Y(16/8)

    Wide-to-narrow direction: a single source unit is passed by value and
    the result is written through a pointer.
*/

template <size_t X, size_t Y>
struct detail<X, Y, true, true>
{
    typedef typename utf_type<X>::type_t src_t;
    typedef typename utf_type<Y>::type_t des_t;

    // Converts one unit, writing the result to des.
    template <typename T, typename U>
    static typename enable_if<check<X, T>::value && check<Y, U>::value,
    size_t>::type_t utf(T src, U* des)
    {
        return transform::utf((src_t)(src), (des_t*)(des));
    }

    // Measures one unit: same as above with a null destination.
    template <typename T>
    static typename enable_if<check<X, T>::value,
    size_t>::type_t utf(T src)
    {
        return transform::utf((src_t)(src), (des_t*)(0));
    }

    // Converts a whole 0-terminated string into des.
    template <typename T, typename U>
    static typename enable_if<check<X, T>::value && check<Y, U>::value,
    size_t>::type_t utf(const T* src, U* des)
    {
        return transform::utf((const src_t*)(src), (des_t*)(des));
    }

    // Measures a whole 0-terminated string (null destination).
    template <typename T>
    static typename enable_if<check<X, T>::value,
    size_t>::type_t utf(const T* src)
    {
        // src is a pointer to a string, so it must be forwarded as a
        // pointer; casting it to src_t turned the address into a code unit.
        return transform::utf((const src_t*)(src), (des_t*)(0));
    }
};

/*
    UTF-X(16/8) to UTF-Y(32/16)

    Narrow-to-wide direction: the source is always a pointer; the result is
    either a single unit returned by reference or a whole output string.
*/

template <size_t X, size_t Y>
struct detail<X, Y, false, true>
{
    typedef typename utf_type<X>::type_t src_t;
    typedef typename utf_type<Y>::type_t des_t;

    // Decodes one character starting at src into des.
    template <typename T, typename U>
    static typename enable_if<check<X, T>::value && check<Y, U>::value,
    size_t>::type_t utf(const T* src, U& des)
    {
        // Decode into a des_t local and copy it out, instead of aliasing
        // des (this avoids the strict-aliasing warning from gcc 4.4).
        des_t decoded;
        const size_t used = transform::utf(reinterpret_cast<const src_t*>(src), decoded);
        des = decoded;
        return used;
    }

    // Converts a whole 0-terminated string into des.
    template <typename T, typename U>
    static typename enable_if<check<X, T>::value && check<Y, U>::value,
    size_t>::type_t utf(const T* src, U* des)
    {
        const src_t* const input  = reinterpret_cast<const src_t*>(src);
        des_t* const       output = reinterpret_cast<des_t*>(des);
        return transform::utf(input, output);
    }

    // Measures a whole 0-terminated string (null destination).
    template <typename T>
    static typename enable_if<check<X, T>::value,
    size_t>::type_t utf(const T* src)
    {
        const src_t* const input = reinterpret_cast<const src_t*>(src);
        return transform::utf(input, static_cast<des_t*>(0));
    }
};

template <size_t X, size_t Y, bool = (X > Y), bool = (X != Y)>

struct detail;

UTF-X(32/16) to UTF-Y(16/8)

template <size_t X, size_t Y>

struct detail<X, Y, true, true>

{

typedef typename utf_type<X>::type_t src_t;

typedef typename utf_type<Y>::type_t des_t;

template <typename T, typename U>

static typename enable_if<check<X, T>::value && check<Y, U>::value,

size_t>::type_t utf(T src, U* des)

{

return transform::utf((src_t)(src), (des_t*)(des));

}

template <typename T>

static typename enable_if<check<X, T>::value,

size_t>::type_t utf(T src)

{

return transform::utf((src_t)(src), (des_t*)(0));

}

template <typename T, typename U>

static typename enable_if<check<X, T>::value && check<Y, U>::value,

size_t>::type_t utf(const T* src, U* des)

{

return transform::utf((const src_t*)(src), (des_t*)(des));

}

template <typename T>

static typename enable_if<check<X, T>::value,

size_t>::type_t utf(const T* src)

{

return transform::utf((src_t)(src), (des_t*)(0));

}

};

UTF-X(16/8) to UTF-Y(32/16)

template <size_t X, size_t Y>

struct detail<X, Y, false, true>

{

typedef typename utf_type<X>::type_t src_t;

typedef typename utf_type<Y>::type_t des_t;

template <typename T, typename U>

static typename enable_if<check<X, T>::value && check<Y, U>::value,

size_t>::type_t utf(const T* src, U& des)

{

des_t tmp; // for disable the warning strict-aliasing from gcc 4.4

size_t ret = transform::utf((const src_t*)(src), tmp);

des = tmp;

return ret;

}

template <typename T, typename U>

static typename enable_if<check<X, T>::value && check<Y, U>::value,

size_t>::type_t utf(const T* src, U* des)

{

return transform::utf((const src_t*)(src), (des_t*)(des));

}

template <typename T>

static typename enable_if<check<X, T>::value,

size_t>::type_t utf(const T* src)

{

return transform::utf((const src_t*)(src), (des_t*)(0));

}

};

最后的外覆类（wrapper）收尾就可以相当的简单：

// Public entry point: picks the detail adapter from the sizes of the source
// character type T and the destination character type U.
template <typename T, typename U>
struct converter : public detail<sizeof(T), sizeof(U)>
{
};

template <typename T, typename U>

struct converter

: detail<sizeof(T), sizeof(U)>

{};

通过上面的detail，我们也可以很轻松的写出一个通过指定8、16这些数字，来控制选择哪些转换算法的外覆模板。
有了converter，同类型的需求（指UTF-8转wchar_t）就可以变得轻松愉快很多：

const char* c = "こんにちわ、世界";
wstring s;
size_t n; wchar_t w;
while (!!(n = converter<char, wchar_t>::utf(c, w))) // 这里的!!是为了屏蔽gcc的警告
{
    s.push_back(w);
    c += n;
}
FILE* fp = fopen("test_converter.txt", "wb");
fwrite(s.c_str(), sizeof(wchar_t), s.length(), fp);
fclose(fp);

const char* c = "こんにちわ、世界";

wstring s;

size_t n; wchar_t w;

while (!!(n = converter<char, wchar_t>::utf(c, w))) // 这里的!!是为了屏蔽gcc的警告

{

s.push_back(w);

c += n;

}

FILE* fp = fopen("test_converter.txt", "wb");

fwrite(s.c_str(), sizeof(wchar_t), s.length(), fp);

fclose(fp);

上面这一小段代码是将一段UTF-8的文字逐字符转换为wchar_t，并一个个push_back到wstring里，最后把转换完毕的字符串输出到test_converter.txt里。

其实上面的泛型还是显得累赘了。为什么不直接在transform::utf上使用泛型参数呢？
一开始只想到上面那个方法，自然是由于惯性的想要手动指定如何转换编码的缘故，比如最开始的想法，是想做成类似这样的模板：utf<8, 32>(s1, s2)，指定两个数字，来决定输入和输出的格式。

后来发现，直接指定字符串/字符的类型或许更加直接些。
现在回头再看看，其实转换所需要的字长（8、16、32）已经在参数的类型中指定了：8 bits的char或byte类型肯定不会是用来存放UTF-32的嘛。
所以只需要把上面核心算法的参数泛型化就可以了。这时代码就会写成下面这个样子：

namespace transform
{
    namespace private_
    {
        // Maps a code-unit size in bytes to the unsigned type used for it.
        template <size_t X> struct utf_type;
        template <>         struct utf_type<1> { typedef uint8  type_t; };
        template <>         struct utf_type<2> { typedef uint16 type_t; };
        template <>         struct utf_type<4> { typedef uint32 type_t; };

        // value is true when T is not a pointer and has the same size as
        // the X-byte code-unit type; used to enable the matching overload.
        template <typename T, size_t X>
        struct check
        {
            static const bool value =
                ((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);
        };  // the terminating ';' was missing, which is a syntax error
    }

    using namespace transform::private_;

    /*
        UTF-32 to UTF-8

        Enabled when T is a 4-byte and U a 1-byte non-pointer type.
        Encodes one code point as a UTF-8 sequence of 1..6 bytes.

        src: the code point; 0 is treated as "nothing to convert".
        des: output buffer, or null to only measure. No terminator is written.

        Returns the sequence length in bytes, or 0 when src is 0 or does not
        fit any row of the table.
    */

    template <typename T, typename U>
    typename enable_if<check<T, 4>::value && check<U, 1>::value,
    size_t>::type_t utf(T src, U* des)
    {
        // Marker bits of the lead byte, indexed by (sequence length - 1).
        static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
        // Exclusive upper bound of the code points each length can hold.
        static const uint32 CODE_UP[] =
        {
            0x80,           // U+00000000 - U+0000007F
            0x800,          // U+00000080 - U+000007FF
            0x10000,        // U+00000800 - U+0000FFFF
            0x200000,       // U+00010000 - U+001FFFFF
            0x4000000,      // U+00200000 - U+03FFFFFF
            0x80000000      // U+04000000 - U+7FFFFFFF
        };
        const size_t kinds = sizeof(CODE_UP) / sizeof(CODE_UP[0]);

        if (src == 0) return 0;

        // Number of continuation bytes == index of the first row that fits.
        size_t tail = 0;
        while (tail < kinds && !(src < CODE_UP[tail])) ++tail;
        if (tail == kinds) return 0; // the src is invalid

        const size_t total = tail + 1;
        if (!des) return total;

        // Fill from the last byte backwards, six payload bits at a time.
        for (size_t pos = tail; pos > 0; --pos)
        {
            des[pos] = static_cast<U>((src & 0x3F) | 0x80);
            src >>= 6;
        }
        // Whatever bits remain go into the lead byte under its marker.
        des[0] = static_cast<U>(src | PREFIX[tail]);
        return total;
    }

    /*
        UTF-8 to UTF-32

        Enabled when T is a 1-byte and U a 4-byte non-pointer type.
        Decodes the UTF-8 sequence that starts at src into one code point.

        src: pointer to the first byte of a sequence (may be null).
        des: receives the decoded value. It is written progressively, so it
             may hold a partial value when 0 is returned after the lead byte
             has been accepted.

        Returns the number of bytes consumed (1..6), or 0 when src is null,
        points at a 0 byte, or the sequence is malformed.
    */

    template <typename T, typename U>
    typename enable_if<check<T, 1>::value && check<U, 4>::value,
    size_t>::type_t utf(const T* src, U& des)
    {
        if (!src || (*src) == 0) return 0;

        // Work on the byte as an unsigned value regardless of T's signedness.
        const uint8 lead = src[0];

        // Single-byte (ASCII) case.
        if (lead < 0x80)
        {
            des = lead;
            return 1;
        }

        // A continuation byte or 0xFE/0xFF cannot start a sequence.
        if (lead < 0xC0 || lead > 0xFD) return 0; // the src is invalid

        // Sequence length and the payload mask of the lead byte.
        size_t total;
        uint8  mask;
        if      (lead < 0xE0) { total = 2; mask = 0x1F; }
        else if (lead < 0xF0) { total = 3; mask = 0x0F; }
        else if (lead < 0xF8) { total = 4; mask = 0x07; }
        else if (lead < 0xFC) { total = 5; mask = 0x03; }
        else                  { total = 6; mask = 0x01; }

        des = lead & mask;

        // Each continuation byte must be 10xxxxxx and adds six bits.
        for (size_t k = 1; k < total; ++k)
        {
            const uint8 cont = src[k];
            if (cont < 0x80 || cont > 0xBF) return 0; // the src is invalid
            des = (des << 6) + (cont & 0x3F);
        }
        return total;
    }

    /*
        UTF-32 to UTF-16

        Enabled when T is a 4-byte and U a 2-byte non-pointer type.
        Encodes a single code point as one or two UTF-16 code units.

        src: the code point; 0 is treated as "nothing to convert".
        des: output buffer, or null to only measure. No terminator is written.

        Returns the number of code units (1 or 2), or 0 when src is 0 or
        lies above U+10FFFF.

        Values up to 0xFFFF are stored unchanged, including values inside
        the surrogate range 0xD800..0xDFFF.
    */

    template <typename T, typename U>
    typename enable_if<check<T, 4>::value && check<U, 2>::value,
    size_t>::type_t utf(T src, U* des)
    {
        if (src == 0) return 0;

        if (src <= 0xFFFF)
        {
            if (des) (*des) = static_cast<U>(src);
            return 1;
        }
        else
        // Surrogate pairs cover U+10000..U+10FFFF; the previous bound of
        // 0xEFFFF wrongly rejected planes 15 and 16.
        if (src <= 0x10FFFF)
        {
            if (des)
            {
                // (src >> 10) - 0x40 is the top ten bits of (src - 0x10000).
                des[0] = static_cast<U>(0xD800 + (src >> 10) - 0x40);  // high
                des[1] = static_cast<U>(0xDC00 + (src & 0x03FF));      // low
            }
            return 2;
        }
        return 0;
    }

    /*
        UTF-16 to UTF-32

        Enabled when T is a 2-byte and U a 4-byte non-pointer type.
        Decodes the UTF-16 unit (or surrogate pair) at src into one code point.

        src: pointer to the first code unit (may be null). When that unit is
             a high surrogate, src[1] is read as well.
        des: receives the decoded value; left untouched when 0 is returned.

        Returns the number of code units consumed (1 or 2), or 0 when src is
        null, points at a 0 unit, or the surrogates are not a valid pair.
    */

    template <typename T, typename U>
    typename enable_if<check<T, 2>::value && check<U, 4>::value,
    size_t>::type_t utf(const T* src, U& des)
    {
        if (!src || (*src) == 0) return 0;

        const uint16 lead = src[0];

        // Anything outside the surrogate range stands for itself.
        const bool is_surrogate = (lead >= 0xD800 && lead <= 0xDFFF);
        if (!is_surrogate)
        {
            des = lead;
            return 1;
        }

        // A low surrogate cannot come first.
        if (!(lead < 0xDC00)) return 0; // the src is invalid

        // The high surrogate must be followed by a low surrogate.
        const uint16 trail = src[1];
        if (!(trail >= 0xDC00 && trail <= 0xDFFF)) return 0; // the src is invalid

        des = (trail & 0x03FF) + (((lead & 0x03FF) + 0x40) << 10);
        return 2;
    }

    /*
        UTF-16 to UTF-8

        Enabled when T is a 2-byte and U a 1-byte non-pointer type.
        Converts one stand-alone UTF-16 code unit to a UTF-8 byte sequence.

        src: the code unit; it must decode on its own (no surrogates).
        des: output buffer, or null to only measure.

        Returns the number of UTF-8 bytes, or 0 when src is 0, is a
        surrogate, or cannot be encoded.
    */

    template <typename T, typename U>
    typename enable_if<check<T, 2>::value && check<U, 1>::value,
    size_t>::type_t utf(T src, U* des)
    {
        // The decoder reads a second unit after a high surrogate. Passing
        // &src would make it read past the single parameter object, so hand
        // it a two-unit buffer whose second unit is 0 instead.
        const T unit[2] = { src, T() };
        // make utf-16 to utf-32
        uint32 tmp;
        if (utf(unit, tmp) != 1) return 0;
        // make utf-32 to utf-8
        return utf(tmp, des);
    }

    /*
        UTF-8 to UTF-16

        Enabled when T is a 1-byte and U a 2-byte non-pointer type.
        Decodes one UTF-8 sequence into a single UTF-16 code unit.

        src: pointer to the first byte of the sequence.
        des: receives the code unit; left untouched when 0 is returned.

        Returns the number of UTF-8 bytes consumed, or 0 when the sequence is
        invalid or the code point does not fit in one UTF-16 unit.
    */

    template <typename T, typename U>
    typename enable_if<check<T, 1>::value && check<U, 2>::value,
    size_t>::type_t utf(const T* src, U& des)
    {
        // make utf-8 to utf-32
        uint32 tmp;
        size_t len = utf(src, tmp);
        if (len == 0) return 0;
        // make utf-32 to utf-16
        // The encoder writes two units for code points above 0xFFFF. Encode
        // into a local pair so that des (one unit) is never overrun.
        U unit[2] = { U(), U() };
        if (utf(tmp, unit) != 1) return 0;
        des = unit[0];
        return len;
    }

    /*
        UTF-X: string to string
    */

    // UTF-32 string to UTF-8/UTF-16 string.
    //
    // src: 0-terminated UTF-32 string (may be null).
    // des: output buffer, or null to only measure. When non-null it must
    //      have room for the returned count plus one terminating 0.
    //
    // Returns the number of output units produced, not counting the
    // terminator. Conversion stops at the first code point that cannot be
    // encoded; the output is still terminated at that point.
    template <typename T, typename U>
    typename enable_if<check<T, 4>::value && (check<U, 1>::value || check<U, 2>::value),
    size_t>::type_t utf(const T* src, U* des)   // UTF-32 to UTF-X(8/16)
    {
        if (!src || (*src) == 0)
        {
            // Terminate the destination even when there is nothing to convert.
            if (des) (*des) = 0;
            return 0;
        }

        size_t num = 0;
        for(; *src; ++src)
        {
            size_t len = utf(*src, des);
            if (len == 0) break;
            if (des) des += len;
            num += len;
        }
        if (des) (*des) = 0;
        return num;
    }

    // UTF-8/UTF-16 string to UTF-32 string.
    //
    // src: 0-terminated input string (may be null).
    // des: output buffer, or null to only measure. When non-null it must
    //      have room for the returned count plus one terminating 0.
    //
    // Returns the number of code points produced, not counting the
    // terminator. Conversion stops at the first malformed sequence; the
    // output is still terminated at that point.
    template <typename T, typename U>
    typename enable_if<(check<T, 1>::value || check<T, 2>::value) && check<U, 4>::value,
    size_t>::type_t utf(const T* src, U* des)   // UTF-X(8/16) to UTF-32
    {
        if (!src || (*src) == 0)
        {
            // Terminate the destination even when there is nothing to convert.
            if (des) (*des) = 0;
            return 0;
        }

        size_t num = 0;
        while(*src)
        {
            uint32 tmp;
            size_t len = utf(src, tmp);
            if (len == 0) break;
            if (des)
            {
                (*des) = tmp;
                ++des;
            }
            src += len;
            num += 1;
        }
        if (des) (*des) = 0;
        return num;
    }

    // UTF-8 string to UTF-16 string, or the reverse, via UTF-32.
    //
    // src: 0-terminated input string (may be null).
    // des: output buffer, or null to only measure. When non-null it must
    //      have room for the returned count plus one terminating 0.
    //
    // Returns the number of output units produced, not counting the
    // terminator. Conversion stops at the first sequence that cannot be
    // decoded or re-encoded; the output is still terminated at that point.
    template <typename T, typename U>
    typename enable_if<(check<T, 1>::value && check<U, 2>::value) ||
                       (check<T, 2>::value && check<U, 1>::value),
    size_t>::type_t utf(const T* src, U* des)    // UTF-X(8/16) to UTF-Y(16/8)
    {
        if (!src || (*src) == 0)
        {
            // Terminate the destination even when there is nothing to convert.
            if (des) (*des) = 0;
            return 0;
        }

        size_t num = 0;
        while(*src)
        {
            // make utf-x to utf-32
            uint32 tmp;
            size_t len = utf(src, tmp);
            if (len == 0) break;
            src += len;
            // make utf-32 to utf-y
            len = utf(tmp, des);
            if (len == 0) break;
            if (des) des += len;
            num += len;
        }
        if (des) (*des) = 0;
        return num;
    }
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

namespace transform

{

namespace private_

{

template <size_t X> struct utf_type;

template <> struct utf_type<1> { typedef uint8 type_t; };

template <> struct utf_type<2> { typedef uint16 type_t; };

template <> struct utf_type<4> { typedef uint32 type_t; };

template <typename T, size_t X>

struct check

{

static const bool value =

((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);

}

using namespace transform::private_;

UTF-32 to UTF-8

template <typename T, typename U>

typename enable_if<check<T, 4>::value && check<U, 1>::value,

size_t>::type_t utf(T src, U* des)

{

if (src == 0) return 0;

static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

static const uint32 CODE_UP[] =

{

0x80, // U+00000000 - U+0000007F

0x800, // U+00000080 - U+000007FF

0x10000, // U+00000800 - U+0000FFFF

0x200000, // U+00010000 - U+001FFFFF

0x4000000, // U+00200000 - U+03FFFFFF

0x80000000 // U+04000000 - U+7FFFFFFF

};

size_t i, len = sizeof(CODE_UP) / sizeof(uint32);

for(i = 0; i < len; ++i)

if (src < CODE_UP[i]) break;

if (i == len) return 0; // the src is invalid

len = i + 1;

if (des)

{

for(; i > 0; --i)

{

des[i] = static_cast((src & 0x3F) | 0x80);

src >>= 6;

}

des[0] = static_cast(src | PREFIX[len - 1]);

}

return len;

}

UTF-8 to UTF-32

template <typename T, typename U>

typename enable_if<check<T, 1>::value && check<U, 4>::value,

size_t>::type_t utf(const T* src, U& des)

{

if (!src || (*src) == 0) return 0;

uint8 b = *(src++);

if (b < 0x80)

{

des = b;

return 1;

}

if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid

size_t len;

if (b < 0xE0)

{

des = b & 0x1F;

len = 2;

}

else

if (b < 0xF0)

{

des = b & 0x0F;

len = 3;

}

else

if (b < 0xF8)

{

des = b & 0x07;

len = 4;

}

else

if (b < 0xFC)

{

des = b & 0x03;

len = 5;

}

else

{

des = b & 0x01;

len = 6;

}

size_t i = 1;

for (; i < len; ++i)

{

b = *(src++);

if (b < 0x80 || b > 0xBF) return 0; // the src is invalid

des = (des << 6) + (b & 0x3F);

}

return len;

}

UTF-32 to UTF-16

template <typename T, typename U>

typename enable_if<check<T, 4>::value && check<U, 2>::value,

size_t>::type_t utf(T src, U* des)

{

if (src == 0) return 0;

if (src <= 0xFFFF)

{

if (des) (*des) = static_cast(src);

return 1;

}

else

if (src <= 0xEFFFF)

{

if (des)

{

des[0] = static_cast(0xD800 + (src >> 10) - 0x40); // high

des[1] = static_cast(0xDC00 + (src & 0x03FF)); // low

}

return 2;

}

return 0;

}

UTF-16 to UTF-32

template <typename T, typename U>

typename enable_if<check<T, 2>::value && check<U, 4>::value,

size_t>::type_t utf(const T* src, U& des)

{

if (!src || (*src) == 0) return 0;

uint16 w1 = src[0];

if (w1 >= 0xD800 && w1 <= 0xDFFF)

{

if (w1 < 0xDC00)

{

uint16 w2 = src[1];

if (w2 >= 0xDC00 && w2 <= 0xDFFF)

{

des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);

return 2;

}

return 0; // the src is invalid

}

else

{

des = w1;

return 1;

}

UTF-16 to UTF-8

template <typename T, typename U>

typename enable_if<check<T, 2>::value && check<U, 1>::value,

size_t>::type_t utf(T src, U* des)

{

// make utf-16 to utf-32

uint32 tmp;

if (utf(&src, tmp) != 1) return 0;

// make utf-32 to utf-8

return utf(tmp, des);

}

UTF-8 to UTF-16

template <typename T, typename U>

typename enable_if<check<T, 1>::value && check<U, 2>::value,

size_t>::type_t utf(const T* src, U& des)

{

// make utf-8 to utf-32

uint32 tmp;

size_t len = utf(src, tmp);

if (len == 0) return 0;

// make utf-32 to utf-16

if (utf(tmp, &des) != 1) return 0;

return len;

}

UTF-X: string to string

template <typename T, typename U>

typename enable_if<check<T, 4>::value && (check<U, 1>::value || check<U, 2>::value),

size_t>::type_t utf(const T* src, U* des) // UTF-32 to UTF-X(8/16)

{

if (!src || (*src) == 0) return 0;

size_t num = 0;

for(; *src; ++src)

{

size_t len = utf(*src, des);

if (len == 0) break;

if (des) des += len;

num += len;

}

if (des) (*des) = 0;

return num;

}

template <typename T, typename U>

typename enable_if<(check<T, 1>::value || check<T, 2>::value) && check<U, 4>::value,

size_t>::type_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-32

{

if (!src || (*src) == 0) return 0;

size_t num = 0;

while(*src)

{

uint32 tmp;

size_t len = utf(src, tmp);

if (len == 0) break;

if (des)

{

(*des) = tmp;

++des;

}

src += len;

num += 1;

}

if (des) (*des) = 0;

return num;

}

template <typename T, typename U>

typename enable_if<(check<T, 1>::value && check<U, 2>::value) ||

(check<T, 2>::value && check<U, 1>::value),

size_t>::type_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-Y(16/8)

{

if (!src || (*src) == 0) return 0;

size_t num = 0;

while(*src)

{

// make utf-x to utf-32

uint32 tmp;

size_t len = utf(src, tmp);

if (len == 0) break;

src += len;

// make utf-32 to utf-y

len = utf(tmp, des);

if (len == 0) break;

if (des) des += len;

num += len;

}

if (des) (*des) = 0;

return num;

}

这样用起来就更加简单了：

// The type of the (null) destination pointer selects the target encoding;
// with a null destination the call only returns the required length.
const char* c = "你好世界";
size_t n = nx::transform::utf(c, static_cast<wchar_t*>(0));

1 2	const char* c = "你好世界"; size_t n = nx::transform::utf(c, (wchar_t*)0);

完整代码请参考：
https://code.google.com/p/nixy/source/browse/trunk/nixycore/string/transform.h

Published by orzz.org(). (https://orzz.org/utf_transform/)

发表回复 取消回复

发表回复取消回复