11

I'm looking for a collection of functions for performing UTF character conversion in C++11. It should include conversion to and from any of utf8, utf16, and utf32. A function for recognizing byte order marks would be helpful, too.

Lightness Races in Orbit
386k77 gold badges668 silver badges1.1k bronze badges
asked Jul 31, 2016 at 21:10

3 Answers 3

14

Update: The functions listed here are maintained in a GitHub repo (.hpp, .cpp and tests). Some UTF-16 functions have been disabled because they do not work correctly. The "banana" tests in the utf.test.cpp file demonstrate the problem.

Also included a "read_with_bom" function for recognizing byte order marks.

#if _MSC_VER == 1900 //work around for bug in MS Visual C++ 2015 https://social.msdn.microsoft.com/Forums/en-US/8f40dcd8-c67f-4eba-9134-a19b9178e481/vs-2015-rc-linker-stdcodecvt-error?forum=vcgeneral
// MSVC 2015 variant (this branch is compiled only when _MSC_VER == 1900):
// encodes a char16_t string as UTF-8 bytes. int16_t replaces char16_t as the
// converter's element type to work around the std::codecvt linker error
// referenced on the #if line.
// NOTE(review): std::codecvt_utf8 with a 16-bit element type is documented as
// a UTF-8 <-> UCS-2 facet, so surrogate pairs (code points above U+FFFF) are
// presumably not handled here - confirm, and consider std::codecvt_utf8_utf16.
std::string to_utf8(const std::u16string &s)
{
 std::wstring_convert<std::codecvt_utf8<int16_t>, int16_t> convert;
 // View the char16_t storage as int16_t so it matches the converter's element type.
 auto p = reinterpret_cast<const int16_t *>(s.data());
 // Convert the half-open range [p, p + size) of 16-bit units.
 return convert.to_bytes(p, p + s.size());
}
// MSVC 2015 variant (this branch is compiled only when _MSC_VER == 1900):
// encodes a char32_t string as UTF-8 bytes. int32_t replaces char32_t as the
// converter's element type to work around the std::codecvt linker error
// referenced on the #if line.
std::string to_utf8(const std::u32string &s)
{
 std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
 // View the char32_t storage as int32_t so it matches the converter's element type.
 auto p = reinterpret_cast<const int32_t *>(s.data());
 // Convert the half-open range [p, p + size) of 32-bit units.
 return convert.to_bytes(p, p + s.size());
}
// MSVC 2015 variant (this branch is compiled only when _MSC_VER == 1900):
// decodes UTF-8 bytes into a char16_t string. The conversion runs on int16_t
// elements to work around the std::codecvt linker error referenced on the
// #if line; the result is then copied into a std::u16string.
// NOTE(review): std::codecvt_utf8 with a 16-bit element type is documented as
// a UTF-8 <-> UCS-2 facet, so 4-byte UTF-8 sequences are presumably not turned
// into surrogate pairs - confirm, and consider std::codecvt_utf8_utf16.
std::u16string to_utf16(const std::string &s)
{
 std::wstring_convert<std::codecvt_utf8<int16_t>, int16_t> convert;
 // Intermediate result: a basic_string of int16_t units.
 auto asInt = convert.from_bytes(s);
 // Re-type the int16_t buffer as char16_t and copy it out with its length.
 return std::u16string(reinterpret_cast<char16_t const *>(asInt.data()), asInt.length());
}
// MSVC 2015 variant (this branch is compiled only when _MSC_VER == 1900):
// decodes UTF-8 bytes into a char32_t string. The conversion runs on int32_t
// elements to work around the std::codecvt linker error referenced on the
// #if line; the result is then copied into a std::u32string.
std::u32string to_utf32(const std::string &s)
{
 std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
 // Intermediate result: a basic_string of int32_t units.
 auto asInt = convert.from_bytes(s);
 // Re-type the int32_t buffer as char32_t and copy it out with its length.
 return std::u32string(reinterpret_cast<char32_t const *>(asInt.data()), asInt.length());
}
#else
// Encodes a UTF-16 string as UTF-8.
//
// Uses std::codecvt_utf8_utf16, the facet that converts between UTF-8 and
// UTF-16 and therefore understands surrogate pairs. std::codecvt_utf8 with a
// char16_t element only covers UCS-2, so code points above U+FFFF were not
// converted correctly.
//
// @param s  UTF-16 code units (surrogate pairs allowed).
// @return   The same text encoded as UTF-8 bytes.
// @throws std::range_error  (from std::wstring_convert) if s is not valid UTF-16.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
    return conv.to_bytes(s);
}
// Encodes a UTF-32 string as UTF-8 bytes via std::wstring_convert.
//
// @param s  UTF-32 code points.
// @return   The same text encoded as UTF-8.
std::string to_utf8(const std::u32string &s)
{
    using utf8_facet = std::codecvt_utf8<char32_t>;
    std::wstring_convert<utf8_facet, char32_t> converter;
    std::string encoded = converter.to_bytes(s);
    return encoded;
}
// Decodes UTF-8 bytes into a UTF-16 string.
//
// Uses std::codecvt_utf8_utf16, the facet that converts between UTF-8 and
// UTF-16 and therefore emits surrogate pairs for 4-byte UTF-8 sequences.
// std::codecvt_utf8 with a char16_t element only covers UCS-2, so code points
// above U+FFFF were not converted correctly.
//
// @param s  UTF-8 encoded bytes.
// @return   The same text as UTF-16 code units.
// @throws std::range_error  (from std::wstring_convert) if s is not valid UTF-8.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.from_bytes(s);
}
// Decodes UTF-8 bytes into a UTF-32 string via std::wstring_convert.
//
// @param s  UTF-8 encoded bytes.
// @return   One char32_t per decoded code point.
std::u32string to_utf32(const std::string &s)
{
    using utf8_facet = std::codecvt_utf8<char32_t>;
    std::wstring_convert<utf8_facet, char32_t> converter;
    std::u32string decoded = converter.from_bytes(s);
    return decoded;
}
#endif
std::u16string to_utf16(const std::u32string &s)
{
 return to_utf16(to_utf8(s));
}
std::u32string to_utf32(const std::u16string &s) {
 return to_utf32(to_utf8(s));
}
// Reads the whole stream, detects the encoding from a leading byte order mark
// and returns the text, without the BOM, as UTF-32.
//
// Recognised marks: UTF-32 BE/LE, UTF-16 BE/LE and UTF-8. Input that carries
// no BOM is passed to the UTF-8 decoder unchanged.
//
// @param src  Stream that is consumed up to end-of-file.
// @return     The decoded code points.
// @throws std::logic_error  if a UTF-32 payload is not a multiple of 4 bytes
//                           or a UTF-16 payload is not a multiple of 2 bytes.
//                           The to_utf32 helpers used for UTF-16 and UTF-8
//                           input may throw on malformed text as well.
std::u32string read_with_bom(std::istream & src)
{
    enum encoding {
        encoding_utf32be = 0,
        encoding_utf32le,
        encoding_utf16be,
        encoding_utf16le,
        encoding_utf8,
        encoding_ascii,
    };
    // Indexed by the enum above. The 4-byte UTF-32 marks have to be tried
    // before the 2-byte UTF-16 marks because FF FE is a prefix of FF FE 00 00.
    // (Consequently UTF-16LE text whose first character is U+0000 is taken
    // for UTF-32LE; the two cannot be told apart from the bytes alone.)
    const std::vector<std::string> boms = {
        std::string("\x00\x00\xFE\xFF", 4),
        std::string("\xFF\xFE\x00\x00", 4),
        std::string("\xFE\xFF", 2),
        std::string("\xFF\xFE", 2),
        std::string("\xEF\xBB\xBF", 3)
    };
    std::string buffer((std::istreambuf_iterator<char>(src)), std::istreambuf_iterator<char>());

    encoding enc = encoding_ascii;
    for (std::string::size_type i = 0; i < boms.size(); ++i) {
        const std::string & bom = boms[i];
        if (buffer.compare(0, bom.length(), bom) == 0) {
            enc = static_cast<encoding>(i);
            buffer.erase(0, bom.length());
            break;
        }
    }

    // Bytes must be widened through unsigned char: with a signed plain char a
    // byte >= 0x80 would sign-extend and set every higher bit of the result.
    const auto byte_at = [&buffer](std::string::size_type pos) -> char32_t {
        return static_cast<char32_t>(static_cast<unsigned char>(buffer[pos]));
    };

    switch (enc) {
    case encoding_utf32be:
    case encoding_utf32le:
    {
        if (buffer.length() % 4 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 4");
        }
        const bool big_endian = (enc == encoding_utf32be);
        const std::string::size_type count = buffer.length() / 4;
        std::u32string result(count, U'\0');
        for (std::string::size_type i = 0; i < count; ++i) {
            const std::string::size_type at = i * 4;
            result[i] = big_endian
                ? static_cast<char32_t>((byte_at(at) << 24) | (byte_at(at + 1) << 16) | (byte_at(at + 2) << 8) | byte_at(at + 3))
                : static_cast<char32_t>((byte_at(at + 3) << 24) | (byte_at(at + 2) << 16) | (byte_at(at + 1) << 8) | byte_at(at));
        }
        return result;
    }
    case encoding_utf16be:
    case encoding_utf16le:
    {
        if (buffer.length() % 2 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 2");
        }
        const bool big_endian = (enc == encoding_utf16be);
        const std::string::size_type count = buffer.length() / 2;
        std::u16string units(count, u'\0');
        for (std::string::size_type i = 0; i < count; ++i) {
            const std::string::size_type at = i * 2;
            units[i] = big_endian
                ? static_cast<char16_t>((byte_at(at) << 8) | byte_at(at + 1))
                : static_cast<char16_t>((byte_at(at + 1) << 8) | byte_at(at));
        }
        return to_utf32(units);
    }
    default:
        // UTF-8 BOM (already stripped) or no BOM at all.
        return to_utf32(buffer);
    }
}
answered Jul 31, 2016 at 21:10
8
  • 2
    Consider posting this to codereview.stackexchange.com for code review if everything works properly. Commented Jul 31, 2016 at 21:19
  • I came to stack overflow to ask this question, and found there was no answer. I think I've used the site correctly. Commented Jul 31, 2016 at 21:20
  • 2
    I'm not saying you've used stack overflow incorrectly; I think this is a good question to ask on code review. Commented Jul 31, 2016 at 21:21
  • Why does your UTF-16->UTF-8 use uint16_t instead of char16_t? Same goes for char32_t and UTF-32. Is that required somewhere? Commented Aug 1, 2016 at 1:52
  • 1
    error C4996: 'std::codecvt_utf8<char32_t,1114111,0>': warning STL4017: std::wbuffer_convert, std::wstring_convert, and the <codecvt> header (containing std::codecvt_mode, std::codecvt_utf8, std::codecvt_utf16, and std::codecvt_utf8_utf16) are deprecated in C++17. (The std::codecvt class template is NOT deprecated.) The C++ Standard doesn't provide equivalent non-deprecated functionality; consider using MultiByteToWideChar() and WideCharToMultiByte() from <Windows.h> instead. ... C++17 in 2019 might require an update to this answer. Commented Aug 10, 2019 at 0:31
3

I've written a little utf_ranges library for doing just this. It uses Range-V3 and C++14.

It has both views and actions (if you're familiar with Range-V3 terminology) for converting between any of the three main UTF encodings, can consume and generate byte order marks, and can perform endian conversion based on a BOM. For example, reading a file from unknown-endian UTF-16 into a UTF-8 std::string, converting any of the seven Unicode line endings to \n, looks like this:

// Open the file in binary mode.
std::ifstream source{path, std::ios::binary};
// Per the description of this example: read the stream as UTF-16 code units,
// consume the byte order mark (endianness is taken from it), re-encode as
// UTF-8 and convert line endings to \n; the result is stored in a std::string.
std::string str = utf::istreambuf<char16_t>(source)
 | utf::view::consume_bom
 | utf::view::utf8
 | utf::view::line_end_convert;
answered Aug 6, 2016 at 1:40
1
  • Pretty nice! Any possibility for conversion to a one-header & one-source version? Commented Sep 25, 2016 at 20:23
0

Here's my UTF-8 code from Baby X (https://github.com/MalcolmMcLean/babyx)

/*
 * Value to subtract from the raw sum built by bbx_utf8_getch, indexed by the
 * number of trailing bytes. Each entry is the combined contribution of the
 * lead-byte tag bits and the 0x80 tag of every continuation byte, e.g.
 * entry 1 is (0xC0 << 6) + 0x80 = 0x3080.
 */
static const unsigned int offsetsFromUTF8[6] = 
{
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/*
 * Number of bytes that follow a given lead byte: 0 for 0x00-0xBF, 1 for
 * 0xC0-0xDF, 2 for 0xE0-0xEF, 3 for 0xF0-0xF7, 4 for 0xF8-0xFB and 5 for
 * 0xFC-0xFF.
 */
static const unsigned char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Length in bytes of the sequence introduced by the lead byte at *utf8
 * (1 to 6), taken from the lead byte alone; the following bytes are not
 * inspected.
 */
int bbx_utf8_skip(const char *utf8)
{
    return trailingBytesForUTF8[(unsigned char) *utf8] + 1;
}

/*
 * Decodes the character that starts at utf8 and returns its code point.
 *
 * Reads the lead byte plus the number of trailing bytes the lead byte
 * announces, without validating them, so the caller must make sure those
 * bytes exist. Only sequences of up to four bytes are decoded: for lead bytes
 * 0xF8-0xFF no switch case matches and the returned value is not a
 * meaningful code point.
 */
int bbx_utf8_getch(const char *utf8)
{
    int ch;
    int nb;

    nb = trailingBytesForUTF8[(unsigned char)*utf8];
    ch = 0;
    switch (nb) 
    {
        /* these fall through deliberately */
        case 3: ch += (unsigned char)*utf8++; ch <<= 6;
        case 2: ch += (unsigned char)*utf8++; ch <<= 6;
        case 1: ch += (unsigned char)*utf8++; ch <<= 6;
        case 0: ch += (unsigned char)*utf8++;
    }
    /* strip the tag bits that were accumulated together with the payload */
    ch -= offsetsFromUTF8[nb];
    return ch;
}

/*
 * Tests whether a NUL-terminated string is well-formed UTF-8.
 *
 * Rejects stray continuation bytes, sequences longer than four bytes,
 * truncated sequences, overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF.
 *
 * Returns 1 if the whole string is valid (the empty string is valid),
 * 0 otherwise.
 */
int bbx_isutf8z(const char *str)
{
    while (*str)
    {
        int nb = bbx_utf8_skip(str);
        int ch;
        int i;

        if (nb < 1 || nb > 4)
            return 0;
        /*
         * Every trailing byte must look like 10xxxxxx. The NUL terminator
         * does not, so a truncated sequence is rejected here before any
         * byte past the end of the string is read.
         */
        for (i = 1; i < nb; i++)
            if ((str[i] & 0xC0) != 0x80)
                return 0;
        ch = bbx_utf8_getch(str);
        /* the value must need exactly nb bytes, otherwise it is overlong */
        if (ch < 0x80)
        {
            if (nb != 1)
                return 0;
        }
        else if (ch < 0x800)
        {
            if (nb != 2)
                return 0;
        }
        else if (ch < 0x10000)
        {
            if (nb != 3)
                return 0;
            /* surrogates are reserved for UTF-16 and invalid in UTF-8 */
            if (ch >= 0xD800 && ch <= 0xDFFF)
                return 0;
        }
        else if (ch < 0x110000)
        {
            if (nb != 4)
                return 0;
        }
        else
        {
            /* beyond U+10FFFF, the last Unicode code point */
            return 0;
        }
        str += nb;
    }
    return 1;
}
/*
 * Writes the UTF-8 encoding of ch to out (no terminator is appended).
 *
 * out must have room for up to four bytes.
 * Returns the number of bytes written, or 0 if ch is 0x110000 or larger
 * (nothing is written in that case).
 */
int bbx_utf8_putch(char *out, int ch)
{
    if (ch >= 0x110000)
        return 0;

    if (ch < 0x80)
    {
        /* single byte, stored as-is */
        out[0] = (char)ch;
        return 1;
    }
    if (ch < 0x800)
    {
        /* 110xxxxx 10xxxxxx */
        out[0] = (char)((ch >> 6) | 0xC0);
        out[1] = (char)((ch & 0x3F) | 0x80);
        return 2;
    }
    if (ch < 0x10000)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        out[0] = (char)((ch >> 12) | 0xE0);
        out[1] = (char)(((ch >> 6) & 0x3F) | 0x80);
        out[2] = (char)((ch & 0x3F) | 0x80);
        return 3;
    }
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    out[0] = (char)((ch >> 18) | 0xF0);
    out[1] = (char)(((ch >> 12) & 0x3F) | 0x80);
    out[2] = (char)(((ch >> 6) & 0x3F) | 0x80);
    out[3] = (char)((ch & 0x3F) | 0x80);
    return 4;
}
/*
 * Number of bytes the UTF-8 encoding of ch occupies: 1 below 0x80, 2 below
 * 0x800, 3 below 0x10000, 4 below 0x110000, and 0 for anything larger.
 */
int bbx_utf8_charwidth(int ch)
{
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    if (ch >= 0x80)
        return 2;
    return 1;
}
/*
 * Counts the characters in a NUL-terminated UTF-8 string.
 *
 * Each lead byte counts as one character and the number of bytes to skip is
 * taken from bbx_utf8_skip; the sequences themselves are not validated.
 * Returns the number of characters, 0 for the empty string.
 */
int bbx_utf8_Nchars(const char *utf8)
{
    int answer = 0;

    while (*utf8)
    {
        int nb = bbx_utf8_skip(utf8);

        /*
         * Advance one byte at a time and stop at the terminator: if the last
         * sequence is truncated, jumping by its declared length would step
         * over the NUL and keep reading past the end of the string.
         */
        while (nb > 0 && *utf8)
        {
            utf8++;
            nb--;
        }
        answer++;
    }
    return answer;
}
answered Feb 4, 2018 at 19:39

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.