UTF-8 Encoding/Decoding (C++)
UTF-8 is a character encoding which supports an amazing number of characters
(the full Unicode set) and allows text to be written and transmitted in almost
any script of the world, while still maintaining efficiency. It is quickly
becoming the standard character encoding on computers as well as on the
internet. See here [external link] for a detailed explanation of what UTF-8 is
and how it works. The code snippets in this section provide encoding and
decoding of UTF-8 text strings as well as various utility functions.
Each character in the Unicode character set is identified by a numerical value.
UTF-8 strings are strings of bytes, where one or more bytes may be used for
encoding a single character, depending on its numerical value. For the purpose
of this code, decoded Unicode characters are represented as unsigned integers,
while encoded UTF-8 characters are just standard C++ strings.
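For reference, these are the byte patterns used by the (original, six-byte)
form of UTF-8; the range limits correspond exactly to the thresholds checked
in the encoder below:

1 byte:  0xxxxxxx                             (code values below 128)
2 bytes: 110xxxxx 10xxxxxx                    (below 2048)
3 bytes: 1110xxxx 10xxxxxx 10xxxxxx           (below 65536)
4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (below 2097152)
5 bytes: 111110xx followed by four continuation bytes  (below 67108864)
6 bytes: 1111110x followed by five continuation bytes  (below 2147483648)

Note that the current UTF-8 specification (RFC 3629) restricts sequences to at
most four bytes; the five- and six-byte forms are handled here anyway, since
they appear in the original definition of the encoding.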
Single Character Encoder
The following code encodes a single Unicode character into its corresponding
UTF-8 byte string. Ugly, maybe, but necessary. ;-)
#include <iostream>
#include <string>

std::string uint_to_utf8(unsigned int uniChar)
{
    std::string result;
    if (uniChar < 128U)
        // 1 byte: 0xxxxxxx
        result.append(1, static_cast<unsigned char>(uniChar));
    else if (uniChar < 2048U)
    {
        // 2 bytes: 110xxxxx 10xxxxxx
        result.append(1, static_cast<unsigned char>((uniChar >> 6) | 0xc0));
        result.append(1, static_cast<unsigned char>((uniChar & 0x3f) | 0x80));
    } else if (uniChar < 65536U)
    {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        result.append(1, static_cast<unsigned char>((uniChar >> 12) | 0xe0));
        result.append(1, static_cast<unsigned char>(((uniChar >> 6) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>((uniChar & 0x3f) | 0x80));
    } else if (uniChar < 2097152U)
    {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        result.append(1, static_cast<unsigned char>((uniChar >> 18) | 0xf0));
        result.append(1, static_cast<unsigned char>(((uniChar >> 12) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>(((uniChar >> 6) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>((uniChar & 0x3f) | 0x80));
    } else if (uniChar < 67108864U)
    {
        // 5 bytes: 111110xx followed by four continuation bytes
        result.append(1, static_cast<unsigned char>((uniChar >> 24) | 0xf8));
        result.append(1, static_cast<unsigned char>(((uniChar >> 18) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>(((uniChar >> 12) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>(((uniChar >> 6) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>((uniChar & 0x3f) | 0x80));
    } else if (uniChar < 2147483648U)
    {
        // 6 bytes: 1111110x followed by five continuation bytes
        result.append(1, static_cast<unsigned char>((uniChar >> 30) | 0xfc));
        result.append(1, static_cast<unsigned char>(((uniChar >> 24) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>(((uniChar >> 18) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>(((uniChar >> 12) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>(((uniChar >> 6) & 0x3f) | 0x80));
        result.append(1, static_cast<unsigned char>((uniChar & 0x3f) | 0x80));
    } else
        std::cerr << "[uint_to_utf8] WARNING: Character not representable by UTF-8."
            << std::endl;
    return result;
}
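As a quick sanity check (not part of the snippet above, just a minimal usage
sketch), the following program encodes the Euro sign (U+20AC) and prints the
resulting bytes in hexadecimal; the expected output is e2 82 ac.

#include <iomanip>
#include <iostream>
#include <string>

int main()
{
    // U+20AC (Euro sign) should encode to the three bytes 0xE2 0x82 0xAC.
    std::string euro = uint_to_utf8(0x20ac);
    for (unsigned int i = 0; i < euro.size(); i++)
        std::cout << std::hex << std::setw(2) << std::setfill('0')
            << static_cast<unsigned int>(static_cast<unsigned char>(euro[i]))
            << " ";
    std::cout << std::endl;
    return 0;
}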
Single Character Decoder
The following code decodes a single Unicode character from a UTF-8 byte
sequence. The string passed in is expected to contain exactly one encoded
character; its length determines how the bytes are interpreted.
#include <iostream>
#include <string>

bool utf8_to_uint(const std::string& bytes, unsigned int& target)
{
    unsigned int size = bytes.size();
    target = 0;
    bool result = true;
    if (size == 1)
    {
        unsigned char byte = bytes[0];
        if ((byte >> 7) != 0)
        {
            std::cerr << "[utf8_to_uint] WARNING: Invalid single-byte character."
                << std::endl;
            result = false;
        } else
            target = byte;
    } else
    {
        // The lead byte contributes its low (7 - size) bits, shifted to the top.
        unsigned char byte = bytes[0];
        target = ((byte & (0xff >> (size + 1))) << (6 * (size - 1)));
        unsigned int i = 1;
        while (result && (i < size))
        {
            byte = bytes[i];
            // Each continuation byte must have the form 10xxxxxx.
            if ((byte >> 6) != 2)
            {
                std::cerr << "[utf8_to_uint] WARNING: Invalid byte ("
                    << static_cast<unsigned int>(byte)
                    << ") in UTF-8 sequence at position " << i << "."
                    << std::endl;
                result = false;
            } else
                target |= ((byte & 0x3f) << (6 * (size - 1 - i)));
            i++;
        }
    }
    return result;
}
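And the reverse direction, again just a minimal usage sketch: the three bytes
of the Euro sign are decoded back into the code value 8364 (0x20AC).

#include <iostream>
#include <string>

int main()
{
    // The UTF-8 encoding of U+20AC (Euro sign).
    std::string bytes("\xe2\x82\xac");
    unsigned int uniChar = 0;
    if (utf8_to_uint(bytes, uniChar))
        std::cout << "Code value: " << uniChar << std::endl; // prints 8364
    return 0;
}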
String Encoder
This snippet can be used to encode a whole string of Unicode characters,
specified as a vector of code values, into a UTF-8 string. There is really
nothing special about it - it just calls the single-character version of
uint_to_utf8() once for each character.
#include <string>
#include <vector>

void uint_to_utf8(const std::vector<unsigned int>& uniChars,
    std::string& target)
{
    target = "";
    for (unsigned int i = 0; i < uniChars.size(); i++)
        target.append(uint_to_utf8(uniChars[i]));
}
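A minimal usage sketch: encoding the three characters 'H' (U+0048), 'ä'
(U+00E4) and the Euro sign (U+20AC) yields a six-byte UTF-8 string
(48 c3 a4 e2 82 ac).

#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<unsigned int> uniChars;
    uniChars.push_back(0x48);   // 'H'
    uniChars.push_back(0xe4);   // 'ä'
    uniChars.push_back(0x20ac); // Euro sign
    std::string encoded;
    uint_to_utf8(uniChars, encoded);
    // 'encoded' now holds the six bytes 48 c3 a4 e2 82 ac.
    std::cout << "Encoded length: " << encoded.size() << " bytes" << std::endl;
    return 0;
}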
String Decoder
Now, here is the UTF-8 string decoder, which decodes a standard string from
UTF-8 encoding into a vector of numerical Unicode values. This function
requires additional code which determines the byte length of a single UTF-8
character from its lead byte. This is necessary since a character may require
more than one byte to be represented in UTF-8 encoding. The size is calculated
by the utf8_get_size() function.
#include <iostream>
#include <string>
#include <vector>

unsigned int utf8_get_size(unsigned char byte)
{
    // Determine the length of a UTF-8 sequence from its lead byte.
    if (byte < 128)
        return 1;
    else if ((byte & 0xe0) == 0xc0)
        return 2;
    else if ((byte & 0xf0) == 0xe0)
        return 3;
    else if ((byte & 0xf8) == 0xf0)
        return 4;
    else if ((byte & 0xfc) == 0xf8)
        return 5;
    else if ((byte & 0xfe) == 0xfc)
        return 6;
    std::cerr << "[utf8_get_size] WARNING: Invalid character size."
        << std::endl;
    return 0;
}
bool utf8_to_uint(const std::string& bytes, std::vector<unsigned int>& target)
{
    unsigned int size = bytes.size();
    unsigned int i = 0;
    unsigned int charSize = 0;
    unsigned int currentChar = 0;
    bool result = true;
    target.clear();
    while (result && (i < size))
    {
        // Determine how many bytes the next character occupies, then decode it.
        charSize = utf8_get_size(bytes[i]);
        if ((charSize > 0)
            && ((i + charSize) <= size)
            && utf8_to_uint(bytes.substr(i, charSize), currentChar))
            i += charSize;
        else
        {
            std::cerr << "[utf8_to_uint] WARNING: Could not convert UTF-8 character "
                "(size = " << charSize << ", position = " << i << ")."
                << std::endl;
            result = false;
        }
        if (result)
            target.push_back(currentChar);
    }
    return result;
}
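Again a minimal usage sketch: decoding the six-byte UTF-8 string from the
encoder example above yields the three code values 72, 228 and 8364.

#include <iostream>
#include <string>
#include <vector>

int main()
{
    // UTF-8 encoding of "Hä€" (U+0048, U+00E4, U+20AC).
    std::string bytes("H\xc3\xa4\xe2\x82\xac");
    std::vector<unsigned int> uniChars;
    if (utf8_to_uint(bytes, uniChars))
        for (unsigned int i = 0; i < uniChars.size(); i++)
            std::cout << uniChars[i] << std::endl; // prints 72, 228, 8364
    return 0;
}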
Length of a UTF-8 string
There may be cases where it is necessary to know the length of a UTF-8 string
in characters (not bytes). Unfortunately, this is not possible without reading
the entire string - unless the length is calculated when the string is created
and then stored somewhere along with the encoded data - but that's cheating. So
here is a function that calculates the length of a UTF-8 string by looking only
at the lead byte of each character, without actually decoding it.
#include <string>

unsigned int utf8_get_size(const std::string& bytes)
{
    unsigned int size = bytes.size();
    unsigned int i = 0;
    unsigned int charLen = 0;
    unsigned int result = 0;
    bool error = false;
    while (!error && (i < size))
    {
        // Skip ahead by the byte length of each character, counting characters.
        charLen = utf8_get_size(bytes[i]);
        if (charLen > 0)
        {
            result++;
            i += charLen;
        } else
            error = true;
    }
    if (error)
        return 0;
    return result;
}
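And one last minimal usage sketch: the same six-byte string as in the decoder
example has a character length of three.

#include <iostream>
#include <string>

int main()
{
    // UTF-8 encoding of "Hä€": six bytes, but only three characters.
    std::string bytes("H\xc3\xa4\xe2\x82\xac");
    std::cout << "Bytes: " << bytes.size()
        << ", characters: " << utf8_get_size(bytes) << std::endl;
    return 0;
}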