#include "string_conv.h"
#include <stdexcept>
#include <string>

using namespace fnd;

/**
 * @brief Convert a UTF-8 encoded string to UTF-16.
 *
 * The input is first decoded into a sequence of code points, which is then
 * re-encoded as UTF-16 (code points at or above kUtf16NonNativeStart become
 * a high/low surrogate pair).
 *
 * @param in  UTF-8 encoded byte string.
 * @return    The UTF-16 encoding of @p in.
 * @throws std::logic_error ("not a UTF-8 string") when a lead byte has an
 *         invalid prefix, a sequence is truncated, a continuation byte is
 *         missing its prefix, or the decoded value lies in the surrogate
 *         range or exceeds kUtf16EncodeMax.
 */
std::u16string StringConv::ConvertChar8ToChar16(const std::string & in)
{
	// Stage 1: decode UTF-8 into code points.
	std::u32string unicode;
	unicode.reserve(in.length());

	size_t done = 0;
	for (size_t i = 0; i < in.length(); i += done)
	{
		// get number of leading high bits in first byte
		uint8_t prefix = get_utf8_prefix(in[i]);
		if (prefix == 1 || prefix > 4) // 1 is reserved for trailer bytes
		{
			throw std::logic_error("not a UTF-8 string");
		}

		// if there are no prefix bits, this is ASCII
		if (prefix == 0)
		{
			unicode.push_back(in[i]);
			done = 1;
			continue;
		}

		// otherwise this is a multibyte character:
		// there must be enough bytes left for the whole sequence
		if ((i + prefix) > in.length())
		{
			throw std::logic_error("not a UTF-8 string");
		}

		// NOTE(review): overlong encodings (a longer sequence than the value
		// needs) are not rejected here.
		char32_t uni = get_utf8_data(prefix, in[i]);
		for (uint8_t j = 1; j < prefix; j++)
		{
			if (utf8_has_prefix(1, in[i + j]) == false)
			{
				throw std::logic_error("not a UTF-8 string");
			}

			// make room for the 6 payload bits of this continuation byte
			uni <<= 6;
			uni |= get_utf8_data(1, in[i + j]);
		}

		// surrogate code points must never appear in UTF-8
		if (uni >= kUtf16HighSurrogateStart && uni <= kUtf16LowSurrogateEnd)
		{
			throw std::logic_error("not a UTF-8 string");
		}

		// the value must be representable in UTF-16
		if (uni > kUtf16EncodeMax)
		{
			throw std::logic_error("not a UTF-8 string");
		}

		unicode.push_back(uni);
		done = prefix;
	}

	// Stage 2: encode code points as UTF-16.
	std::u16string utf16;
	utf16.reserve(unicode.size());
	for (size_t i = 0; i < unicode.size(); i++)
	{
		char32_t uni = unicode[i];
		if (uni < kUtf16NonNativeStart)
		{
			// fits in a single UTF-16 code unit
			utf16.push_back(static_cast<char16_t>(uni));
		}
		else
		{
			// split the offset from kUtf16NonNativeStart across a surrogate pair
			uni -= kUtf16NonNativeStart;
			utf16.push_back(static_cast<char16_t>(((uni >> kUtf16SurrogateBits) & kUtf16SurrogateMask) + kUtf16HighSurrogateStart));
			utf16.push_back(static_cast<char16_t>((uni & kUtf16SurrogateMask) + kUtf16LowSurrogateStart));
		}
	}

	return utf16;
}

/**
 * @brief Convert a UTF-16 encoded string to UTF-8.
 *
 * The input is first decoded into a sequence of code points (surrogate pairs
 * are recombined), which is then re-encoded as 1- to 4-byte UTF-8 sequences.
 *
 * @param in  UTF-16 encoded string.
 * @return    The UTF-8 encoding of @p in.
 * @throws std::logic_error ("not a UTF-16 string") when a low surrogate
 *         appears without a preceding high surrogate, a high surrogate is
 *         not followed by a low surrogate, or a code point exceeds
 *         kUtf84ByteEnd.
 */
std::string StringConv::ConvertChar16ToChar8(const std::u16string & in)
{
	// Stage 1: decode UTF-16 into code points.
	std::u32string unicode;
	unicode.reserve(in.length());

	size_t done = 0;
	for (size_t i = 0; i < in.length(); i += done)
	{
		// this isn't a utf16 reserved character, so just add to unicode string
		if (in[i] < kUtf16HighSurrogateStart || in[i] > kUtf16LowSurrogateEnd)
		{
			unicode.push_back(in[i]);
			done = 1;
			continue;
		}

		// otherwise we need to decode a surrogate pair:
		// check that the high surrogate char exists first
		if (in[i] < kUtf16HighSurrogateStart || in[i] > kUtf16HighSurrogateEnd)
		{
			throw std::logic_error("not a UTF-16 string");
		}

		// check that the low surrogate char exists next
		if (i + 1 >= in.length() || in[i + 1] < kUtf16LowSurrogateStart || in[i + 1] > kUtf16LowSurrogateEnd)
		{
			throw std::logic_error("not a UTF-16 string");
		}

		// Recombine the two payloads and add back the offset that the UTF-16
		// encoder subtracts. This must be an addition: OR-ing the offset in
		// loses the carry whenever the payload already has that bit set.
		char32_t uni = static_cast<char32_t>((((in[i] & kUtf16SurrogateMask) << kUtf16SurrogateBits) | (in[i + 1] & kUtf16SurrogateMask)) + kUtf16NonNativeStart);

		unicode.push_back(uni);
		done = 2;
	}

	// Stage 2: encode code points as UTF-8.
	std::string utf8;
	utf8.reserve(unicode.length());
	for (size_t i = 0; i < unicode.length(); i++)
	{
		if (unicode[i] <= kUtf8AsciiEnd)
		{
			utf8.push_back(unicode[i]);
		}
		else if (unicode[i] <= kUtf82ByteEnd)
		{
			utf8.push_back(make_utf8(2, (unicode[i] >> 6)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 0)));
		}
		else if (unicode[i] <= kUtf83ByteEnd)
		{
			utf8.push_back(make_utf8(3, (unicode[i] >> 12)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 6)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 0)));
		}
		else if (unicode[i] <= kUtf84ByteEnd)
		{
			utf8.push_back(make_utf8(4, (unicode[i] >> 18)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 12)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 6)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 0)));
		}
		else
		{
			throw std::logic_error("not a UTF-16 string");
		}
	}

	return utf8;
}