1 | // Copyright (c) Microsoft Corporation. All rights reserved.
|
2 | // Licensed under the MIT License.
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 |
|
10 |
|
11 |
|
12 |
|
13 |
|
14 |
|
15 |
|
16 | namespace Microsoft::Common::Unicode {
|
17 |
|
18 | // The implementations of the following functions heavily reference the MSDN
|
19 | // article at https://msdn.microsoft.com/en-us/magazine/mt763237.aspx.
|
20 |
|
21 | std::wstring Utf8ToUtf16(const char* utf8, size_t utf8Len) {
|
22 | std::wstring utf16{};
|
23 |
|
24 | // A small optimization.
|
25 | if (utf8Len == 0) {
|
26 | return utf16;
|
27 | }
|
28 |
|
29 | // Extra parentheses needed here to prevent expanding max as a
|
30 | // Windows-specific preprocessor macro.
|
31 | if (utf8Len > static_cast<size_t>((std::numeric_limits<int>::max)())) {
|
32 | throw std::overflow_error("Length of input string to Utf8ToUtf16() must fit into an int.");
|
33 | }
|
34 |
|
35 | const int utf8Length = static_cast<int>(utf8Len);
|
36 |
|
37 | // We do not specify MB_ERR_INVALID_CHARS here, which means that invalid UTF-8
|
38 | // characters are replaced with U+FFFD.
|
39 | constexpr DWORD flags = 0;
|
40 |
|
41 | const int utf16Length = ::MultiByteToWideChar(
|
42 | CP_UTF8, // Source string is in UTF-8.
|
43 | flags, // Conversion flags.
|
44 | utf8, // Source UTF-8 string pointer.
|
45 | utf8Length, // Length of the source UTF-8 string, in chars.
|
46 | nullptr, // Do not convert during this step, instead, request the size
|
47 | 0 // of the destination buffer, in wchar_ts, excluding the
|
48 | // null termination character.
|
49 | );
|
50 |
|
51 | if (utf16Length == 0) {
|
52 | throw UnicodeConversionException(
|
53 | "Cannot get result string length when converting from UTF-8 to UTF-16 "
|
54 | "(MultiByteToWideChar failed).",
|
55 | GetLastError());
|
56 | }
|
57 |
|
58 | // Note that because the length of the input UTF-8 string was explicitly
|
59 | // passed to MultiByteToWideChar (instead of just passing -1 and asking
|
60 | // MultiByteToWideChar to scan the whole input string until a null terminator
|
61 | // is found), MultiByteToWideChar won't add an additional null terminator to
|
62 | // the result string. Therefore, there's no need to invoke
|
63 | // std::wstring::resize with a "utf16Length + 1" value.
|
64 | utf16.resize(utf16Length);
|
65 |
|
66 | // Convert from UTF-8 to UTF-16
|
67 | // Note that MultiByteToWideChar converts the UTF-8 BOM into the UTF-16BE BOM.
|
68 | // So we do not have to do anything extra here to ensure correct BOM behavior.
|
69 | int result = ::MultiByteToWideChar(
|
70 | CP_UTF8, // Source string is in UTF-8.
|
71 | flags, // Conversion flags.
|
72 | utf8, // Source UTF-8 string pointer.
|
73 | utf8Length, // Length of source UTF-8 string, in chars.
|
74 | &utf16[0], // Pointer to destination buffer. This is fine because the
|
75 | // the C++11 standard specifies that the elements of a
|
76 | // std::basic_string are stored continuously.
|
77 | utf16Length // Size of destination buffer, in wchar_ts.
|
78 | );
|
79 |
|
80 | if (result == 0) {
|
81 | throw UnicodeConversionException(
|
82 | "Cannot convert from UTF-8 to UTF-16 (MultiByteToWideChar failed).", GetLastError());
|
83 | }
|
84 |
|
85 | return utf16;
|
86 | }
|
87 |
|
88 | std::wstring Utf8ToUtf16(const char* utf8) {
|
89 | return Utf8ToUtf16(utf8, strlen(utf8));
|
90 | }
|
91 |
|
92 | std::wstring Utf8ToUtf16(const std::string& utf8) {
|
93 | return Utf8ToUtf16(utf8.c_str(), utf8.length());
|
94 | }
|
95 |
|
96 |
|
97 | std::wstring Utf8ToUtf16(const std::string_view& utf8) {
|
98 | return Utf8ToUtf16(utf8.data(), utf8.length());
|
99 | }
|
100 |
|
101 |
|
102 | std::string Utf16ToUtf8(const wchar_t* utf16, size_t utf16Len) {
|
103 | std::string utf8{};
|
104 |
|
105 | // A small optimization.
|
106 | if (utf16Len == 0) {
|
107 | return utf8;
|
108 | }
|
109 |
|
110 | // Extra parentheses needed here to prevent expanding max as a
|
111 | // Windows-specific preprocessor macro.
|
112 | if (utf16Len > static_cast<size_t>((std::numeric_limits<int>::max)())) {
|
113 | throw std::overflow_error("Length of input string to Utf16ToUtf8() must fit into an int.");
|
114 | }
|
115 |
|
116 | const int utf16Length = static_cast<int>(utf16Len);
|
117 |
|
118 | // We do not specify WC_ERR_INVALID_CHARS here, which means that invalid
|
119 | // UTF-16 characters are replaced with U+FFFD.
|
120 | constexpr DWORD flags = 0;
|
121 |
|
122 | const int utf8Length = ::WideCharToMultiByte(
|
123 | CP_UTF8, // Destination string is in UTF-8.
|
124 | flags, // Conversion flags.
|
125 | utf16, // Source UTF-16 string pointer.
|
126 | utf16Length, // Length of the source UTF-16 string, in wchar_ts.
|
127 | nullptr, // Do not convert during this step, instead, request the size
|
128 | 0, // of the destination buffer, in chars, excluding the
|
129 | // null termination character.
|
130 | nullptr, // WideCharToMultiByte requires the last two parameters to be
|
131 | nullptr // nullptrs when converting to UTF-8.
|
132 | );
|
133 |
|
134 | if (utf8Length == 0) {
|
135 | throw UnicodeConversionException(
|
136 | "Cannot get result string length when converting from UTF-16 to UTF-8 "
|
137 | "(WideCharToMultiByte failed).",
|
138 | GetLastError());
|
139 | }
|
140 |
|
141 | // Note that because the length of the input UTF-16 string was explicitly
|
142 | // passed to WideCharToMultiByte (instead of just passing -1 and asking
|
143 | // WideCharToMultiByte to scan the whole input string until a null terminator
|
144 | // is found), WideCharToMultiByte won't add an additional null terminator to
|
145 | // the result string. Therefore, there's no need to invoke
|
146 | // std::string::resize with a "utf8Length + 1" value.
|
147 | utf8.resize(utf8Length);
|
148 |
|
149 | // Convert from UTF-8 to UTF-16
|
150 | // Note that MultiByteToWideChar converts the UTF-8 BOM into the UTF-16BE BOM.
|
151 | // So we do not have to do anything extra here to ensure correct BOM behavior.
|
152 | int result = ::WideCharToMultiByte(
|
153 | CP_UTF8, // Destination string is in UTF-8.
|
154 | flags, // Conversion flags.
|
155 | utf16, // Source UTF-16 string pointer.
|
156 | utf16Length, // Length of the source UTF-16 string, in wchar_ts.
|
157 | &utf8[0], // Pointer to destination buffer. This is fine because the
|
158 | // the C++11 standard specifies that the elements of a
|
159 | // std::basic_string are stored continuously.
|
160 | utf8Length, // Size of destination buffer, in chars.
|
161 | nullptr, // WideCharToMultiByte requires the last two parameters to be
|
162 | nullptr // nullptrs when converting to UTF-8.
|
163 | );
|
164 |
|
165 | if (result == 0) {
|
166 | throw UnicodeConversionException(
|
167 | "Cannot convert from UTF-16 to UTF-8 (WideCharToMultiByte failed).", GetLastError());
|
168 | }
|
169 |
|
170 | return utf8;
|
171 | }
|
172 |
|
173 | std::string Utf16ToUtf8(const char16_t* utf16, size_t utf16Len) {
|
174 | return Utf16ToUtf8(Utilities::CheckedReinterpretCast<const wchar_t*>(utf16), utf16Len);
|
175 | }
|
176 |
|
177 | std::string Utf16ToUtf8(const wchar_t* utf16) {
|
178 | return Utf16ToUtf8(utf16, wcslen(utf16));
|
179 | }
|
180 |
|
181 | std::string Utf16ToUtf8(const char16_t* utf16) {
|
182 | return Utf16ToUtf8(utf16, std::char_traits<char16_t>::length(utf16));
|
183 | }
|
184 |
|
185 | std::string Utf16ToUtf8(const std::wstring& utf16) {
|
186 | return Utf16ToUtf8(utf16.c_str(), utf16.length());
|
187 | }
|
188 |
|
189 | std::string Utf16ToUtf8(const std::u16string& utf16) {
|
190 | return Utf16ToUtf8(Utilities::CheckedReinterpretCast<const wchar_t*>(utf16.c_str()), utf16.length());
|
191 | }
|
192 |
|
193 |
|
194 | std::string Utf16ToUtf8(const std::wstring_view& utf16) {
|
195 | return Utf16ToUtf8(utf16.data(), utf16.length());
|
196 | }
|
197 |
|
198 | std::string Utf16ToUtf8(const std::u16string_view& utf16) {
|
199 | return Utf16ToUtf8(Utilities::CheckedReinterpretCast<const wchar_t*>(utf16.data()), utf16.length());
|
200 | }
|
201 |
|
202 |
|
203 | } // namespace Microsoft::Common::Unicode
|