UNPKG

7.7 kBtext/x-cView Raw
1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3#include "pch.h"
4#include "Unicode.h"
5#include "Utilities.h"
6
7#include "windows.h"
8
9#include "stringapiset.h"
10
11#include <cassert>
12#include <cstring>
13#include <exception>
14#include <string>
15
16namespace Microsoft::Common::Unicode {
17
18 // The implementations of the following functions heavily reference the MSDN
19 // article at https://msdn.microsoft.com/en-us/magazine/mt763237.aspx.
20
21 std::wstring Utf8ToUtf16(const char* utf8, size_t utf8Len) {
22 std::wstring utf16{};
23
24 // A small optimization.
25 if (utf8Len == 0) {
26 return utf16;
27 }
28
29 // Extra parentheses needed here to prevent expanding max as a
30 // Windows-specific preprocessor macro.
31 if (utf8Len > static_cast<size_t>((std::numeric_limits<int>::max)())) {
32 throw std::overflow_error("Length of input string to Utf8ToUtf16() must fit into an int.");
33 }
34
35 const int utf8Length = static_cast<int>(utf8Len);
36
37 // We do not specify MB_ERR_INVALID_CHARS here, which means that invalid UTF-8
38 // characters are replaced with U+FFFD.
39 constexpr DWORD flags = 0;
40
41 const int utf16Length = ::MultiByteToWideChar(
42 CP_UTF8, // Source string is in UTF-8.
43 flags, // Conversion flags.
44 utf8, // Source UTF-8 string pointer.
45 utf8Length, // Length of the source UTF-8 string, in chars.
46 nullptr, // Do not convert during this step, instead, request the size
47 0 // of the destination buffer, in wchar_ts, excluding the
48 // null termination character.
49 );
50
51 if (utf16Length == 0) {
52 throw UnicodeConversionException(
53 "Cannot get result string length when converting from UTF-8 to UTF-16 "
54 "(MultiByteToWideChar failed).",
55 GetLastError());
56 }
57
58 // Note that because the length of the input UTF-8 string was explicitly
59 // passed to MultiByteToWideChar (instead of just passing -1 and asking
60 // MultiByteToWideChar to scan the whole input string until a null terminator
61 // is found), MultiByteToWideChar won't add an additional null terminator to
62 // the result string. Therefore, there's no need to invoke
63 // std::wstring::resize with a "utf16Length + 1" value.
64 utf16.resize(utf16Length);
65
66 // Convert from UTF-8 to UTF-16
67 // Note that MultiByteToWideChar converts the UTF-8 BOM into the UTF-16BE BOM.
68 // So we do not have to do anything extra here to ensure correct BOM behavior.
69 int result = ::MultiByteToWideChar(
70 CP_UTF8, // Source string is in UTF-8.
71 flags, // Conversion flags.
72 utf8, // Source UTF-8 string pointer.
73 utf8Length, // Length of source UTF-8 string, in chars.
74 &utf16[0], // Pointer to destination buffer. This is fine because the
75 // the C++11 standard specifies that the elements of a
76 // std::basic_string are stored continuously.
77 utf16Length // Size of destination buffer, in wchar_ts.
78 );
79
80 if (result == 0) {
81 throw UnicodeConversionException(
82 "Cannot convert from UTF-8 to UTF-16 (MultiByteToWideChar failed).", GetLastError());
83 }
84
85 return utf16;
86 }
87
88 std::wstring Utf8ToUtf16(const char* utf8) {
89 return Utf8ToUtf16(utf8, strlen(utf8));
90 }
91
92 std::wstring Utf8ToUtf16(const std::string& utf8) {
93 return Utf8ToUtf16(utf8.c_str(), utf8.length());
94 }
95
96#if _HAS_CXX17
97 std::wstring Utf8ToUtf16(const std::string_view& utf8) {
98 return Utf8ToUtf16(utf8.data(), utf8.length());
99 }
100#endif
101
102 std::string Utf16ToUtf8(const wchar_t* utf16, size_t utf16Len) {
103 std::string utf8{};
104
105 // A small optimization.
106 if (utf16Len == 0) {
107 return utf8;
108 }
109
110 // Extra parentheses needed here to prevent expanding max as a
111 // Windows-specific preprocessor macro.
112 if (utf16Len > static_cast<size_t>((std::numeric_limits<int>::max)())) {
113 throw std::overflow_error("Length of input string to Utf16ToUtf8() must fit into an int.");
114 }
115
116 const int utf16Length = static_cast<int>(utf16Len);
117
118 // We do not specify WC_ERR_INVALID_CHARS here, which means that invalid
119 // UTF-16 characters are replaced with U+FFFD.
120 constexpr DWORD flags = 0;
121
122 const int utf8Length = ::WideCharToMultiByte(
123 CP_UTF8, // Destination string is in UTF-8.
124 flags, // Conversion flags.
125 utf16, // Source UTF-16 string pointer.
126 utf16Length, // Length of the source UTF-16 string, in wchar_ts.
127 nullptr, // Do not convert during this step, instead, request the size
128 0, // of the destination buffer, in chars, excluding the
129 // null termination character.
130 nullptr, // WideCharToMultiByte requires the last two parameters to be
131 nullptr // nullptrs when converting to UTF-8.
132 );
133
134 if (utf8Length == 0) {
135 throw UnicodeConversionException(
136 "Cannot get result string length when converting from UTF-16 to UTF-8 "
137 "(WideCharToMultiByte failed).",
138 GetLastError());
139 }
140
141 // Note that because the length of the input UTF-16 string was explicitly
142 // passed to WideCharToMultiByte (instead of just passing -1 and asking
143 // WideCharToMultiByte to scan the whole input string until a null terminator
144 // is found), WideCharToMultiByte won't add an additional null terminator to
145 // the result string. Therefore, there's no need to invoke
146 // std::string::resize with a "utf8Length + 1" value.
147 utf8.resize(utf8Length);
148
149 // Convert from UTF-8 to UTF-16
150 // Note that MultiByteToWideChar converts the UTF-8 BOM into the UTF-16BE BOM.
151 // So we do not have to do anything extra here to ensure correct BOM behavior.
152 int result = ::WideCharToMultiByte(
153 CP_UTF8, // Destination string is in UTF-8.
154 flags, // Conversion flags.
155 utf16, // Source UTF-16 string pointer.
156 utf16Length, // Length of the source UTF-16 string, in wchar_ts.
157 &utf8[0], // Pointer to destination buffer. This is fine because the
158 // the C++11 standard specifies that the elements of a
159 // std::basic_string are stored continuously.
160 utf8Length, // Size of destination buffer, in chars.
161 nullptr, // WideCharToMultiByte requires the last two parameters to be
162 nullptr // nullptrs when converting to UTF-8.
163 );
164
165 if (result == 0) {
166 throw UnicodeConversionException(
167 "Cannot convert from UTF-16 to UTF-8 (WideCharToMultiByte failed).", GetLastError());
168 }
169
170 return utf8;
171 }
172
173 std::string Utf16ToUtf8(const char16_t* utf16, size_t utf16Len) {
174 return Utf16ToUtf8(Utilities::CheckedReinterpretCast<const wchar_t*>(utf16), utf16Len);
175 }
176
177 std::string Utf16ToUtf8(const wchar_t* utf16) {
178 return Utf16ToUtf8(utf16, wcslen(utf16));
179 }
180
181 std::string Utf16ToUtf8(const char16_t* utf16) {
182 return Utf16ToUtf8(utf16, std::char_traits<char16_t>::length(utf16));
183 }
184
185 std::string Utf16ToUtf8(const std::wstring& utf16) {
186 return Utf16ToUtf8(utf16.c_str(), utf16.length());
187 }
188
189 std::string Utf16ToUtf8(const std::u16string& utf16) {
190 return Utf16ToUtf8(Utilities::CheckedReinterpretCast<const wchar_t*>(utf16.c_str()), utf16.length());
191 }
192
193#if _HAS_CXX17
194 std::string Utf16ToUtf8(const std::wstring_view& utf16) {
195 return Utf16ToUtf8(utf16.data(), utf16.length());
196 }
197
198 std::string Utf16ToUtf8(const std::u16string_view& utf16) {
199 return Utf16ToUtf8(Utilities::CheckedReinterpretCast<const wchar_t*>(utf16.data()), utf16.length());
200 }
201#endif
202
203} // namespace Microsoft::Common::Unicode