1 | // Copyright 2017-2022 @polkadot/util authors & contributors
|
2 | // SPDX-License-Identifier: Apache-2.0
|
3 | // Adapted from https://github.com/JulienPalard/is_utf8/blob/master/is_utf8.c
|
4 | import { u8aToU8a } from "../u8a/toU8a.js";
|
5 | import { isString } from "./string.js";
|
/**
 * @name isUtf8
 * @summary Tests if the input is valid Utf8
 * @description
 * Walks the bytes of the supplied string or Uint8Array and reports whether
 * every byte belongs to a well-formed UTF-8 sequence. Overlong forms
 * (C0, C1, E0 80..9F, F0 80..8F), surrogate code points (ED A0..BF) and
 * anything past F4 8F are rejected, as are sequences cut short by the end
 * of the input.
 *
 * A falsy input is not scanned at all; the answer is whatever `isString`
 * reports for it (presumably `true` only for the empty string).
 *
 * @param value The string or byte data to check.
 * @returns `true` when all bytes form valid UTF-8, `false` otherwise.
 */
export function isUtf8(value) {
  if (!value) {
    return isString(value);
  }

  const bytes = u8aToU8a(value);
  const total = bytes.length;
  let pos = 0;

  while (pos < total) {
    const lead = bytes[pos];

    // 00..7F: single-byte sequence, nothing further to validate
    if (lead <= 0x7F) {
      pos += 1;
      continue;
    }

    // Classify the lead byte: how long is the sequence, and which range is
    // permitted for the 2nd byte. Only the 2nd byte ever has a narrowed
    // range; every later byte must lie in 80..BF.
    let width;
    let secondMin = 0x80;
    let secondMax = 0xBF;

    if (lead >= 0xC2 && lead <= 0xDF) {
      // C2..DF 80..BF
      width = 2;
    } else if (lead === 0xE0) {
      // E0 A0..BF 80..BF
      width = 3;
      secondMin = 0xA0;
    } else if (lead >= 0xE1 && lead <= 0xEF) {
      // E1..EC 80..BF 80..BF
      // ED     80..9F 80..BF
      // EE..EF 80..BF 80..BF
      width = 3;

      if (lead === 0xED) {
        secondMax = 0x9F;
      }
    } else if (lead === 0xF0) {
      // F0 90..BF 80..BF 80..BF
      width = 4;
      secondMin = 0x90;
    } else if (lead >= 0xF1 && lead <= 0xF3) {
      // F1..F3 80..BF 80..BF 80..BF
      width = 4;
    } else if (lead === 0xF4) {
      // F4 80..8F 80..BF 80..BF
      width = 4;
      secondMax = 0x8F;
    } else {
      // Lead byte outside 00..7F and C2..F4
      return false;
    }

    // The sequence must fit completely inside the input
    if (pos + width > total) {
      return false;
    }

    const second = bytes[pos + 1];

    if (second < secondMin || second > secondMax) {
      return false;
    }

    // 3rd and 4th bytes (when present) are plain continuation bytes
    for (let offset = 2; offset < width; offset++) {
      const trailing = bytes[pos + offset];

      if (trailing < 0x80 || trailing > 0xBF) {
        return false;
      }
    }

    pos += width;
  }

  return true;
}
\ | No newline at end of file |