VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 98103

Last change on this file since 98103 was 98103, checked in by vboxsync, 23 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 70.8 KB
Line 
1/* $Id: utf-8.cpp 98103 2023-01-17 14:15:46Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#include <iprt/string.h>
42#include <iprt/latin1.h>
43#include "internal/iprt.h"
44
45#include <iprt/uni.h>
46#include <iprt/asm.h>
47#include <iprt/alloc.h>
48#include <iprt/assert.h>
49#include <iprt/err.h>
50#include "internal/string.h"
51
52
53
54/**
55 * Get get length in code points of a UTF-8 encoded string.
56 * The string is validated while doing this.
57 *
58 * @returns IPRT status code.
59 * @param psz Pointer to the UTF-8 string.
60 * @param cch The max length of the string. (btw cch = cb)
61 * Use RTSTR_MAX if all of the string is to be examined.
62 * @param pcuc Where to store the length in unicode code points.
63 * @param pcchActual Where to store the actual size of the UTF-8 string
64 * on success (cch = cb again). Optional.
65 */
66DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
67{
68 const unsigned char *puch = (const unsigned char *)psz;
69 size_t cCodePoints = 0;
70 while (cch > 0)
71 {
72 const unsigned char uch = *puch;
73 if (!uch)
74 break;
75 if (uch & RT_BIT(7))
76 {
77 /* figure sequence length and validate the first byte */
78/** @todo RT_USE_RTC_3629 */
79 unsigned cb;
80 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
81 cb = 2;
82 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
83 cb = 3;
84 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
85 cb = 4;
86 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
87 cb = 5;
88 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
89 cb = 6;
90 else
91 {
92 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
93 return VERR_INVALID_UTF8_ENCODING;
94 }
95
96 /* check length */
97 if (cb > cch)
98 {
99 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
100 return VERR_INVALID_UTF8_ENCODING;
101 }
102
103 /* validate the rest */
104 switch (cb)
105 {
106 case 6:
107 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
108 RT_FALL_THRU();
109 case 5:
110 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
111 RT_FALL_THRU();
112 case 4:
113 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
114 RT_FALL_THRU();
115 case 3:
116 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
117 RT_FALL_THRU();
118 case 2:
119 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 }
122
123 /* validate the code point. */
124 RTUNICP uc;
125 switch (cb)
126 {
127 case 6:
128 uc = (puch[5] & 0x3f)
129 | ((RTUNICP)(puch[4] & 0x3f) << 6)
130 | ((RTUNICP)(puch[3] & 0x3f) << 12)
131 | ((RTUNICP)(puch[2] & 0x3f) << 18)
132 | ((RTUNICP)(puch[1] & 0x3f) << 24)
133 | ((RTUNICP)(uch & 0x01) << 30);
134 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
135 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136 break;
137 case 5:
138 uc = (puch[4] & 0x3f)
139 | ((RTUNICP)(puch[3] & 0x3f) << 6)
140 | ((RTUNICP)(puch[2] & 0x3f) << 12)
141 | ((RTUNICP)(puch[1] & 0x3f) << 18)
142 | ((RTUNICP)(uch & 0x03) << 24);
143 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
144 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
145 break;
146 case 4:
147 uc = (puch[3] & 0x3f)
148 | ((RTUNICP)(puch[2] & 0x3f) << 6)
149 | ((RTUNICP)(puch[1] & 0x3f) << 12)
150 | ((RTUNICP)(uch & 0x07) << 18);
151 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 case 3:
155 uc = (puch[2] & 0x3f)
156 | ((RTUNICP)(puch[1] & 0x3f) << 6)
157 | ((RTUNICP)(uch & 0x0f) << 12);
158 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
159 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
160 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
161 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
162 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
163 break;
164 case 2:
165 uc = (puch[1] & 0x3f)
166 | ((RTUNICP)(uch & 0x1f) << 6);
167 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
168 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
169 break;
170 }
171
172 /* advance */
173 cch -= cb;
174 puch += cb;
175 }
176 else
177 {
178 /* one ASCII byte */
179 puch++;
180 cch--;
181 }
182 cCodePoints++;
183 }
184
185 /* done */
186 *pcuc = cCodePoints;
187 if (pcchActual)
188 *pcchActual = puch - (unsigned char const *)psz;
189 return VINF_SUCCESS;
190}
191
192
193/**
194 * Decodes and UTF-8 string into an array of unicode code point.
195 *
196 * Since we know the input is valid, we do *not* perform encoding or length checks.
197 *
198 * @returns iprt status code.
199 * @param psz The UTF-8 string to recode. This is a valid encoding.
200 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
201 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
202 * @param paCps Where to store the code points array.
203 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
204 */
205static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
206{
207 int rc = VINF_SUCCESS;
208 const unsigned char *puch = (const unsigned char *)psz;
209 PRTUNICP pCp = paCps;
210 while (cch > 0)
211 {
212 /* read the next char and check for terminator. */
213 const unsigned char uch = *puch;
214 if (uch)
215 { /* we only break once, so consider this the likely branch. */ }
216 else
217 break;
218
219 /* check for output overflow */
220 if (RT_LIKELY(cCps >= 1))
221 { /* likely */ }
222 else
223 {
224 rc = VERR_BUFFER_OVERFLOW;
225 break;
226 }
227 cCps--;
228
229 /* decode and recode the code point */
230 if (!(uch & RT_BIT(7)))
231 {
232 *pCp++ = uch;
233 puch++;
234 cch--;
235 }
236#ifdef RT_STRICT
237 else if (!(uch & RT_BIT(6)))
238 AssertMsgFailed(("Internal error!\n"));
239#endif
240 else if (!(uch & RT_BIT(5)))
241 {
242 *pCp++ = (puch[1] & 0x3f)
243 | ((uint16_t)(uch & 0x1f) << 6);
244 puch += 2;
245 cch -= 2;
246 }
247 else if (!(uch & RT_BIT(4)))
248 {
249 *pCp++ = (puch[2] & 0x3f)
250 | ((uint16_t)(puch[1] & 0x3f) << 6)
251 | ((uint16_t)(uch & 0x0f) << 12);
252 puch += 3;
253 cch -= 3;
254 }
255 else if (!(uch & RT_BIT(3)))
256 {
257 *pCp++ = (puch[3] & 0x3f)
258 | ((RTUNICP)(puch[2] & 0x3f) << 6)
259 | ((RTUNICP)(puch[1] & 0x3f) << 12)
260 | ((RTUNICP)(uch & 0x07) << 18);
261 puch += 4;
262 cch -= 4;
263 }
264 else if (!(uch & RT_BIT(2)))
265 {
266 *pCp++ = (puch[4] & 0x3f)
267 | ((RTUNICP)(puch[3] & 0x3f) << 6)
268 | ((RTUNICP)(puch[2] & 0x3f) << 12)
269 | ((RTUNICP)(puch[1] & 0x3f) << 18)
270 | ((RTUNICP)(uch & 0x03) << 24);
271 puch += 5;
272 cch -= 6;
273 }
274 else
275 {
276 Assert(!(uch & RT_BIT(1)));
277 *pCp++ = (puch[5] & 0x3f)
278 | ((RTUNICP)(puch[4] & 0x3f) << 6)
279 | ((RTUNICP)(puch[3] & 0x3f) << 12)
280 | ((RTUNICP)(puch[2] & 0x3f) << 18)
281 | ((RTUNICP)(puch[1] & 0x3f) << 24)
282 | ((RTUNICP)(uch & 0x01) << 30);
283 puch += 6;
284 cch -= 6;
285 }
286 }
287
288 /* done */
289 *pCp = 0;
290 return rc;
291}
292
293
294RTDECL(size_t) RTStrUniLen(const char *psz)
295{
296 size_t cCodePoints;
297 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
298 return RT_SUCCESS(rc) ? cCodePoints : 0;
299}
300RT_EXPORT_SYMBOL(RTStrUniLen);
301
302
303RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
304{
305 size_t cCodePoints;
306 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
307 if (pcCps)
308 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
309 return rc;
310}
311RT_EXPORT_SYMBOL(RTStrUniLenEx);
312
313
314RTDECL(int) RTStrValidateEncoding(const char *psz)
315{
316 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
317}
318RT_EXPORT_SYMBOL(RTStrValidateEncoding);
319
320
321RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
322{
323 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
324 VERR_INVALID_PARAMETER);
325 AssertPtr(psz);
326
327 /*
328 * Use rtUtf8Length for the job.
329 */
330 size_t cchActual;
331 size_t cCpsIgnored;
332 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
333 if (RT_SUCCESS(rc))
334 {
335 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
336 {
337 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
338 cchActual++;
339 if (cchActual == cch)
340 rc = VINF_SUCCESS;
341 else if (cchActual < cch)
342 rc = VERR_BUFFER_UNDERFLOW;
343 else
344 rc = VERR_BUFFER_OVERFLOW;
345 }
346 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
347 && cchActual >= cch)
348 rc = VERR_BUFFER_OVERFLOW;
349 }
350 return rc;
351}
352RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
353
354
355RTDECL(bool) RTStrIsValidEncoding(const char *psz)
356{
357 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
358 return RT_SUCCESS(rc);
359}
360RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
361
362
363RTDECL(size_t) RTStrPurgeEncoding(char *psz)
364{
365 size_t cErrors = 0;
366 for (;;)
367 {
368 RTUNICP Cp;
369 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
370 if (RT_SUCCESS(rc))
371 {
372 if (!Cp)
373 break;
374 }
375 else
376 {
377 psz[-1] = '?';
378 cErrors++;
379 }
380 }
381 return cErrors;
382}
383RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
384
385
386/**
387 * Helper for RTStrPurgeComplementSet.
388 *
389 * @returns true if @a Cp is valid, false if not.
390 * @param Cp The code point to validate.
391 * @param puszValidPairs Pair of valid code point sets.
392 * @param cValidPairs Number of pairs.
393 */
394DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
395{
396 while (cValidPairs-- > 0)
397 {
398 if ( Cp >= puszValidPairs[0]
399 && Cp <= puszValidPairs[1])
400 return true;
401 puszValidPairs += 2;
402 }
403 return false;
404}
405
406
407RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
408{
409 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
410
411 /*
412 * Calc valid pairs and check that we've got an even number.
413 */
414 uint32_t cValidPairs = 0;
415 while (puszValidPairs[cValidPairs * 2])
416 {
417 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
418 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
419 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
420 cValidPairs++;
421 }
422
423 /*
424 * Do the replacing.
425 */
426 ssize_t cReplacements = 0;
427 for (;;)
428 {
429 char *pszCur = psz;
430 RTUNICP Cp;
431 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
432 if (RT_SUCCESS(rc))
433 {
434 if (Cp)
435 {
436 if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
437 {
438 for (; pszCur != psz; ++pszCur)
439 *pszCur = chReplacement;
440 ++cReplacements;
441 }
442 }
443 else
444 break;
445 }
446 else
447 return -1;
448 }
449 return cReplacements;
450}
451RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
452
453
454RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
455{
456 /*
457 * Validate input.
458 */
459 AssertPtr(pszString);
460 AssertPtr(ppaCps);
461 *ppaCps = NULL;
462
463 /*
464 * Validate the UTF-8 input and count its code points.
465 */
466 size_t cCps;
467 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
468 if (RT_SUCCESS(rc))
469 {
470 /*
471 * Allocate buffer.
472 */
473 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
474 if (paCps)
475 {
476 /*
477 * Decode the string.
478 */
479 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
480 if (RT_SUCCESS(rc))
481 {
482 *ppaCps = paCps;
483 return rc;
484 }
485 RTMemFree(paCps);
486 }
487 else
488 rc = VERR_NO_CODE_POINT_MEMORY;
489 }
490 return rc;
491}
492RT_EXPORT_SYMBOL(RTStrToUni);
493
494
495RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
496{
497 /*
498 * Validate input.
499 */
500 AssertPtr(pszString);
501 AssertPtr(ppaCps);
502 AssertPtrNull(pcCps);
503
504 /*
505 * Validate the UTF-8 input and count the code points.
506 */
507 size_t cCpsResult;
508 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
509 if (RT_SUCCESS(rc))
510 {
511 if (pcCps)
512 *pcCps = cCpsResult;
513
514 /*
515 * Check buffer size / Allocate buffer.
516 */
517 bool fShouldFree;
518 PRTUNICP paCpsResult;
519 if (cCps > 0 && *ppaCps)
520 {
521 fShouldFree = false;
522 if (cCps <= cCpsResult)
523 return VERR_BUFFER_OVERFLOW;
524 paCpsResult = *ppaCps;
525 }
526 else
527 {
528 *ppaCps = NULL;
529 fShouldFree = true;
530 cCps = RT_MAX(cCpsResult + 1, cCps);
531 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
532 }
533 if (paCpsResult)
534 {
535 /*
536 * Encode the UTF-16 string.
537 */
538 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
539 if (RT_SUCCESS(rc))
540 {
541 *ppaCps = paCpsResult;
542 return rc;
543 }
544 if (fShouldFree)
545 RTMemFree(paCpsResult);
546 }
547 else
548 rc = VERR_NO_CODE_POINT_MEMORY;
549 }
550 return rc;
551}
552RT_EXPORT_SYMBOL(RTStrToUniEx);
553
554
555/**
556 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
557 *
558 * @returns IPRT status code.
559 * @param psz Pointer to the UTF-8 string.
560 * @param cch The max length of the string. (btw cch = cb)
561 * @param pcwc Where to store the length of the UTF-16 string as a number
562 * of RTUTF16 characters.
563 * @sa rtUtf8CalcUtf16Length
564 */
565static int rtUtf8CalcUtf16LengthN(const char *psz, size_t cch, size_t *pcwc)
566{
567 const unsigned char *puch = (const unsigned char *)psz;
568 size_t cwc = 0;
569 while (cch > 0)
570 {
571 const unsigned char uch = *puch;
572 if (!(uch & RT_BIT(7)))
573 {
574 /* one ASCII byte */
575 if (uch)
576 {
577 cwc++;
578 puch++;
579 cch--;
580 }
581 else
582 break;
583 }
584 else
585 {
586 /*
587 * Multibyte sequence is more complicated when we have length
588 * restrictions on the input.
589 */
590 /* figure sequence length and validate the first byte */
591 unsigned cb;
592 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
593 cb = 2;
594 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
595 cb = 3;
596 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
597 cb = 4;
598 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
599 cb = 5;
600 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
601 cb = 6;
602 else
603 {
604 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
605 return VERR_INVALID_UTF8_ENCODING;
606 }
607
608 /* check length */
609 if (cb > cch)
610 {
611 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
612 return VERR_INVALID_UTF8_ENCODING;
613 }
614
615 /* validate the rest */
616 switch (cb)
617 {
618 case 6:
619 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
620 RT_FALL_THRU();
621 case 5:
622 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
623 RT_FALL_THRU();
624 case 4:
625 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
626 RT_FALL_THRU();
627 case 3:
628 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
629 RT_FALL_THRU();
630 case 2:
631 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
632 break;
633 }
634
635 /* validate the code point. */
636 RTUNICP uc;
637 switch (cb)
638 {
639 case 6:
640 uc = (puch[5] & 0x3f)
641 | ((RTUNICP)(puch[4] & 0x3f) << 6)
642 | ((RTUNICP)(puch[3] & 0x3f) << 12)
643 | ((RTUNICP)(puch[2] & 0x3f) << 18)
644 | ((RTUNICP)(puch[1] & 0x3f) << 24)
645 | ((RTUNICP)(uch & 0x01) << 30);
646 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
647 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
648 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
649 return VERR_CANT_RECODE_AS_UTF16;
650 case 5:
651 uc = (puch[4] & 0x3f)
652 | ((RTUNICP)(puch[3] & 0x3f) << 6)
653 | ((RTUNICP)(puch[2] & 0x3f) << 12)
654 | ((RTUNICP)(puch[1] & 0x3f) << 18)
655 | ((RTUNICP)(uch & 0x03) << 24);
656 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
657 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
658 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
659 return VERR_CANT_RECODE_AS_UTF16;
660 case 4:
661 uc = (puch[3] & 0x3f)
662 | ((RTUNICP)(puch[2] & 0x3f) << 6)
663 | ((RTUNICP)(puch[1] & 0x3f) << 12)
664 | ((RTUNICP)(uch & 0x07) << 18);
665 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
666 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
667 RTStrAssertMsgReturn(uc <= 0x0010ffff,
668 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
669 cwc++;
670 break;
671 case 3:
672 uc = (puch[2] & 0x3f)
673 | ((RTUNICP)(puch[1] & 0x3f) << 6)
674 | ((RTUNICP)(uch & 0x0f) << 12);
675 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
676 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
677 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
678 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
679 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
680 break;
681 case 2:
682 uc = (puch[1] & 0x3f)
683 | ((RTUNICP)(uch & 0x1f) << 6);
684 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
685 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
686 break;
687 }
688
689 /* advance */
690 cch -= cb;
691 puch += cb;
692 cwc++;
693 }
694 }
695
696 /* done */
697 *pcwc = cwc;
698 return VINF_SUCCESS;
699}
700
701
702/**
703 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
704 *
705 * @returns IPRT status code.
706 * @param psz Pointer to the UTF-8 string.
707 * @param pcwc Where to store the length of the UTF-16 string as a number
708 * of RTUTF16 characters.
709 * @sa rtUtf8CalcUtf16LengthN
710 */
711static int rtUtf8CalcUtf16Length(const char *psz, size_t *pcwc)
712{
713 const unsigned char *puch = (const unsigned char *)psz;
714 size_t cwc = 0;
715 for (;;)
716 {
717 const unsigned char uch = *puch;
718 if (!(uch & RT_BIT(7)))
719 {
720 /* one ASCII byte */
721 if (uch)
722 {
723 cwc++;
724 puch++;
725 }
726 else
727 break;
728 }
729 else
730 {
731 /*
732 * Figure sequence length, implicitly validate the first byte.
733 * Then validate the additional bytes.
734 * Finally validate the code point.
735 */
736 unsigned cb;
737 RTUNICP uc;
738 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
739 {
740 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
741 uc = (puch[1] & 0x3f)
742 | ((RTUNICP)(uch & 0x1f) << 6);
743 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
744 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
745 cb = 2;
746 }
747 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
748 {
749 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
750 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
751 uc = (puch[2] & 0x3f)
752 | ((RTUNICP)(puch[1] & 0x3f) << 6)
753 | ((RTUNICP)(uch & 0x0f) << 12);
754 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
755 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
756 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
757 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
758 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
759 cb = 3;
760 }
761 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
762 {
763 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
764 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
765 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
766 uc = (puch[3] & 0x3f)
767 | ((RTUNICP)(puch[2] & 0x3f) << 6)
768 | ((RTUNICP)(puch[1] & 0x3f) << 12)
769 | ((RTUNICP)(uch & 0x07) << 18);
770 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
771 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
772 RTStrAssertMsgReturn(uc <= 0x0010ffff,
773 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
774 cwc++;
775 cb = 4;
776 }
777 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
778 {
779 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
780 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
781 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
782 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
783 uc = (puch[4] & 0x3f)
784 | ((RTUNICP)(puch[3] & 0x3f) << 6)
785 | ((RTUNICP)(puch[2] & 0x3f) << 12)
786 | ((RTUNICP)(puch[1] & 0x3f) << 18)
787 | ((RTUNICP)(uch & 0x03) << 24);
788 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
789 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
790 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
791 return VERR_CANT_RECODE_AS_UTF16;
792 //cb = 5;
793 }
794 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
795 {
796 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
797 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
798 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
799 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
800 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
801 uc = (puch[5] & 0x3f)
802 | ((RTUNICP)(puch[4] & 0x3f) << 6)
803 | ((RTUNICP)(puch[3] & 0x3f) << 12)
804 | ((RTUNICP)(puch[2] & 0x3f) << 18)
805 | ((RTUNICP)(puch[1] & 0x3f) << 24)
806 | ((RTUNICP)(uch & 0x01) << 30);
807 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
808 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
809 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
810 return VERR_CANT_RECODE_AS_UTF16;
811 //cb = 6;
812 }
813 else
814 {
815 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
816 return VERR_INVALID_UTF8_ENCODING;
817 }
818
819 /* advance */
820 puch += cb;
821 cwc++;
822 }
823 }
824
825 /* done */
826 *pcwc = cwc;
827 return VINF_SUCCESS;
828}
829
830
831
832/**
833 * Recodes a valid UTF-8 string as UTF-16.
834 *
835 * Since we know the input is valid, we do *not* perform encoding or length checks.
836 *
837 * @returns iprt status code.
838 * @param psz The UTF-8 string to recode. This is a valid encoding.
839 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
840 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
841 * @param pwsz Where to store the UTF-16 string.
842 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
843 *
844 * @note rtUtf8RecodeAsUtf16Big is a duplicate with RT_H2BE_U16 applied.
845 */
846static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
847{
848 int rc = VINF_SUCCESS;
849 const unsigned char *puch = (const unsigned char *)psz;
850 PRTUTF16 pwc = pwsz;
851 while (cch > 0)
852 {
853 /* read the next char and check for terminator. */
854 const unsigned char uch = *puch;
855 if (uch)
856 { /* we only break once, so consider this the likely branch. */ }
857 else
858 break;
859
860 /* check for output overflow */
861 if (RT_LIKELY(cwc >= 1))
862 { /* likely */ }
863 else
864 {
865 rc = VERR_BUFFER_OVERFLOW;
866 break;
867 }
868 cwc--;
869
870 /* decode and recode the code point */
871 if (!(uch & RT_BIT(7)))
872 {
873 *pwc++ = uch;
874 puch++;
875 cch--;
876 }
877 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
878 {
879 uint16_t uc = (puch[1] & 0x3f)
880 | ((uint16_t)(uch & 0x1f) << 6);
881 *pwc++ = uc;
882 puch += 2;
883 cch -= 2;
884 }
885 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
886 {
887 uint16_t uc = (puch[2] & 0x3f)
888 | ((uint16_t)(puch[1] & 0x3f) << 6)
889 | ((uint16_t)(uch & 0x0f) << 12);
890 *pwc++ = uc;
891 puch += 3;
892 cch -= 3;
893 }
894 else
895 {
896 /* generate surrogate pair */
897 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
898 RTUNICP uc = (puch[3] & 0x3f)
899 | ((RTUNICP)(puch[2] & 0x3f) << 6)
900 | ((RTUNICP)(puch[1] & 0x3f) << 12)
901 | ((RTUNICP)(uch & 0x07) << 18);
902 if (RT_UNLIKELY(cwc < 1))
903 {
904 rc = VERR_BUFFER_OVERFLOW;
905 break;
906 }
907 cwc--;
908
909 uc -= 0x10000;
910 *pwc++ = 0xd800 | (uc >> 10);
911 *pwc++ = 0xdc00 | (uc & 0x3ff);
912 puch += 4;
913 cch -= 4;
914 }
915 }
916
917 /* done */
918 *pwc = '\0';
919 return rc;
920}
921
922
923/**
924 * Recodes a valid UTF-8 string as UTF-16BE.
925 *
926 * Since we know the input is valid, we do *not* perform encoding or length checks.
927 *
928 * @returns iprt status code.
929 * @param psz The UTF-8 string to recode. This is a valid encoding.
930 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
931 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
932 * @param pwsz Where to store the UTF-16BE string.
933 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
934 *
935 * @note This is a copy of rtUtf8RecodeAsUtf16 with RT_H2BE_U16 applied.
936 */
937static int rtUtf8RecodeAsUtf16Big(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
938{
939 int rc = VINF_SUCCESS;
940 const unsigned char *puch = (const unsigned char *)psz;
941 PRTUTF16 pwc = pwsz;
942 while (cch > 0)
943 {
944 /* read the next char and check for terminator. */
945 const unsigned char uch = *puch;
946 if (uch)
947 { /* we only break once, so consider this the likely branch. */ }
948 else
949 break;
950
951 /* check for output overflow */
952 if (RT_LIKELY(cwc >= 1))
953 { /* likely */ }
954 else
955 {
956 rc = VERR_BUFFER_OVERFLOW;
957 break;
958 }
959 cwc--;
960
961 /* decode and recode the code point */
962 if (!(uch & RT_BIT(7)))
963 {
964 *pwc++ = RT_H2BE_U16((RTUTF16)uch);
965 puch++;
966 cch--;
967 }
968 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
969 {
970 uint16_t uc = (puch[1] & 0x3f)
971 | ((uint16_t)(uch & 0x1f) << 6);
972 *pwc++ = RT_H2BE_U16(uc);
973 puch += 2;
974 cch -= 2;
975 }
976 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
977 {
978 uint16_t uc = (puch[2] & 0x3f)
979 | ((uint16_t)(puch[1] & 0x3f) << 6)
980 | ((uint16_t)(uch & 0x0f) << 12);
981 *pwc++ = RT_H2BE_U16(uc);
982 puch += 3;
983 cch -= 3;
984 }
985 else
986 {
987 /* generate surrogate pair */
988 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
989 RTUNICP uc = (puch[3] & 0x3f)
990 | ((RTUNICP)(puch[2] & 0x3f) << 6)
991 | ((RTUNICP)(puch[1] & 0x3f) << 12)
992 | ((RTUNICP)(uch & 0x07) << 18);
993 if (RT_UNLIKELY(cwc < 1))
994 {
995 rc = VERR_BUFFER_OVERFLOW;
996 break;
997 }
998 cwc--;
999
1000 uc -= 0x10000;
1001 *pwc++ = RT_H2BE_U16(0xd800 | (uc >> 10));
1002 *pwc++ = RT_H2BE_U16(0xdc00 | (uc & 0x3ff));
1003 puch += 4;
1004 cch -= 4;
1005 }
1006 }
1007
1008 /* done */
1009 *pwc = '\0';
1010 return rc;
1011}
1012
1013
1014RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1015{
1016 /*
1017 * Validate input.
1018 */
1019 AssertPtr(ppwszString);
1020 AssertPtr(pszString);
1021 *ppwszString = NULL;
1022
1023 /*
1024 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1025 */
1026 size_t cwc;
1027 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1028 if (RT_SUCCESS(rc))
1029 {
1030 /*
1031 * Allocate buffer.
1032 */
1033 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1034 if (pwsz)
1035 {
1036 /*
1037 * Encode the UTF-16 string.
1038 */
1039 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1040 if (RT_SUCCESS(rc))
1041 {
1042 *ppwszString = pwsz;
1043 return rc;
1044 }
1045 RTMemFree(pwsz);
1046 }
1047 else
1048 rc = VERR_NO_UTF16_MEMORY;
1049 }
1050 return rc;
1051}
1052RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
1053
1054
1055RTDECL(int) RTStrToUtf16BigTag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1056{
1057 /*
1058 * Validate input.
1059 */
1060 AssertPtr(ppwszString);
1061 AssertPtr(pszString);
1062 *ppwszString = NULL;
1063
1064 /*
1065 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1066 */
1067 size_t cwc;
1068 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1069 if (RT_SUCCESS(rc))
1070 {
1071 /*
1072 * Allocate buffer.
1073 */
1074 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1075 if (pwsz)
1076 {
1077 /*
1078 * Encode the UTF-16 string.
1079 */
1080 rc = rtUtf8RecodeAsUtf16Big(pszString, RTSTR_MAX, pwsz, cwc);
1081 if (RT_SUCCESS(rc))
1082 {
1083 *ppwszString = pwsz;
1084 return rc;
1085 }
1086 RTMemFree(pwsz);
1087 }
1088 else
1089 rc = VERR_NO_UTF16_MEMORY;
1090 }
1091 return rc;
1092}
1093RT_EXPORT_SYMBOL(RTStrToUtf16BigTag);
1094
1095
1096RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
1097 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1098{
1099 /*
1100 * Validate input.
1101 */
1102 AssertPtr(pszString);
1103 AssertPtr(ppwsz);
1104 AssertPtrNull(pcwc);
1105
1106 /*
1107 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1108 */
1109 size_t cwcResult;
1110 int rc;
1111 if (cchString != RTSTR_MAX)
1112 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1113 else
1114 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1115 if (RT_SUCCESS(rc))
1116 {
1117 if (pcwc)
1118 *pcwc = cwcResult;
1119
1120 /*
1121 * Check buffer size / Allocate buffer.
1122 */
1123 bool fShouldFree;
1124 PRTUTF16 pwszResult;
1125 if (cwc > 0 && *ppwsz)
1126 {
1127 fShouldFree = false;
1128 if (cwc <= cwcResult)
1129 return VERR_BUFFER_OVERFLOW;
1130 pwszResult = *ppwsz;
1131 }
1132 else
1133 {
1134 *ppwsz = NULL;
1135 fShouldFree = true;
1136 cwc = RT_MAX(cwcResult + 1, cwc);
1137 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1138 }
1139 if (pwszResult)
1140 {
1141 /*
1142 * Encode the UTF-16 string.
1143 */
1144 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1145 if (RT_SUCCESS(rc))
1146 {
1147 *ppwsz = pwszResult;
1148 return rc;
1149 }
1150 if (fShouldFree)
1151 RTMemFree(pwszResult);
1152 }
1153 else
1154 rc = VERR_NO_UTF16_MEMORY;
1155 }
1156 return rc;
1157}
1158RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1159
1160
1161RTDECL(int) RTStrToUtf16BigExTag(const char *pszString, size_t cchString,
1162 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1163{
1164 /*
1165 * Validate input.
1166 */
1167 AssertPtr(pszString);
1168 AssertPtr(ppwsz);
1169 AssertPtrNull(pcwc);
1170
1171 /*
1172 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1173 */
1174 size_t cwcResult;
1175 int rc;
1176 if (cchString != RTSTR_MAX)
1177 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1178 else
1179 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1180 if (RT_SUCCESS(rc))
1181 {
1182 if (pcwc)
1183 *pcwc = cwcResult;
1184
1185 /*
1186 * Check buffer size / Allocate buffer.
1187 */
1188 bool fShouldFree;
1189 PRTUTF16 pwszResult;
1190 if (cwc > 0 && *ppwsz)
1191 {
1192 fShouldFree = false;
1193 if (cwc <= cwcResult)
1194 return VERR_BUFFER_OVERFLOW;
1195 pwszResult = *ppwsz;
1196 }
1197 else
1198 {
1199 *ppwsz = NULL;
1200 fShouldFree = true;
1201 cwc = RT_MAX(cwcResult + 1, cwc);
1202 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1203 }
1204 if (pwszResult)
1205 {
1206 /*
1207 * Encode the UTF-16BE string.
1208 */
1209 rc = rtUtf8RecodeAsUtf16Big(pszString, cchString, pwszResult, cwc - 1);
1210 if (RT_SUCCESS(rc))
1211 {
1212 *ppwsz = pwszResult;
1213 return rc;
1214 }
1215 if (fShouldFree)
1216 RTMemFree(pwszResult);
1217 }
1218 else
1219 rc = VERR_NO_UTF16_MEMORY;
1220 }
1221 return rc;
1222}
1223RT_EXPORT_SYMBOL(RTStrToUtf16BigExTag);
1224
1225
1226RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1227{
1228 size_t cwc;
1229 int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1230 return RT_SUCCESS(rc) ? cwc : 0;
1231}
1232RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1233
1234
1235RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1236{
1237 size_t cwc;
1238 int rc;
1239 if (cch != RTSTR_MAX)
1240 rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1241 else
1242 rc = rtUtf8CalcUtf16Length(psz, &cwc);
1243 if (pcwc)
1244 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1245 return rc;
1246}
1247RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1248
1249
1250/**
1251 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
1252 *
1253 * @returns iprt status code.
1254 * @param psz The Latin-1 string.
1255 * @param cchIn The max length of the Latin-1 string to consider.
1256 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1257 */
1258static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
1259{
1260 size_t cch = 0;
1261 for (;;)
1262 {
1263 RTUNICP Cp;
1264 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1265 if (Cp == 0 || rc == VERR_END_OF_STRING)
1266 break;
1267 if (RT_FAILURE(rc))
1268 return rc;
1269 cch += RTStrCpSize(Cp); /* cannot fail */
1270 }
1271
1272 /* done */
1273 *pcch = cch;
1274 return VINF_SUCCESS;
1275}
1276
1277
1278/**
1279 * Recodes a Latin-1 string as UTF-8.
1280 *
1281 * @returns iprt status code.
1282 * @param pszIn The Latin-1 string.
1283 * @param cchIn The number of characters to process from psz. The recoding
1284 * will stop when cch or '\\0' is reached.
1285 * @param psz Where to store the UTF-8 string.
1286 * @param cch The size of the UTF-8 buffer, excluding the terminator.
1287 */
1288static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1289{
1290 int rc;
1291 for (;;)
1292 {
1293 RTUNICP Cp;
1294 size_t cchCp;
1295 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1296 if (Cp == 0 || RT_FAILURE(rc))
1297 break;
1298 cchCp = RTStrCpSize(Cp);
1299 if (RT_UNLIKELY(cch < cchCp))
1300 {
1301 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1302 rc = VERR_BUFFER_OVERFLOW;
1303 break;
1304 }
1305 cch -= cchCp;
1306 psz = RTStrPutCp(psz, Cp);
1307 }
1308
1309 /* done */
1310 if (rc == VERR_END_OF_STRING)
1311 rc = VINF_SUCCESS;
1312 *psz = '\0';
1313 return rc;
1314}
1315
1316
1317
1318RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
1319{
1320 /*
1321 * Validate input.
1322 */
1323 AssertPtr(ppszString);
1324 AssertPtr(pszString);
1325 *ppszString = NULL;
1326
1327 /*
1328 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1329 */
1330 size_t cch;
1331 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1332 if (RT_SUCCESS(rc))
1333 {
1334 /*
1335 * Allocate buffer and recode it.
1336 */
1337 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
1338 if (pszResult)
1339 {
1340 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1341 if (RT_SUCCESS(rc))
1342 {
1343 *ppszString = pszResult;
1344 return rc;
1345 }
1346
1347 RTMemFree(pszResult);
1348 }
1349 else
1350 rc = VERR_NO_STR_MEMORY;
1351 }
1352 return rc;
1353}
1354RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1355
1356
1357RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1358{
1359 /*
1360 * Validate input.
1361 */
1362 AssertPtr(pszString);
1363 AssertPtr(ppsz);
1364 AssertPtrNull(pcch);
1365
1366 /*
1367 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1368 */
1369 size_t cchResult;
1370 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1371 if (RT_SUCCESS(rc))
1372 {
1373 if (pcch)
1374 *pcch = cchResult;
1375
1376 /*
1377 * Check buffer size / Allocate buffer and recode it.
1378 */
1379 bool fShouldFree;
1380 char *pszResult;
1381 if (cch > 0 && *ppsz)
1382 {
1383 fShouldFree = false;
1384 if (RT_UNLIKELY(cch <= cchResult))
1385 return VERR_BUFFER_OVERFLOW;
1386 pszResult = *ppsz;
1387 }
1388 else
1389 {
1390 *ppsz = NULL;
1391 fShouldFree = true;
1392 cch = RT_MAX(cch, cchResult + 1);
1393 pszResult = (char *)RTStrAllocTag(cch, pszTag);
1394 }
1395 if (pszResult)
1396 {
1397 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1398 if (RT_SUCCESS(rc))
1399 {
1400 *ppsz = pszResult;
1401 return rc;
1402 }
1403
1404 if (fShouldFree)
1405 RTStrFree(pszResult);
1406 }
1407 else
1408 rc = VERR_NO_STR_MEMORY;
1409 }
1410 return rc;
1411}
1412RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1413
1414
1415RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1416{
1417 size_t cch;
1418 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1419 return RT_SUCCESS(rc) ? cch : 0;
1420}
1421RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1422
1423
1424RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1425{
1426 size_t cch;
1427 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1428 if (pcch)
1429 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1430 return rc;
1431}
1432RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1433
1434
1435/**
1436 * Calculates the Latin-1 length of a string, validating the encoding while
1437 * doing so.
1438 *
1439 * @returns IPRT status code.
1440 * @param psz Pointer to the UTF-8 string.
1441 * @param cchIn The max length of the string. (btw cch = cb)
1442 * Use RTSTR_MAX if all of the string is to be examined.
1443 * @param pcch Where to store the length of the Latin-1 string in bytes.
1444 */
1445static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1446{
1447 size_t cch = 0;
1448 for (;;)
1449 {
1450 RTUNICP Cp;
1451 size_t cchCp;
1452 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1453 if (Cp == 0 || rc == VERR_END_OF_STRING)
1454 break;
1455 if (RT_FAILURE(rc))
1456 return rc;
1457 cchCp = RTLatin1CpSize(Cp);
1458 if (cchCp == 0)
1459 return VERR_NO_TRANSLATION;
1460 cch += cchCp;
1461 }
1462
1463 /* done */
1464 *pcch = cch;
1465 return VINF_SUCCESS;
1466}
1467
1468
1469/**
1470 * Recodes a valid UTF-8 string as Latin-1.
1471 *
1472 * Since we know the input is valid, we do *not* perform encoding or length checks.
1473 *
1474 * @returns iprt status code.
1475 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1476 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1477 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1478 * @param psz Where to store the Latin-1 string.
1479 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1480 */
1481static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1482{
1483 int rc;
1484 for (;;)
1485 {
1486 RTUNICP Cp;
1487 size_t cchCp;
1488 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1489 if (Cp == 0 || RT_FAILURE(rc))
1490 break;
1491 cchCp = RTLatin1CpSize(Cp);
1492 if (RT_UNLIKELY(cch < cchCp))
1493 {
1494 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1495 rc = VERR_BUFFER_OVERFLOW;
1496 break;
1497 }
1498 cch -= cchCp;
1499 psz = RTLatin1PutCp(psz, Cp);
1500 }
1501
1502 /* done */
1503 if (rc == VERR_END_OF_STRING)
1504 rc = VINF_SUCCESS;
1505 *psz = '\0';
1506 return rc;
1507}
1508
1509
1510
1511RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1512{
1513 /*
1514 * Validate input.
1515 */
1516 AssertPtr(ppszString);
1517 AssertPtr(pszString);
1518 *ppszString = NULL;
1519
1520 /*
1521 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1522 */
1523 size_t cch;
1524 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1525 if (RT_SUCCESS(rc))
1526 {
1527 /*
1528 * Allocate buffer.
1529 */
1530 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1531 if (psz)
1532 {
1533 /*
1534 * Encode the UTF-16 string.
1535 */
1536 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1537 if (RT_SUCCESS(rc))
1538 {
1539 *ppszString = psz;
1540 return rc;
1541 }
1542 RTMemFree(psz);
1543 }
1544 else
1545 rc = VERR_NO_STR_MEMORY;
1546 }
1547 return rc;
1548}
1549RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1550
1551
1552RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1553 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1554{
1555 /*
1556 * Validate input.
1557 */
1558 AssertPtr(pszString);
1559 AssertPtr(ppsz);
1560 AssertPtrNull(pcch);
1561
1562 /*
1563 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1564 */
1565 size_t cchResult;
1566 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1567 if (RT_SUCCESS(rc))
1568 {
1569 if (pcch)
1570 *pcch = cchResult;
1571
1572 /*
1573 * Check buffer size / Allocate buffer.
1574 */
1575 bool fShouldFree;
1576 char *pszResult;
1577 if (cch > 0 && *ppsz)
1578 {
1579 fShouldFree = false;
1580 if (cch <= cchResult)
1581 return VERR_BUFFER_OVERFLOW;
1582 pszResult = *ppsz;
1583 }
1584 else
1585 {
1586 *ppsz = NULL;
1587 fShouldFree = true;
1588 cch = RT_MAX(cchResult + 1, cch);
1589 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1590 }
1591 if (pszResult)
1592 {
1593 /*
1594 * Encode the Latin-1 string.
1595 */
1596 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1597 if (RT_SUCCESS(rc))
1598 {
1599 *ppsz = pszResult;
1600 return rc;
1601 }
1602 if (fShouldFree)
1603 RTMemFree(pszResult);
1604 }
1605 else
1606 rc = VERR_NO_STR_MEMORY;
1607 }
1608 return rc;
1609}
1610RT_EXPORT_SYMBOL(RTStrToLatin1ExTag);
1611
1612
1613RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1614{
1615 size_t cch;
1616 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1617 return RT_SUCCESS(rc) ? cch : 0;
1618}
1619RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1620
1621
1622RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1623{
1624 size_t cch;
1625 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1626 if (pcch)
1627 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1628 return rc;
1629}
1630RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1631
1632
1633/**
1634 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1635 * @returns rc
1636 * @param ppsz The pointer to the string position point.
1637 * @param pCp Where to store RTUNICP_INVALID.
1638 * @param rc The iprt error code.
1639 */
1640static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1641{
1642 /*
1643 * Try find a valid encoding.
1644 */
1645 (*ppsz)++; /** @todo code this! */
1646 *pCp = RTUNICP_INVALID;
1647 return rc;
1648}
1649
1650
1651RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1652{
1653 RTUNICP Cp;
1654 RTStrGetCpExInternal(&psz, &Cp);
1655 return Cp;
1656}
1657RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1658
1659
1660RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1661{
1662 const unsigned char *puch = (const unsigned char *)*ppsz;
1663 const unsigned char uch = *puch;
1664 RTUNICP uc;
1665
1666 /* ASCII ? */
1667 if (!(uch & RT_BIT(7)))
1668 {
1669 uc = uch;
1670 puch++;
1671 }
1672 else if (uch & RT_BIT(6))
1673 {
1674 /* figure the length and validate the first octet. */
1675/** @todo RT_USE_RTC_3629 */
1676 unsigned cb;
1677 if (!(uch & RT_BIT(5)))
1678 cb = 2;
1679 else if (!(uch & RT_BIT(4)))
1680 cb = 3;
1681 else if (!(uch & RT_BIT(3)))
1682 cb = 4;
1683 else if (!(uch & RT_BIT(2)))
1684 cb = 5;
1685 else if (!(uch & RT_BIT(1)))
1686 cb = 6;
1687 else
1688 {
1689 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1690 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1691 }
1692
1693 /* validate the rest */
1694 switch (cb)
1695 {
1696 case 6:
1697 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1698 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1699 RT_FALL_THRU();
1700 case 5:
1701 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1702 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1703 RT_FALL_THRU();
1704 case 4:
1705 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1706 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1707 RT_FALL_THRU();
1708 case 3:
1709 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1710 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1711 RT_FALL_THRU();
1712 case 2:
1713 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1714 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1715 break;
1716 }
1717
1718 /* get and validate the code point. */
1719 switch (cb)
1720 {
1721 case 6:
1722 uc = (puch[5] & 0x3f)
1723 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1724 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1725 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1726 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1727 | ((RTUNICP)(uch & 0x01) << 30);
1728 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1729 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1730 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1731 break;
1732 case 5:
1733 uc = (puch[4] & 0x3f)
1734 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1735 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1736 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1737 | ((RTUNICP)(uch & 0x03) << 24);
1738 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1739 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1740 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1741 break;
1742 case 4:
1743 uc = (puch[3] & 0x3f)
1744 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1745 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1746 | ((RTUNICP)(uch & 0x07) << 18);
1747 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1748 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1749 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1750 break;
1751 case 3:
1752 uc = (puch[2] & 0x3f)
1753 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1754 | ((RTUNICP)(uch & 0x0f) << 12);
1755 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1756 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1757 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1758 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1759 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1760 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1761 break;
1762 case 2:
1763 uc = (puch[1] & 0x3f)
1764 | ((RTUNICP)(uch & 0x1f) << 6);
1765 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1766 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1767 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1768 break;
1769 default: /* impossible, but GCC is bitching. */
1770 uc = RTUNICP_INVALID;
1771 break;
1772 }
1773 puch += cb;
1774 }
1775 else
1776 {
1777 /* 6th bit is always set. */
1778 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1779 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1780 }
1781 *pCp = uc;
1782 *ppsz = (const char *)puch;
1783 return VINF_SUCCESS;
1784}
1785RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1786
1787
1788/**
1789 * Handle invalid encodings passed to RTStrGetCpNEx().
1790 * @returns rc
1791 * @param ppsz The pointer to the string position point.
1792 * @param pcch Pointer to the string length.
1793 * @param pCp Where to store RTUNICP_INVALID.
1794 * @param rc The iprt error code.
1795 */
1796static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1797{
1798 /*
1799 * Try find a valid encoding.
1800 */
1801 (*ppsz)++; /** @todo code this! */
1802 (*pcch)--;
1803 *pCp = RTUNICP_INVALID;
1804 return rc;
1805}
1806
1807
1808RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1809{
1810 const unsigned char *puch = (const unsigned char *)*ppsz;
1811 const unsigned char uch = *puch;
1812 size_t cch = *pcch;
1813 RTUNICP uc;
1814
1815 if (cch == 0)
1816 {
1817 *pCp = RTUNICP_INVALID;
1818 return VERR_END_OF_STRING;
1819 }
1820
1821 /* ASCII ? */
1822 if (!(uch & RT_BIT(7)))
1823 {
1824 uc = uch;
1825 puch++;
1826 cch--;
1827 }
1828 else if (uch & RT_BIT(6))
1829 {
1830 /* figure the length and validate the first octet. */
1831/** @todo RT_USE_RTC_3629 */
1832 unsigned cb;
1833 if (!(uch & RT_BIT(5)))
1834 cb = 2;
1835 else if (!(uch & RT_BIT(4)))
1836 cb = 3;
1837 else if (!(uch & RT_BIT(3)))
1838 cb = 4;
1839 else if (!(uch & RT_BIT(2)))
1840 cb = 5;
1841 else if (!(uch & RT_BIT(1)))
1842 cb = 6;
1843 else
1844 {
1845 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1846 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1847 }
1848
1849 if (cb > cch)
1850 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1851
1852 /* validate the rest */
1853 switch (cb)
1854 {
1855 case 6:
1856 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1857 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1858 RT_FALL_THRU();
1859 case 5:
1860 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1861 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1862 RT_FALL_THRU();
1863 case 4:
1864 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1865 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1866 RT_FALL_THRU();
1867 case 3:
1868 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1869 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1870 RT_FALL_THRU();
1871 case 2:
1872 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1873 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1874 break;
1875 }
1876
1877 /* get and validate the code point. */
1878 switch (cb)
1879 {
1880 case 6:
1881 uc = (puch[5] & 0x3f)
1882 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1883 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1884 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1885 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1886 | ((RTUNICP)(uch & 0x01) << 30);
1887 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1888 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1889 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1890 break;
1891 case 5:
1892 uc = (puch[4] & 0x3f)
1893 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1894 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1895 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1896 | ((RTUNICP)(uch & 0x03) << 24);
1897 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1898 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1899 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1900 break;
1901 case 4:
1902 uc = (puch[3] & 0x3f)
1903 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1904 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1905 | ((RTUNICP)(uch & 0x07) << 18);
1906 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1907 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1908 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1909 break;
1910 case 3:
1911 uc = (puch[2] & 0x3f)
1912 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1913 | ((RTUNICP)(uch & 0x0f) << 12);
1914 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1915 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1916 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1917 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1918 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1919 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1920 break;
1921 case 2:
1922 uc = (puch[1] & 0x3f)
1923 | ((RTUNICP)(uch & 0x1f) << 6);
1924 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1925 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1926 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1927 break;
1928 default: /* impossible, but GCC is bitching. */
1929 uc = RTUNICP_INVALID;
1930 break;
1931 }
1932 puch += cb;
1933 cch -= cb;
1934 }
1935 else
1936 {
1937 /* 6th bit is always set. */
1938 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1939 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1940 }
1941 *pCp = uc;
1942 *ppsz = (const char *)puch;
1943 (*pcch) = cch;
1944 return VINF_SUCCESS;
1945}
1946RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1947
1948
1949RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1950{
1951 unsigned char *puch = (unsigned char *)psz;
1952 if (uc < 0x80)
1953 *puch++ = (unsigned char )uc;
1954 else if (uc < 0x00000800)
1955 {
1956 *puch++ = 0xc0 | (uc >> 6);
1957 *puch++ = 0x80 | (uc & 0x3f);
1958 }
1959 else if (uc < 0x00010000)
1960 {
1961/** @todo RT_USE_RTC_3629 */
1962 if ( uc < 0x0000d8000
1963 || ( uc > 0x0000dfff
1964 && uc < 0x0000fffe))
1965 {
1966 *puch++ = 0xe0 | (uc >> 12);
1967 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1968 *puch++ = 0x80 | (uc & 0x3f);
1969 }
1970 else
1971 {
1972 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1973 *puch++ = 0x7f;
1974 }
1975 }
1976/** @todo RT_USE_RTC_3629 */
1977 else if (uc < 0x00200000)
1978 {
1979 *puch++ = 0xf0 | (uc >> 18);
1980 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1981 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1982 *puch++ = 0x80 | (uc & 0x3f);
1983 }
1984 else if (uc < 0x04000000)
1985 {
1986 *puch++ = 0xf8 | (uc >> 24);
1987 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1988 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1989 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1990 *puch++ = 0x80 | (uc & 0x3f);
1991 }
1992 else if (uc <= 0x7fffffff)
1993 {
1994 *puch++ = 0xfc | (uc >> 30);
1995 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1996 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1997 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1998 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1999 *puch++ = 0x80 | (uc & 0x3f);
2000 }
2001 else
2002 {
2003 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
2004 *puch++ = 0x7f;
2005 }
2006
2007 return (char *)puch;
2008}
2009RT_EXPORT_SYMBOL(RTStrPutCpInternal);
2010
2011
2012RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
2013{
2014 if (pszStart < psz)
2015 {
2016 /* simple char? */
2017 const unsigned char *puch = (const unsigned char *)psz;
2018 unsigned uch = *--puch;
2019 if (!(uch & RT_BIT(7)))
2020 return (char *)puch;
2021 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
2022
2023 /* two or more. */
2024 uint32_t uMask = 0xffffffc0;
2025 while ( (const unsigned char *)pszStart < puch
2026 && !(uMask & 1))
2027 {
2028 uch = *--puch;
2029 if ((uch & 0xc0) != 0x80)
2030 {
2031 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
2032 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
2033 (char *)pszStart);
2034 return (char *)puch;
2035 }
2036 uMask >>= 1;
2037 }
2038 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
2039 }
2040 return (char *)pszStart;
2041}
2042RT_EXPORT_SYMBOL(RTStrPrevCp);
2043
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette