VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 68316

Last change on this file since 68316 was 68316, checked in by vboxsync, 7 years ago

iprt/utf16.h: Added some more RTUtf16Big functions, introducing a handful of RTUtf16Little functions to go along with them.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 34.4 KB
Line 
1/* $Id: utf-16.cpp 68316 2017-08-07 14:19:34Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/asm.h>
36#include <iprt/mem.h>
37#include <iprt/assert.h>
38#include <iprt/err.h>
39#include "internal/string.h"
40
41
42/**
43 * Get get length in code points of an UTF-16 encoded string, validating the
44 * string while doing so.
45 *
46 * @returns IPRT status code.
47 * @param pwsz Pointer to the UTF-16 string.
48 * @param cwc The max length of the string in UTF-16 units. Use
49 * RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcwcActual Where to store the actual size of the UTF-16 string
52 * on success. Optional.
53 */
54static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
55{
56 PCRTUTF16 pwszStart = pwsz;
57 size_t cCodePoints = 0;
58 while (cwc > 0)
59 {
60 RTUTF16 wc = *pwsz;
61 if (!wc)
62 break;
63 if (wc < 0xd800 || wc > 0xdfff)
64 {
65 cCodePoints++;
66 pwsz++;
67 cwc--;
68 }
69 /* Surrogate pair: */
70 else if (wc >= 0xdc00)
71 {
72 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
73 return VERR_INVALID_UTF16_ENCODING;
74 }
75 else if (cwc < 2)
76 {
77 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
78 return VERR_INVALID_UTF16_ENCODING;
79 }
80 else
81 {
82 RTUTF16 wcTrail = pwsz[1];
83 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
84 {
85 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
86 return VERR_INVALID_UTF16_ENCODING;
87 }
88
89 cCodePoints++;
90 pwsz += 2;
91 cwc -= 2;
92 }
93 }
94
95 /* done */
96 *pcuc = cCodePoints;
97 if (pcwcActual)
98 *pcwcActual = pwsz - pwszStart;
99 return VINF_SUCCESS;
100}
101
102
103RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag)
104{
105 if (cb > sizeof(RTUTF16))
106 cb = RT_ALIGN_Z(cb, sizeof(RTUTF16));
107 else
108 cb = sizeof(RTUTF16);
109 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
110 if (pwsz)
111 *pwsz = '\0';
112 return pwsz;
113}
114RT_EXPORT_SYMBOL(RTUtf16AllocTag);
115
116
117RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag)
118{
119 PRTUTF16 pwszOld = *ppwsz;
120 cbNew = RT_ALIGN_Z(cbNew, sizeof(RTUTF16));
121 if (!cbNew)
122 {
123 RTMemFree(pwszOld);
124 *ppwsz = NULL;
125 }
126 else if (pwszOld)
127 {
128 PRTUTF16 pwszNew = (PRTUTF16)RTMemReallocTag(pwszOld, cbNew, pszTag);
129 if (!pwszNew)
130 return VERR_NO_STR_MEMORY;
131 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
132 *ppwsz = pwszNew;
133 }
134 else
135 {
136 PRTUTF16 pwszNew = (PRTUTF16)RTMemAllocTag(cbNew, pszTag);
137 if (!pwszNew)
138 return VERR_NO_UTF16_MEMORY;
139 pwszNew[0] = '\0';
140 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
141 *ppwsz = pwszNew;
142 }
143 return VINF_SUCCESS;
144}
145RT_EXPORT_SYMBOL(RTUtf16ReallocTag);
146
147
148RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
149{
150 if (pwszString)
151 RTMemTmpFree(pwszString);
152}
153RT_EXPORT_SYMBOL(RTUtf16Free);
154
155
156RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
157{
158 Assert(pwszString);
159 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
160 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
161 if (pwsz)
162 memcpy(pwsz, pwszString, cb);
163 return pwsz;
164}
165RT_EXPORT_SYMBOL(RTUtf16DupTag);
166
167
168RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
169{
170 Assert(pwszString);
171 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
172 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
173 if (pwsz)
174 {
175 memcpy(pwsz, pwszString, cb);
176 *ppwszString = pwsz;
177 return VINF_SUCCESS;
178 }
179 return VERR_NO_MEMORY;
180}
181RT_EXPORT_SYMBOL(RTUtf16DupExTag);
182
183
184RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
185{
186 if (!pwszString)
187 return 0;
188
189 PCRTUTF16 pwsz = pwszString;
190 while (*pwsz)
191 pwsz++;
192 return pwsz - pwszString;
193}
194RT_EXPORT_SYMBOL(RTUtf16Len);
195
196
197RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
198{
199 if (pwsz1 == pwsz2)
200 return 0;
201 if (!pwsz1)
202 return -1;
203 if (!pwsz2)
204 return 1;
205
206 for (;;)
207 {
208 register RTUTF16 wcs = *pwsz1;
209 register int iDiff = wcs - *pwsz2;
210 if (iDiff || !wcs)
211 return iDiff;
212 pwsz1++;
213 pwsz2++;
214 }
215}
216RT_EXPORT_SYMBOL(RTUtf16Cmp);
217
218
219RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2)
220{
221 /*
222 * NULL and empty strings are all the same.
223 */
224 if (!pwsz1)
225 return !psz2 || !*psz2 ? 0 : -1;
226 if (!psz2)
227 return !*pwsz1 ? 0 : 1;
228
229 /*
230 * Compare with a UTF-8 string by enumerating them char by char.
231 */
232 for (;;)
233 {
234 RTUNICP uc1;
235 int rc = RTUtf16GetCpEx(&pwsz1, &uc1);
236 AssertRCReturn(rc, 1);
237
238 RTUNICP uc2;
239 rc = RTStrGetCpEx(&psz2, &uc2);
240 AssertRCReturn(rc, -1);
241 if (uc1 == uc2)
242 {
243 if (uc1)
244 continue;
245 return 0;
246 }
247 return uc1 < uc2 ? -1 : 1;
248 }
249}
250RT_EXPORT_SYMBOL(RTUtf16CmpUtf8);
251
252
253RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
254{
255 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
256}
257RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
258
259
260RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
261{
262 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
263 VERR_INVALID_PARAMETER);
264 AssertPtr(pwsz);
265
266 /*
267 * Use rtUtf16Length for the job.
268 */
269 size_t cwcActual = 0; /* Shut up cc1plus. */
270 size_t cCpsIgnored;
271 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
272 if (RT_SUCCESS(rc))
273 {
274 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
275 {
276 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
277 cwcActual++;
278 if (cwcActual == cwc)
279 rc = VINF_SUCCESS;
280 else if (cwcActual < cwc)
281 rc = VERR_BUFFER_UNDERFLOW;
282 else
283 rc = VERR_BUFFER_OVERFLOW;
284 }
285 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
286 && cwcActual >= cwc)
287 rc = VERR_BUFFER_OVERFLOW;
288 }
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
292
293
294RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
295{
296 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
297 return RT_SUCCESS(rc);
298}
299RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
300
301
302/**
303 * Helper for RTUtf16PurgeComplementSet.
304 *
305 * @returns true if @a Cp is valid, false if not.
306 * @param Cp The code point to validate.
307 * @param puszValidPairs Pair of valid code point sets.
308 * @param cValidPairs Number of pairs.
309 */
310DECLINLINE(bool) rtUtf16PurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
311{
312 while (cValidPairs-- > 0)
313 {
314 if ( Cp >= puszValidPairs[0]
315 && Cp <= puszValidPairs[1])
316 return true;
317 puszValidPairs += 2;
318 }
319 return false;
320}
321
322
323RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement)
324{
325 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
326
327 /*
328 * Calc valid pairs and check that we've got an even number.
329 */
330 uint32_t cValidPairs = 0;
331 while (puszValidPairs[cValidPairs * 2])
332 {
333 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
334 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
335 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
336 cValidPairs++;
337 }
338
339 /*
340 * Do the replacing.
341 */
342 ssize_t cReplacements = 0;
343 for (;;)
344 {
345 PRTUTF16 pwszCur = pwsz;
346 RTUNICP Cp;
347 int rc = RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp);
348 if (RT_SUCCESS(rc))
349 {
350 if (Cp)
351 {
352 if (!rtUtf16PurgeIsInSet(Cp, puszValidPairs, cValidPairs))
353 {
354 for (; pwszCur != pwsz; ++pwszCur)
355 *pwszCur = chReplacement;
356 ++cReplacements;
357 }
358 }
359 else
360 break;
361 }
362 else
363 return -1;
364 }
365 return cReplacements;
366}
367RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
368
369
370/**
371 * Validate the UTF-16BE encoding and calculates the length of an UTF-8
372 * encoding.
373 *
374 * @returns iprt status code.
375 * @param pwsz The UTF-16BE string.
376 * @param cwc The max length of the UTF-16BE string to consider.
377 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
378 *
379 * @note rtUtf16LittleCalcUtf8Length | s/RT_LE2H_U16/RT_BE2H_U16/g
380 */
381static int rtUtf16BigCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
382{
383 int rc = VINF_SUCCESS;
384 size_t cch = 0;
385 while (cwc > 0)
386 {
387 RTUTF16 wc = *pwsz++; cwc--;
388 if (!wc)
389 break;
390 wc = RT_BE2H_U16(wc);
391 if (wc < 0xd800 || wc > 0xdfff)
392 {
393 if (wc < 0x80)
394 cch++;
395 else if (wc < 0x800)
396 cch += 2;
397 else if (wc < 0xfffe)
398 cch += 3;
399 else
400 {
401 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
402 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
403 break;
404 }
405 }
406 else
407 {
408 if (wc >= 0xdc00)
409 {
410 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
411 rc = VERR_INVALID_UTF16_ENCODING;
412 break;
413 }
414 if (cwc <= 0)
415 {
416 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
417 rc = VERR_INVALID_UTF16_ENCODING;
418 break;
419 }
420 wc = *pwsz++; cwc--;
421 wc = RT_BE2H_U16(wc);
422 if (wc < 0xdc00 || wc > 0xdfff)
423 {
424 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
425 rc = VERR_INVALID_UTF16_ENCODING;
426 break;
427 }
428 cch += 4;
429 }
430 }
431
432
433 /* done */
434 *pcch = cch;
435 return rc;
436}
437
438
439/**
440 * Validate the UTF-16LE encoding and calculates the length of an UTF-8
441 * encoding.
442 *
443 * @returns iprt status code.
444 * @param pwsz The UTF-16LE string.
445 * @param cwc The max length of the UTF-16LE string to consider.
446 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
447 *
448 * @note rtUtf16BigCalcUtf8Length | s/RT_BE2H_U16/RT_LE2H_U16/g
449 */
450static int rtUtf16LittleCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
451{
452 int rc = VINF_SUCCESS;
453 size_t cch = 0;
454 while (cwc > 0)
455 {
456 RTUTF16 wc = *pwsz++; cwc--;
457 if (!wc)
458 break;
459 wc = RT_LE2H_U16(wc);
460 if (wc < 0xd800 || wc > 0xdfff)
461 {
462 if (wc < 0x80)
463 cch++;
464 else if (wc < 0x800)
465 cch += 2;
466 else if (wc < 0xfffe)
467 cch += 3;
468 else
469 {
470 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
471 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
472 break;
473 }
474 }
475 else
476 {
477 if (wc >= 0xdc00)
478 {
479 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
480 rc = VERR_INVALID_UTF16_ENCODING;
481 break;
482 }
483 if (cwc <= 0)
484 {
485 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
486 rc = VERR_INVALID_UTF16_ENCODING;
487 break;
488 }
489 wc = *pwsz++; cwc--;
490 wc = RT_LE2H_U16(wc);
491 if (wc < 0xdc00 || wc > 0xdfff)
492 {
493 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
494 rc = VERR_INVALID_UTF16_ENCODING;
495 break;
496 }
497 cch += 4;
498 }
499 }
500
501
502 /* done */
503 *pcch = cch;
504 return rc;
505}
506
507
508/**
509 * Recodes an valid UTF-16BE string as UTF-8.
510 *
511 * @returns iprt status code.
512 * @param pwsz The UTF-16BE string.
513 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
514 * will stop when cwc or '\\0' is reached.
515 * @param psz Where to store the UTF-8 string.
516 * @param cch The size of the UTF-8 buffer, excluding the terminator.
517 * @param pcch Where to store the number of octets actually encoded.
518 *
519 * @note rtUtf16LittleRecodeAsUtf8 == s/RT_BE2H_U16/RT_LE2H_U16/g
520 */
521static int rtUtf16BigRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
522{
523 unsigned char *pwch = (unsigned char *)psz;
524 int rc = VINF_SUCCESS;
525 while (cwc > 0)
526 {
527 RTUTF16 wc = *pwsz++; cwc--;
528 if (!wc)
529 break;
530 wc = RT_BE2H_U16(wc);
531 if (wc < 0xd800 || wc > 0xdfff)
532 {
533 if (wc < 0x80)
534 {
535 if (RT_UNLIKELY(cch < 1))
536 {
537 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
538 rc = VERR_BUFFER_OVERFLOW;
539 break;
540 }
541 cch--;
542 *pwch++ = (unsigned char)wc;
543 }
544 else if (wc < 0x800)
545 {
546 if (RT_UNLIKELY(cch < 2))
547 {
548 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
549 rc = VERR_BUFFER_OVERFLOW;
550 break;
551 }
552 cch -= 2;
553 *pwch++ = 0xc0 | (wc >> 6);
554 *pwch++ = 0x80 | (wc & 0x3f);
555 }
556 else if (wc < 0xfffe)
557 {
558 if (RT_UNLIKELY(cch < 3))
559 {
560 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
561 rc = VERR_BUFFER_OVERFLOW;
562 break;
563 }
564 cch -= 3;
565 *pwch++ = 0xe0 | (wc >> 12);
566 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
567 *pwch++ = 0x80 | (wc & 0x3f);
568 }
569 else
570 {
571 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
572 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
573 break;
574 }
575 }
576 else
577 {
578 if (wc >= 0xdc00)
579 {
580 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
581 rc = VERR_INVALID_UTF16_ENCODING;
582 break;
583 }
584 if (cwc <= 0)
585 {
586 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
587 rc = VERR_INVALID_UTF16_ENCODING;
588 break;
589 }
590 RTUTF16 wc2 = *pwsz++; cwc--;
591 wc2 = RT_BE2H_U16(wc2);
592 if (wc2 < 0xdc00 || wc2 > 0xdfff)
593 {
594 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
595 rc = VERR_INVALID_UTF16_ENCODING;
596 break;
597 }
598 uint32_t CodePoint = 0x10000
599 + ( ((wc & 0x3ff) << 10)
600 | (wc2 & 0x3ff));
601 if (RT_UNLIKELY(cch < 4))
602 {
603 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
604 rc = VERR_BUFFER_OVERFLOW;
605 break;
606 }
607 cch -= 4;
608 *pwch++ = 0xf0 | (CodePoint >> 18);
609 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
610 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
611 *pwch++ = 0x80 | (CodePoint & 0x3f);
612 }
613 }
614
615 /* done */
616 *pwch = '\0';
617 *pcch = (char *)pwch - psz;
618 return rc;
619}
620
621
622/**
623 * Recodes an valid UTF-16LE string as UTF-8.
624 *
625 * @returns iprt status code.
626 * @param pwsz The UTF-16LE string.
627 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
628 * will stop when cwc or '\\0' is reached.
629 * @param psz Where to store the UTF-8 string.
630 * @param cch The size of the UTF-8 buffer, excluding the terminator.
631 * @param pcch Where to store the number of octets actually encoded.
632 *
633 * @note rtUtf16LittleRecodeAsUtf8 == s/RT_LE2H_U16/RT_GE2H_U16/g
634 */
635static int rtUtf16LittleRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
636{
637 unsigned char *pwch = (unsigned char *)psz;
638 int rc = VINF_SUCCESS;
639 while (cwc > 0)
640 {
641 RTUTF16 wc = *pwsz++; cwc--;
642 if (!wc)
643 break;
644 wc = RT_LE2H_U16(wc);
645 if (wc < 0xd800 || wc > 0xdfff)
646 {
647 if (wc < 0x80)
648 {
649 if (RT_UNLIKELY(cch < 1))
650 {
651 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
652 rc = VERR_BUFFER_OVERFLOW;
653 break;
654 }
655 cch--;
656 *pwch++ = (unsigned char)wc;
657 }
658 else if (wc < 0x800)
659 {
660 if (RT_UNLIKELY(cch < 2))
661 {
662 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
663 rc = VERR_BUFFER_OVERFLOW;
664 break;
665 }
666 cch -= 2;
667 *pwch++ = 0xc0 | (wc >> 6);
668 *pwch++ = 0x80 | (wc & 0x3f);
669 }
670 else if (wc < 0xfffe)
671 {
672 if (RT_UNLIKELY(cch < 3))
673 {
674 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
675 rc = VERR_BUFFER_OVERFLOW;
676 break;
677 }
678 cch -= 3;
679 *pwch++ = 0xe0 | (wc >> 12);
680 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
681 *pwch++ = 0x80 | (wc & 0x3f);
682 }
683 else
684 {
685 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
686 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
687 break;
688 }
689 }
690 else
691 {
692 if (wc >= 0xdc00)
693 {
694 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
695 rc = VERR_INVALID_UTF16_ENCODING;
696 break;
697 }
698 if (cwc <= 0)
699 {
700 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
701 rc = VERR_INVALID_UTF16_ENCODING;
702 break;
703 }
704 RTUTF16 wc2 = *pwsz++; cwc--;
705 wc2 = RT_LE2H_U16(wc2);
706 if (wc2 < 0xdc00 || wc2 > 0xdfff)
707 {
708 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
709 rc = VERR_INVALID_UTF16_ENCODING;
710 break;
711 }
712 uint32_t CodePoint = 0x10000
713 + ( ((wc & 0x3ff) << 10)
714 | (wc2 & 0x3ff));
715 if (RT_UNLIKELY(cch < 4))
716 {
717 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
718 rc = VERR_BUFFER_OVERFLOW;
719 break;
720 }
721 cch -= 4;
722 *pwch++ = 0xf0 | (CodePoint >> 18);
723 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
724 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
725 *pwch++ = 0x80 | (CodePoint & 0x3f);
726 }
727 }
728
729 /* done */
730 *pwch = '\0';
731 *pcch = (char *)pwch - psz;
732 return rc;
733}
734
735
736
737RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
738{
739 /*
740 * Validate input.
741 */
742 Assert(VALID_PTR(ppszString));
743 Assert(VALID_PTR(pwszString));
744 *ppszString = NULL;
745
746 /*
747 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
748 */
749 size_t cch;
750#ifdef RT_BIG_ENDIAN
751 int rc = rtUtf16BigCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
752#else
753 int rc = rtUtf16LittleCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
754#endif
755 if (RT_SUCCESS(rc))
756 {
757 /*
758 * Allocate buffer and recode it.
759 */
760 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
761 if (pszResult)
762 {
763#ifdef RT_BIG_ENDIAN
764 rc = rtUtf16BigRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
765#else
766 rc = rtUtf16LittleRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
767#endif
768 if (RT_SUCCESS(rc))
769 {
770 *ppszString = pszResult;
771 return rc;
772 }
773
774 RTMemFree(pszResult);
775 }
776 else
777 rc = VERR_NO_STR_MEMORY;
778 }
779 return rc;
780}
781RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
782
783
784RTDECL(int) RTUtf16BigToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
785{
786 /*
787 * Validate input.
788 */
789 Assert(VALID_PTR(ppszString));
790 Assert(VALID_PTR(pwszString));
791 *ppszString = NULL;
792
793 /*
794 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
795 */
796 size_t cch;
797 int rc = rtUtf16BigCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
798 if (RT_SUCCESS(rc))
799 {
800 /*
801 * Allocate buffer and recode it.
802 */
803 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
804 if (pszResult)
805 {
806 rc = rtUtf16BigRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
807 if (RT_SUCCESS(rc))
808 {
809 *ppszString = pszResult;
810 return rc;
811 }
812
813 RTMemFree(pszResult);
814 }
815 else
816 rc = VERR_NO_STR_MEMORY;
817 }
818 return rc;
819}
820RT_EXPORT_SYMBOL(RTUtf16BigToUtf8Tag);
821
822
823RTDECL(int) RTUtf16LittleToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
824{
825 /*
826 * Validate input.
827 */
828 Assert(VALID_PTR(ppszString));
829 Assert(VALID_PTR(pwszString));
830 *ppszString = NULL;
831
832 /*
833 * Validate the UTF-16LE string and calculate the length of the UTF-8 encoding of it.
834 */
835 size_t cch;
836 int rc = rtUtf16LittleCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
837 if (RT_SUCCESS(rc))
838 {
839 /*
840 * Allocate buffer and recode it.
841 */
842 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
843 if (pszResult)
844 {
845 rc = rtUtf16LittleRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
846 if (RT_SUCCESS(rc))
847 {
848 *ppszString = pszResult;
849 return rc;
850 }
851
852 RTMemFree(pszResult);
853 }
854 else
855 rc = VERR_NO_STR_MEMORY;
856 }
857 return rc;
858}
859RT_EXPORT_SYMBOL(RTUtf16BigToUtf8Tag);
860
861
862RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
863{
864 /*
865 * Validate input.
866 */
867 AssertPtr(pwszString);
868 AssertPtr(ppsz);
869 AssertPtrNull(pcch);
870
871 /*
872 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
873 */
874 size_t cchResult;
875#ifdef RT_BIG_ENDIAN
876 int rc = rtUtf16BigCalcUtf8Length(pwszString, cwcString, &cchResult);
877#else
878 int rc = rtUtf16LittleCalcUtf8Length(pwszString, cwcString, &cchResult);
879#endif
880 if (RT_SUCCESS(rc))
881 {
882 if (pcch)
883 *pcch = cchResult;
884
885 /*
886 * Check buffer size / Allocate buffer and recode it.
887 */
888 bool fShouldFree;
889 char *pszResult;
890 if (cch > 0 && *ppsz)
891 {
892 fShouldFree = false;
893 if (RT_UNLIKELY(cch <= cchResult))
894 return VERR_BUFFER_OVERFLOW;
895 pszResult = *ppsz;
896 }
897 else
898 {
899 *ppsz = NULL;
900 fShouldFree = true;
901 cch = RT_MAX(cch, cchResult + 1);
902 pszResult = (char *)RTStrAllocTag(cch, pszTag);
903 }
904 if (pszResult)
905 {
906#ifdef RT_BIG_ENDIAN
907 rc = rtUtf16BigRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
908#else
909 rc = rtUtf16LittleRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
910#endif
911 if (RT_SUCCESS(rc))
912 {
913 *ppsz = pszResult;
914 return rc;
915 }
916
917 if (fShouldFree)
918 RTStrFree(pszResult);
919 }
920 else
921 rc = VERR_NO_STR_MEMORY;
922 }
923 return rc;
924}
925RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
926
927
928RTDECL(int) RTUtf16BigToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
929{
930 /*
931 * Validate input.
932 */
933 AssertPtr(pwszString);
934 AssertPtr(ppsz);
935 AssertPtrNull(pcch);
936
937 /*
938 * Validate the UTF-16BE string and calculate the length of the UTF-8 encoding of it.
939 */
940 size_t cchResult;
941 int rc = rtUtf16BigCalcUtf8Length(pwszString, cwcString, &cchResult);
942 if (RT_SUCCESS(rc))
943 {
944 if (pcch)
945 *pcch = cchResult;
946
947 /*
948 * Check buffer size / Allocate buffer and recode it.
949 */
950 bool fShouldFree;
951 char *pszResult;
952 if (cch > 0 && *ppsz)
953 {
954 fShouldFree = false;
955 if (RT_UNLIKELY(cch <= cchResult))
956 return VERR_BUFFER_OVERFLOW;
957 pszResult = *ppsz;
958 }
959 else
960 {
961 *ppsz = NULL;
962 fShouldFree = true;
963 cch = RT_MAX(cch, cchResult + 1);
964 pszResult = (char *)RTStrAllocTag(cch, pszTag);
965 }
966 if (pszResult)
967 {
968 rc = rtUtf16BigRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
969 if (RT_SUCCESS(rc))
970 {
971 *ppsz = pszResult;
972 return rc;
973 }
974
975 if (fShouldFree)
976 RTStrFree(pszResult);
977 }
978 else
979 rc = VERR_NO_STR_MEMORY;
980 }
981 return rc;
982}
983RT_EXPORT_SYMBOL(RTUtf16BigToUtf8ExTag);
984
985
986RTDECL(int) RTUtf16LittleToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch,
987 const char *pszTag)
988{
989 /*
990 * Validate input.
991 */
992 AssertPtr(pwszString);
993 AssertPtr(ppsz);
994 AssertPtrNull(pcch);
995
996 /*
997 * Validate the UTF-16LE string and calculate the length of the UTF-8 encoding of it.
998 */
999 size_t cchResult;
1000 int rc = rtUtf16LittleCalcUtf8Length(pwszString, cwcString, &cchResult);
1001 if (RT_SUCCESS(rc))
1002 {
1003 if (pcch)
1004 *pcch = cchResult;
1005
1006 /*
1007 * Check buffer size / Allocate buffer and recode it.
1008 */
1009 bool fShouldFree;
1010 char *pszResult;
1011 if (cch > 0 && *ppsz)
1012 {
1013 fShouldFree = false;
1014 if (RT_UNLIKELY(cch <= cchResult))
1015 return VERR_BUFFER_OVERFLOW;
1016 pszResult = *ppsz;
1017 }
1018 else
1019 {
1020 *ppsz = NULL;
1021 fShouldFree = true;
1022 cch = RT_MAX(cch, cchResult + 1);
1023 pszResult = (char *)RTStrAllocTag(cch, pszTag);
1024 }
1025 if (pszResult)
1026 {
1027 rc = rtUtf16LittleRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
1028 if (RT_SUCCESS(rc))
1029 {
1030 *ppsz = pszResult;
1031 return rc;
1032 }
1033
1034 if (fShouldFree)
1035 RTStrFree(pszResult);
1036 }
1037 else
1038 rc = VERR_NO_STR_MEMORY;
1039 }
1040 return rc;
1041}
1042RT_EXPORT_SYMBOL(RTUtf16BigToUtf8ExTag);
1043
1044
1045RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
1046{
1047 size_t cch;
1048#ifdef RT_BIG_ENDIAN
1049 int rc = rtUtf16BigCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1050#else
1051 int rc = rtUtf16LittleCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1052#endif
1053 return RT_SUCCESS(rc) ? cch : 0;
1054}
1055RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
1056
1057
1058RTDECL(size_t) RTUtf16BigCalcUtf8Len(PCRTUTF16 pwsz)
1059{
1060 size_t cch;
1061 int rc = rtUtf16BigCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1062 return RT_SUCCESS(rc) ? cch : 0;
1063}
1064RT_EXPORT_SYMBOL(RTUtf16BigCalcUtf8Len);
1065
1066
1067RTDECL(size_t) RTUtf16LittleCalcUtf8Len(PCRTUTF16 pwsz)
1068{
1069 size_t cch;
1070 int rc = rtUtf16LittleCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1071 return RT_SUCCESS(rc) ? cch : 0;
1072}
1073RT_EXPORT_SYMBOL(RTUtf16LittleCalcUtf8Len);
1074
1075
1076RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1077{
1078 size_t cch;
1079#ifdef RT_BIG_ENDIAN
1080 int rc = rtUtf16BigCalcUtf8Length(pwsz, cwc, &cch);
1081#else
1082 int rc = rtUtf16LittleCalcUtf8Length(pwsz, cwc, &cch);
1083#endif
1084 if (pcch)
1085 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1086 return rc;
1087}
1088RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
1089
1090
1091RTDECL(int) RTUtf16BigCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1092{
1093 size_t cch;
1094 int rc = rtUtf16BigCalcUtf8Length(pwsz, cwc, &cch);
1095 if (pcch)
1096 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1097 return rc;
1098}
1099RT_EXPORT_SYMBOL(RTUtf16BigCalcUtf8LenEx);
1100
1101
1102RTDECL(int) RTUtf16LittleCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1103{
1104 size_t cch;
1105 int rc = rtUtf16LittleCalcUtf8Length(pwsz, cwc, &cch);
1106 if (pcch)
1107 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1108 return rc;
1109}
1110RT_EXPORT_SYMBOL(RTUtf16LittleCalcUtf8LenEx);
1111
1112
1113RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
1114{
1115 const RTUTF16 wc = *pwsz;
1116
1117 /* simple */
1118 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1119 return wc;
1120 if (wc < 0xfffe)
1121 {
1122 /* surrogate pair */
1123 if (wc < 0xdc00)
1124 {
1125 const RTUTF16 wc2 = pwsz[1];
1126 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1127 {
1128 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1129 return uc;
1130 }
1131
1132 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1133 }
1134 else
1135 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1136 }
1137 else
1138 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1139 return RTUNICP_INVALID;
1140}
1141RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
1142
1143
1144RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1145{
1146 const RTUTF16 wc = **ppwsz;
1147
1148 /* simple */
1149 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1150 {
1151 (*ppwsz)++;
1152 *pCp = wc;
1153 return VINF_SUCCESS;
1154 }
1155
1156 int rc;
1157 if (wc < 0xfffe)
1158 {
1159 /* surrogate pair */
1160 if (wc < 0xdc00)
1161 {
1162 const RTUTF16 wc2 = (*ppwsz)[1];
1163 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1164 {
1165 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1166 *pCp = uc;
1167 (*ppwsz) += 2;
1168 return VINF_SUCCESS;
1169 }
1170
1171 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1172 }
1173 else
1174 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1175 rc = VERR_INVALID_UTF16_ENCODING;
1176 }
1177 else
1178 {
1179 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1180 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1181 }
1182 *pCp = RTUNICP_INVALID;
1183 (*ppwsz)++;
1184 return rc;
1185}
1186RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
1187
1188
1189RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1190{
1191 const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
1192
1193 /* simple */
1194 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1195 {
1196 (*ppwsz)++;
1197 *pCp = wc;
1198 return VINF_SUCCESS;
1199 }
1200
1201 int rc;
1202 if (wc < 0xfffe)
1203 {
1204 /* surrogate pair */
1205 if (wc < 0xdc00)
1206 {
1207 const RTUTF16 wc2 = RT_BE2H_U16((*ppwsz)[1]);
1208 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1209 {
1210 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1211 *pCp = uc;
1212 (*ppwsz) += 2;
1213 return VINF_SUCCESS;
1214 }
1215
1216 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1217 }
1218 else
1219 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1220 rc = VERR_INVALID_UTF16_ENCODING;
1221 }
1222 else
1223 {
1224 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1225 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1226 }
1227 *pCp = RTUNICP_INVALID;
1228 (*ppwsz)++;
1229 return rc;
1230}
1231RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
1232
1233
1234RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
1235{
1236 /* simple */
1237 if ( CodePoint < 0xd800
1238 || ( CodePoint > 0xdfff
1239 && CodePoint < 0xfffe))
1240 {
1241 *pwsz++ = (RTUTF16)CodePoint;
1242 return pwsz;
1243 }
1244
1245 /* surrogate pair */
1246 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
1247 {
1248 CodePoint -= 0x10000;
1249 *pwsz++ = 0xd800 | (CodePoint >> 10);
1250 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
1251 return pwsz;
1252 }
1253
1254 /* invalid code point. */
1255 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
1256 *pwsz++ = 0x7f;
1257 return pwsz;
1258}
1259RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
1260
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette