VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 67979

Last change on this file since 67979 was 67391, checked in by vboxsync, 8 years ago

IPRT: More ISO maker code.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 30.2 KB
Line 
1/* $Id: utf-16.cpp 67391 2017-06-14 12:13:48Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/asm.h>
36#include <iprt/mem.h>
37#include <iprt/assert.h>
38#include <iprt/err.h>
39#include "internal/string.h"
40
41
42/**
43 * Get get length in code points of an UTF-16 encoded string, validating the
44 * string while doing so.
45 *
46 * @returns IPRT status code.
47 * @param pwsz Pointer to the UTF-16 string.
48 * @param cwc The max length of the string in UTF-16 units. Use
49 * RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcwcActual Where to store the actual size of the UTF-16 string
52 * on success. Optional.
53 */
54static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
55{
56 PCRTUTF16 pwszStart = pwsz;
57 size_t cCodePoints = 0;
58 while (cwc > 0)
59 {
60 RTUTF16 wc = *pwsz;
61 if (!wc)
62 break;
63 if (wc < 0xd800 || wc > 0xdfff)
64 {
65 cCodePoints++;
66 pwsz++;
67 cwc--;
68 }
69 /* Surrogate pair: */
70 else if (wc >= 0xdc00)
71 {
72 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
73 return VERR_INVALID_UTF16_ENCODING;
74 }
75 else if (cwc < 2)
76 {
77 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
78 return VERR_INVALID_UTF16_ENCODING;
79 }
80 else
81 {
82 RTUTF16 wcTrail = pwsz[1];
83 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
84 {
85 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
86 return VERR_INVALID_UTF16_ENCODING;
87 }
88
89 cCodePoints++;
90 pwsz += 2;
91 cwc -= 2;
92 }
93 }
94
95 /* done */
96 *pcuc = cCodePoints;
97 if (pcwcActual)
98 *pcwcActual = pwsz - pwszStart;
99 return VINF_SUCCESS;
100}
101
102
103RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag)
104{
105 if (cb > sizeof(RTUTF16))
106 cb = RT_ALIGN_Z(cb, sizeof(RTUTF16));
107 else
108 cb = sizeof(RTUTF16);
109 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
110 if (pwsz)
111 *pwsz = '\0';
112 return pwsz;
113}
114RT_EXPORT_SYMBOL(RTUtf16AllocTag);
115
116
117RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag)
118{
119 PRTUTF16 pwszOld = *ppwsz;
120 cbNew = RT_ALIGN_Z(cbNew, sizeof(RTUTF16));
121 if (!cbNew)
122 {
123 RTMemFree(pwszOld);
124 *ppwsz = NULL;
125 }
126 else if (pwszOld)
127 {
128 PRTUTF16 pwszNew = (PRTUTF16)RTMemReallocTag(pwszOld, cbNew, pszTag);
129 if (!pwszNew)
130 return VERR_NO_STR_MEMORY;
131 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
132 *ppwsz = pwszNew;
133 }
134 else
135 {
136 PRTUTF16 pwszNew = (PRTUTF16)RTMemAllocTag(cbNew, pszTag);
137 if (!pwszNew)
138 return VERR_NO_UTF16_MEMORY;
139 pwszNew[0] = '\0';
140 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
141 *ppwsz = pwszNew;
142 }
143 return VINF_SUCCESS;
144}
145RT_EXPORT_SYMBOL(RTUtf16ReallocTag);
146
147
148RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
149{
150 if (pwszString)
151 RTMemTmpFree(pwszString);
152}
153RT_EXPORT_SYMBOL(RTUtf16Free);
154
155
156RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
157{
158 Assert(pwszString);
159 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
160 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
161 if (pwsz)
162 memcpy(pwsz, pwszString, cb);
163 return pwsz;
164}
165RT_EXPORT_SYMBOL(RTUtf16DupTag);
166
167
168RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
169{
170 Assert(pwszString);
171 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
172 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
173 if (pwsz)
174 {
175 memcpy(pwsz, pwszString, cb);
176 *ppwszString = pwsz;
177 return VINF_SUCCESS;
178 }
179 return VERR_NO_MEMORY;
180}
181RT_EXPORT_SYMBOL(RTUtf16DupExTag);
182
183
184RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
185{
186 if (!pwszString)
187 return 0;
188
189 PCRTUTF16 pwsz = pwszString;
190 while (*pwsz)
191 pwsz++;
192 return pwsz - pwszString;
193}
194RT_EXPORT_SYMBOL(RTUtf16Len);
195
196
197RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
198{
199 if (pwsz1 == pwsz2)
200 return 0;
201 if (!pwsz1)
202 return -1;
203 if (!pwsz2)
204 return 1;
205
206 for (;;)
207 {
208 register RTUTF16 wcs = *pwsz1;
209 register int iDiff = wcs - *pwsz2;
210 if (iDiff || !wcs)
211 return iDiff;
212 pwsz1++;
213 pwsz2++;
214 }
215}
216RT_EXPORT_SYMBOL(RTUtf16Cmp);
217
218
219RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2)
220{
221 /*
222 * NULL and empty strings are all the same.
223 */
224 if (!pwsz1)
225 return !psz2 || !*psz2 ? 0 : -1;
226 if (!psz2)
227 return !*pwsz1 ? 0 : 1;
228
229 /*
230 * Compare with a UTF-8 string by enumerating them char by char.
231 */
232 for (;;)
233 {
234 RTUNICP uc1;
235 int rc = RTUtf16GetCpEx(&pwsz1, &uc1);
236 AssertRCReturn(rc, 1);
237
238 RTUNICP uc2;
239 rc = RTStrGetCpEx(&psz2, &uc2);
240 AssertRCReturn(rc, -1);
241 if (uc1 == uc2)
242 {
243 if (uc1)
244 continue;
245 return 0;
246 }
247 return uc1 < uc2 ? -1 : 1;
248 }
249}
250RT_EXPORT_SYMBOL(RTUtf16CmpUtf8);
251
252
253RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
254{
255 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
256}
257RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
258
259
260RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
261{
262 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
263 VERR_INVALID_PARAMETER);
264 AssertPtr(pwsz);
265
266 /*
267 * Use rtUtf16Length for the job.
268 */
269 size_t cwcActual = 0; /* Shut up cc1plus. */
270 size_t cCpsIgnored;
271 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
272 if (RT_SUCCESS(rc))
273 {
274 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
275 {
276 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
277 cwcActual++;
278 if (cwcActual == cwc)
279 rc = VINF_SUCCESS;
280 else if (cwcActual < cwc)
281 rc = VERR_BUFFER_UNDERFLOW;
282 else
283 rc = VERR_BUFFER_OVERFLOW;
284 }
285 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
286 && cwcActual >= cwc)
287 rc = VERR_BUFFER_OVERFLOW;
288 }
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
292
293
294RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
295{
296 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
297 return RT_SUCCESS(rc);
298}
299RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
300
301
302/**
303 * Helper for RTUtf16PurgeComplementSet.
304 *
305 * @returns true if @a Cp is valid, false if not.
306 * @param Cp The code point to validate.
307 * @param puszValidPairs Pair of valid code point sets.
308 * @param cValidPairs Number of pairs.
309 */
310DECLINLINE(bool) rtUtf16PurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
311{
312 while (cValidPairs-- > 0)
313 {
314 if ( Cp >= puszValidPairs[0]
315 && Cp <= puszValidPairs[1])
316 return true;
317 puszValidPairs += 2;
318 }
319 return false;
320}
321
322
323RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement)
324{
325 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
326
327 /*
328 * Calc valid pairs and check that we've got an even number.
329 */
330 uint32_t cValidPairs = 0;
331 while (puszValidPairs[cValidPairs * 2])
332 {
333 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
334 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
335 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
336 cValidPairs++;
337 }
338
339 /*
340 * Do the replacing.
341 */
342 ssize_t cReplacements = 0;
343 for (;;)
344 {
345 PRTUTF16 pwszCur = pwsz;
346 RTUNICP Cp;
347 int rc = RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp);
348 if (RT_SUCCESS(rc))
349 {
350 if (Cp)
351 {
352 if (!rtUtf16PurgeIsInSet(Cp, puszValidPairs, cValidPairs))
353 {
354 for (; pwszCur != pwsz; ++pwszCur)
355 *pwszCur = chReplacement;
356 ++cReplacements;
357 }
358 }
359 else
360 break;
361 }
362 else
363 return -1;
364 }
365 return cReplacements;
366}
367RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
368
369
370/**
371 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
372 *
373 * @returns iprt status code.
374 * @param pwsz The UTF-16 string.
375 * @param cwc The max length of the UTF-16 string to consider.
376 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
377 *
378 * @note rtUtf16BigCalcUtf8Length is a copy of this.
379 */
380static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
381{
382 int rc = VINF_SUCCESS;
383 size_t cch = 0;
384 while (cwc > 0)
385 {
386 RTUTF16 wc = *pwsz++; cwc--;
387 if (!wc)
388 break;
389 if (wc < 0xd800 || wc > 0xdfff)
390 {
391 if (wc < 0x80)
392 cch++;
393 else if (wc < 0x800)
394 cch += 2;
395 else if (wc < 0xfffe)
396 cch += 3;
397 else
398 {
399 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
400 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
401 break;
402 }
403 }
404 else
405 {
406 if (wc >= 0xdc00)
407 {
408 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
409 rc = VERR_INVALID_UTF16_ENCODING;
410 break;
411 }
412 if (cwc <= 0)
413 {
414 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
415 rc = VERR_INVALID_UTF16_ENCODING;
416 break;
417 }
418 wc = *pwsz++; cwc--;
419 if (wc < 0xdc00 || wc > 0xdfff)
420 {
421 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
422 rc = VERR_INVALID_UTF16_ENCODING;
423 break;
424 }
425 cch += 4;
426 }
427 }
428
429
430 /* done */
431 *pcch = cch;
432 return rc;
433}
434
435
436/**
437 * Validate the UTF-16BE encoding and calculates the length of an UTF-8
438 * encoding.
439 *
440 * @returns iprt status code.
441 * @param pwsz The UTF-16 string.
442 * @param cwc The max length of the UTF-16BE string to consider.
443 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
444 *
445 * @note Code is a copy of rtUtf16CalcUtf8Length, but with two RT_BE2H_U16
446 * invocations inserted.
447 */
448static int rtUtf16BigCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
449{
450 int rc = VINF_SUCCESS;
451 size_t cch = 0;
452 while (cwc > 0)
453 {
454 RTUTF16 wc = *pwsz++; cwc--;
455 if (!wc)
456 break;
457 wc = RT_BE2H_U16(wc);
458 if (wc < 0xd800 || wc > 0xdfff)
459 {
460 if (wc < 0x80)
461 cch++;
462 else if (wc < 0x800)
463 cch += 2;
464 else if (wc < 0xfffe)
465 cch += 3;
466 else
467 {
468 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
469 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
470 break;
471 }
472 }
473 else
474 {
475 if (wc >= 0xdc00)
476 {
477 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
478 rc = VERR_INVALID_UTF16_ENCODING;
479 break;
480 }
481 if (cwc <= 0)
482 {
483 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
484 rc = VERR_INVALID_UTF16_ENCODING;
485 break;
486 }
487 wc = *pwsz++; cwc--;
488 wc = RT_BE2H_U16(wc);
489 if (wc < 0xdc00 || wc > 0xdfff)
490 {
491 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
492 rc = VERR_INVALID_UTF16_ENCODING;
493 break;
494 }
495 cch += 4;
496 }
497 }
498
499
500 /* done */
501 *pcch = cch;
502 return rc;
503}
504
505
506/**
507 * Recodes an valid UTF-16 string as UTF-8.
508 *
509 * @returns iprt status code.
510 * @param pwsz The UTF-16 string.
511 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
512 * will stop when cwc or '\\0' is reached.
513 * @param psz Where to store the UTF-8 string.
514 * @param cch The size of the UTF-8 buffer, excluding the terminator.
515 * @param pcch Where to store the number of octets actually encoded.
516 * @note rtUtf16BigRecodeAsUtf8 is a copy of this.
517 */
518static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
519{
520 unsigned char *pwch = (unsigned char *)psz;
521 int rc = VINF_SUCCESS;
522 while (cwc > 0)
523 {
524 RTUTF16 wc = *pwsz++; cwc--;
525 if (!wc)
526 break;
527 if (wc < 0xd800 || wc > 0xdfff)
528 {
529 if (wc < 0x80)
530 {
531 if (RT_UNLIKELY(cch < 1))
532 {
533 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
534 rc = VERR_BUFFER_OVERFLOW;
535 break;
536 }
537 cch--;
538 *pwch++ = (unsigned char)wc;
539 }
540 else if (wc < 0x800)
541 {
542 if (RT_UNLIKELY(cch < 2))
543 {
544 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
545 rc = VERR_BUFFER_OVERFLOW;
546 break;
547 }
548 cch -= 2;
549 *pwch++ = 0xc0 | (wc >> 6);
550 *pwch++ = 0x80 | (wc & 0x3f);
551 }
552 else if (wc < 0xfffe)
553 {
554 if (RT_UNLIKELY(cch < 3))
555 {
556 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
557 rc = VERR_BUFFER_OVERFLOW;
558 break;
559 }
560 cch -= 3;
561 *pwch++ = 0xe0 | (wc >> 12);
562 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
563 *pwch++ = 0x80 | (wc & 0x3f);
564 }
565 else
566 {
567 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
568 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
569 break;
570 }
571 }
572 else
573 {
574 if (wc >= 0xdc00)
575 {
576 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
577 rc = VERR_INVALID_UTF16_ENCODING;
578 break;
579 }
580 if (cwc <= 0)
581 {
582 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
583 rc = VERR_INVALID_UTF16_ENCODING;
584 break;
585 }
586 RTUTF16 wc2 = *pwsz++; cwc--;
587 if (wc2 < 0xdc00 || wc2 > 0xdfff)
588 {
589 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
590 rc = VERR_INVALID_UTF16_ENCODING;
591 break;
592 }
593 uint32_t CodePoint = 0x10000
594 + ( ((wc & 0x3ff) << 10)
595 | (wc2 & 0x3ff));
596 if (RT_UNLIKELY(cch < 4))
597 {
598 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
599 rc = VERR_BUFFER_OVERFLOW;
600 break;
601 }
602 cch -= 4;
603 *pwch++ = 0xf0 | (CodePoint >> 18);
604 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
605 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
606 *pwch++ = 0x80 | (CodePoint & 0x3f);
607 }
608 }
609
610 /* done */
611 *pwch = '\0';
612 *pcch = (char *)pwch - psz;
613 return rc;
614}
615
616
617/**
618 * Recodes an valid UTF-16BE string as UTF-8.
619 *
620 * @returns iprt status code.
621 * @param pwsz The UTF-16BE string.
622 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
623 * will stop when cwc or '\\0' is reached.
624 * @param psz Where to store the UTF-8 string.
625 * @param cch The size of the UTF-8 buffer, excluding the terminator.
626 * @param pcch Where to store the number of octets actually encoded.
627 *
628 * @note Copy of rtUtf16RecodeAsUtf8 with a few RT_BE2H_U16 invocations
629 * insterted.
630 */
631static int rtUtf16BigRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
632{
633 unsigned char *pwch = (unsigned char *)psz;
634 int rc = VINF_SUCCESS;
635 while (cwc > 0)
636 {
637 RTUTF16 wc = *pwsz++; cwc--;
638 if (!wc)
639 break;
640 wc = RT_BE2H_U16(wc);
641 if (wc < 0xd800 || wc > 0xdfff)
642 {
643 if (wc < 0x80)
644 {
645 if (RT_UNLIKELY(cch < 1))
646 {
647 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
648 rc = VERR_BUFFER_OVERFLOW;
649 break;
650 }
651 cch--;
652 *pwch++ = (unsigned char)wc;
653 }
654 else if (wc < 0x800)
655 {
656 if (RT_UNLIKELY(cch < 2))
657 {
658 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
659 rc = VERR_BUFFER_OVERFLOW;
660 break;
661 }
662 cch -= 2;
663 *pwch++ = 0xc0 | (wc >> 6);
664 *pwch++ = 0x80 | (wc & 0x3f);
665 }
666 else if (wc < 0xfffe)
667 {
668 if (RT_UNLIKELY(cch < 3))
669 {
670 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
671 rc = VERR_BUFFER_OVERFLOW;
672 break;
673 }
674 cch -= 3;
675 *pwch++ = 0xe0 | (wc >> 12);
676 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
677 *pwch++ = 0x80 | (wc & 0x3f);
678 }
679 else
680 {
681 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
682 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
683 break;
684 }
685 }
686 else
687 {
688 if (wc >= 0xdc00)
689 {
690 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
691 rc = VERR_INVALID_UTF16_ENCODING;
692 break;
693 }
694 if (cwc <= 0)
695 {
696 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
697 rc = VERR_INVALID_UTF16_ENCODING;
698 break;
699 }
700 RTUTF16 wc2 = *pwsz++; cwc--;
701 wc2 = RT_BE2H_U16(wc2);
702 if (wc2 < 0xdc00 || wc2 > 0xdfff)
703 {
704 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
705 rc = VERR_INVALID_UTF16_ENCODING;
706 break;
707 }
708 uint32_t CodePoint = 0x10000
709 + ( ((wc & 0x3ff) << 10)
710 | (wc2 & 0x3ff));
711 if (RT_UNLIKELY(cch < 4))
712 {
713 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
714 rc = VERR_BUFFER_OVERFLOW;
715 break;
716 }
717 cch -= 4;
718 *pwch++ = 0xf0 | (CodePoint >> 18);
719 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
720 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
721 *pwch++ = 0x80 | (CodePoint & 0x3f);
722 }
723 }
724
725 /* done */
726 *pwch = '\0';
727 *pcch = (char *)pwch - psz;
728 return rc;
729}
730
731
732
733RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
734{
735 /*
736 * Validate input.
737 */
738 Assert(VALID_PTR(ppszString));
739 Assert(VALID_PTR(pwszString));
740 *ppszString = NULL;
741
742 /*
743 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
744 */
745 size_t cch;
746 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
747 if (RT_SUCCESS(rc))
748 {
749 /*
750 * Allocate buffer and recode it.
751 */
752 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
753 if (pszResult)
754 {
755 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
756 if (RT_SUCCESS(rc))
757 {
758 *ppszString = pszResult;
759 return rc;
760 }
761
762 RTMemFree(pszResult);
763 }
764 else
765 rc = VERR_NO_STR_MEMORY;
766 }
767 return rc;
768}
769RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
770
771
772RTDECL(int) RTUtf16BigToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
773{
774 /*
775 * Validate input.
776 */
777 Assert(VALID_PTR(ppszString));
778 Assert(VALID_PTR(pwszString));
779 *ppszString = NULL;
780
781 /*
782 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
783 */
784 size_t cch;
785 int rc = rtUtf16BigCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
786 if (RT_SUCCESS(rc))
787 {
788 /*
789 * Allocate buffer and recode it.
790 */
791 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
792 if (pszResult)
793 {
794 rc = rtUtf16BigRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
795 if (RT_SUCCESS(rc))
796 {
797 *ppszString = pszResult;
798 return rc;
799 }
800
801 RTMemFree(pszResult);
802 }
803 else
804 rc = VERR_NO_STR_MEMORY;
805 }
806 return rc;
807}
808RT_EXPORT_SYMBOL(RTUtf16BigToUtf8Tag);
809
810
811RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
812{
813 /*
814 * Validate input.
815 */
816 AssertPtr(pwszString);
817 AssertPtr(ppsz);
818 AssertPtrNull(pcch);
819
820 /*
821 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
822 */
823 size_t cchResult;
824 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
825 if (RT_SUCCESS(rc))
826 {
827 if (pcch)
828 *pcch = cchResult;
829
830 /*
831 * Check buffer size / Allocate buffer and recode it.
832 */
833 bool fShouldFree;
834 char *pszResult;
835 if (cch > 0 && *ppsz)
836 {
837 fShouldFree = false;
838 if (RT_UNLIKELY(cch <= cchResult))
839 return VERR_BUFFER_OVERFLOW;
840 pszResult = *ppsz;
841 }
842 else
843 {
844 *ppsz = NULL;
845 fShouldFree = true;
846 cch = RT_MAX(cch, cchResult + 1);
847 pszResult = (char *)RTStrAllocTag(cch, pszTag);
848 }
849 if (pszResult)
850 {
851 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
852 if (RT_SUCCESS(rc))
853 {
854 *ppsz = pszResult;
855 return rc;
856 }
857
858 if (fShouldFree)
859 RTStrFree(pszResult);
860 }
861 else
862 rc = VERR_NO_STR_MEMORY;
863 }
864 return rc;
865}
866RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
867
868
869RTDECL(int) RTUtf16BigToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
870{
871 /*
872 * Validate input.
873 */
874 AssertPtr(pwszString);
875 AssertPtr(ppsz);
876 AssertPtrNull(pcch);
877
878 /*
879 * Validate the UTF-16BE string and calculate the length of the UTF-8 encoding of it.
880 */
881 size_t cchResult;
882 int rc = rtUtf16BigCalcUtf8Length(pwszString, cwcString, &cchResult);
883 if (RT_SUCCESS(rc))
884 {
885 if (pcch)
886 *pcch = cchResult;
887
888 /*
889 * Check buffer size / Allocate buffer and recode it.
890 */
891 bool fShouldFree;
892 char *pszResult;
893 if (cch > 0 && *ppsz)
894 {
895 fShouldFree = false;
896 if (RT_UNLIKELY(cch <= cchResult))
897 return VERR_BUFFER_OVERFLOW;
898 pszResult = *ppsz;
899 }
900 else
901 {
902 *ppsz = NULL;
903 fShouldFree = true;
904 cch = RT_MAX(cch, cchResult + 1);
905 pszResult = (char *)RTStrAllocTag(cch, pszTag);
906 }
907 if (pszResult)
908 {
909 rc = rtUtf16BigRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
910 if (RT_SUCCESS(rc))
911 {
912 *ppsz = pszResult;
913 return rc;
914 }
915
916 if (fShouldFree)
917 RTStrFree(pszResult);
918 }
919 else
920 rc = VERR_NO_STR_MEMORY;
921 }
922 return rc;
923}
924RT_EXPORT_SYMBOL(RTUtf16BigToUtf8ExTag);
925
926
927RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
928{
929 size_t cch;
930 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
931 return RT_SUCCESS(rc) ? cch : 0;
932}
933RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
934
935
936RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
937{
938 size_t cch;
939 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
940 if (pcch)
941 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
942 return rc;
943}
944RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
945
946
947RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
948{
949 const RTUTF16 wc = *pwsz;
950
951 /* simple */
952 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
953 return wc;
954 if (wc < 0xfffe)
955 {
956 /* surrogate pair */
957 if (wc < 0xdc00)
958 {
959 const RTUTF16 wc2 = pwsz[1];
960 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
961 {
962 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
963 return uc;
964 }
965
966 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
967 }
968 else
969 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
970 }
971 else
972 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
973 return RTUNICP_INVALID;
974}
975RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
976
977
978RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
979{
980 const RTUTF16 wc = **ppwsz;
981
982 /* simple */
983 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
984 {
985 (*ppwsz)++;
986 *pCp = wc;
987 return VINF_SUCCESS;
988 }
989
990 int rc;
991 if (wc < 0xfffe)
992 {
993 /* surrogate pair */
994 if (wc < 0xdc00)
995 {
996 const RTUTF16 wc2 = (*ppwsz)[1];
997 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
998 {
999 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1000 *pCp = uc;
1001 (*ppwsz) += 2;
1002 return VINF_SUCCESS;
1003 }
1004
1005 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1006 }
1007 else
1008 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1009 rc = VERR_INVALID_UTF16_ENCODING;
1010 }
1011 else
1012 {
1013 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1014 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1015 }
1016 *pCp = RTUNICP_INVALID;
1017 (*ppwsz)++;
1018 return rc;
1019}
1020RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
1021
1022
1023RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1024{
1025 const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
1026
1027 /* simple */
1028 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1029 {
1030 (*ppwsz)++;
1031 *pCp = wc;
1032 return VINF_SUCCESS;
1033 }
1034
1035 int rc;
1036 if (wc < 0xfffe)
1037 {
1038 /* surrogate pair */
1039 if (wc < 0xdc00)
1040 {
1041 const RTUTF16 wc2 = RT_BE2H_U16((*ppwsz)[1]);
1042 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1043 {
1044 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1045 *pCp = uc;
1046 (*ppwsz) += 2;
1047 return VINF_SUCCESS;
1048 }
1049
1050 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1051 }
1052 else
1053 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1054 rc = VERR_INVALID_UTF16_ENCODING;
1055 }
1056 else
1057 {
1058 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1059 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1060 }
1061 *pCp = RTUNICP_INVALID;
1062 (*ppwsz)++;
1063 return rc;
1064}
1065RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
1066
1067
1068RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
1069{
1070 /* simple */
1071 if ( CodePoint < 0xd800
1072 || ( CodePoint > 0xdfff
1073 && CodePoint < 0xfffe))
1074 {
1075 *pwsz++ = (RTUTF16)CodePoint;
1076 return pwsz;
1077 }
1078
1079 /* surrogate pair */
1080 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
1081 {
1082 CodePoint -= 0x10000;
1083 *pwsz++ = 0xd800 | (CodePoint >> 10);
1084 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
1085 return pwsz;
1086 }
1087
1088 /* invalid code point. */
1089 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
1090 *pwsz++ = 0x7f;
1091 return pwsz;
1092}
1093RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
1094
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette