VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 50792

Last change on this file since 50792 was 50792, checked in by vboxsync, 11 years ago

IPRT: Added RTUtf16ValidateEncodingEx, RTUtf16ValidateEncoding and RTUtf16IsValidEncoding.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 33.6 KB
Line 
1/* $Id: utf-16.cpp 50792 2014-03-14 20:17:09Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41/**
42 * Get get length in code points of an UTF-16 encoded string, validating the
43 * string while doing so.
44 *
45 * @returns IPRT status code.
46 * @param pwsz Pointer to the UTF-16 string.
47 * @param cwc The max length of the string in UTF-16 units. Use
48 * RTSTR_MAX if all of the string is to be examined.
49 * @param pcuc Where to store the length in unicode code points.
50 * @param pcwcActual Where to store the actual size of the UTF-16 string
51 * on success. Optional.
52 */
53static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
54{
55 PCRTUTF16 pwszStart = pwsz;
56 size_t cCodePoints = 0;
57 while (cwc > 0)
58 {
59 RTUTF16 wc = *pwsz;
60 if (!wc)
61 break;
62 if (wc < 0xd800 || wc > 0xdfff)
63 {
64 cCodePoints++;
65 pwsz++;
66 cwc--;
67 }
68 /* Surrogate pair: */
69 else if (wc >= 0xdc00)
70 {
71 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
72 return VERR_INVALID_UTF16_ENCODING;
73 }
74 else if (cwc < 2)
75 {
76 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
77 return VERR_INVALID_UTF16_ENCODING;
78 }
79 else
80 {
81 RTUTF16 wcTrail = pwsz[1];
82 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
83 {
84 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
85 return VERR_INVALID_UTF16_ENCODING;
86 }
87
88 cCodePoints++;
89 pwsz += 2;
90 cwc -= 2;
91 }
92 }
93
94 /* done */
95 *pcuc = cCodePoints;
96 if (pcwcActual)
97 *pcwcActual = pwsz - pwszStart;
98 return VINF_SUCCESS;
99}
100
101
102RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
103{
104 if (pwszString)
105 RTMemTmpFree(pwszString);
106}
107RT_EXPORT_SYMBOL(RTUtf16Free);
108
109
110RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
111{
112 Assert(pwszString);
113 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
114 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
115 if (pwsz)
116 memcpy(pwsz, pwszString, cb);
117 return pwsz;
118}
119RT_EXPORT_SYMBOL(RTUtf16DupTag);
120
121
122RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
123{
124 Assert(pwszString);
125 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
126 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
127 if (pwsz)
128 {
129 memcpy(pwsz, pwszString, cb);
130 *ppwszString = pwsz;
131 return VINF_SUCCESS;
132 }
133 return VERR_NO_MEMORY;
134}
135RT_EXPORT_SYMBOL(RTUtf16DupExTag);
136
137
138RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
139{
140 if (!pwszString)
141 return 0;
142
143 PCRTUTF16 pwsz = pwszString;
144 while (*pwsz)
145 pwsz++;
146 return pwsz - pwszString;
147}
148RT_EXPORT_SYMBOL(RTUtf16Len);
149
150
151RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
152{
153 if (pwsz1 == pwsz2)
154 return 0;
155 if (!pwsz1)
156 return -1;
157 if (!pwsz2)
158 return 1;
159
160 for (;;)
161 {
162 register RTUTF16 wcs = *pwsz1;
163 register int iDiff = wcs - *pwsz2;
164 if (iDiff || !wcs)
165 return iDiff;
166 pwsz1++;
167 pwsz2++;
168 }
169}
170RT_EXPORT_SYMBOL(RTUtf16Cmp);
171
172
173RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
174{
175 if (pwsz1 == pwsz2)
176 return 0;
177 if (!pwsz1)
178 return -1;
179 if (!pwsz2)
180 return 1;
181
182 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
183 for (;;)
184 {
185 register RTUTF16 wc1 = *pwsz1;
186 register RTUTF16 wc2 = *pwsz2;
187 register int iDiff = wc1 - wc2;
188 if (iDiff)
189 {
190 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
191 if ( wc1 < 0xd800
192 || wc2 < 0xd800
193 || wc1 > 0xdfff
194 || wc2 > 0xdfff)
195 {
196 /* simple UCS-2 char */
197 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
198 if (iDiff)
199 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
200 }
201 else
202 {
203 /* a damned pair */
204 RTUNICP uc1;
205 RTUNICP uc2;
206 if (wc1 >= 0xdc00)
207 {
208 if (pwsz1Start == pwsz1)
209 return iDiff;
210 uc1 = pwsz1[-1];
211 if (uc1 < 0xd800 || uc1 >= 0xdc00)
212 return iDiff;
213 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
214 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
215 }
216 else
217 {
218 uc1 = *++pwsz1;
219 if (uc1 < 0xdc00 || uc1 >= 0xe000)
220 return iDiff;
221 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
222 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
223 }
224 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
225 if (iDiff)
226 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
227 }
228 if (iDiff)
229 return iDiff;
230 }
231 if (!wc1)
232 return 0;
233 pwsz1++;
234 pwsz2++;
235 }
236}
237RT_EXPORT_SYMBOL(RTUtf16ICmp);
238
239
240RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
241{
242 PRTUTF16 pwc = pwsz;
243 for (;;)
244 {
245 RTUTF16 wc = *pwc;
246 if (!wc)
247 break;
248 if (wc < 0xd800 || wc >= 0xdc00)
249 {
250 RTUNICP ucFolded = RTUniCpToLower(wc);
251 if (ucFolded < 0x10000)
252 *pwc++ = RTUniCpToLower(wc);
253 }
254 else
255 {
256 /* surrogate */
257 RTUTF16 wc2 = pwc[1];
258 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
259 {
260 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
261 RTUNICP ucFolded = RTUniCpToLower(uc);
262 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
263 {
264 uc -= 0x10000;
265 *pwc++ = 0xd800 | (uc >> 10);
266 *pwc++ = 0xdc00 | (uc & 0x3ff);
267 }
268 }
269 else /* invalid encoding. */
270 pwc++;
271 }
272 }
273 return pwsz;
274}
275RT_EXPORT_SYMBOL(RTUtf16ToLower);
276
277
278RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
279{
280 PRTUTF16 pwc = pwsz;
281 for (;;)
282 {
283 RTUTF16 wc = *pwc;
284 if (!wc)
285 break;
286 if (wc < 0xd800 || wc >= 0xdc00)
287 *pwc++ = RTUniCpToUpper(wc);
288 else
289 {
290 /* surrogate */
291 RTUTF16 wc2 = pwc[1];
292 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
293 {
294 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
295 RTUNICP ucFolded = RTUniCpToUpper(uc);
296 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
297 {
298 uc -= 0x10000;
299 *pwc++ = 0xd800 | (uc >> 10);
300 *pwc++ = 0xdc00 | (uc & 0x3ff);
301 }
302 }
303 else /* invalid encoding. */
304 pwc++;
305 }
306 }
307 return pwsz;
308}
309RT_EXPORT_SYMBOL(RTUtf16ToUpper);
310
311
312RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
313{
314 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
315}
316RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
317
318
319RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
320{
321 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
322 AssertPtr(pwsz);
323
324 /*
325 * Use rtUtf16Length for the job.
326 */
327 size_t cwcActual;
328 size_t cCpsIgnored;
329 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
330 if (RT_SUCCESS(rc))
331 {
332 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
333 && cwcActual >= cwc)
334 rc = VERR_BUFFER_OVERFLOW;
335 }
336 return rc;
337}
338RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
339
340
341RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
342{
343 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
344 return RT_SUCCESS(rc);
345}
346RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
347
348
349RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidSet, char chReplacement)
350{
351 size_t cReplacements = 0;
352 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
353 /* Validate the encoding. */
354 for (;;)
355 {
356 RTUNICP Cp;
357 PCRTUNICP pCp;
358 PRTUTF16 pwszOld = pwsz;
359 if (RT_FAILURE(RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp)))
360 return -1;
361 if (!Cp)
362 break;
363 for (pCp = puszValidSet; *pCp; pCp += 2)
364 {
365 AssertReturn(*(pCp + 1), -1);
366 if (*pCp <= Cp && *(pCp + 1) >= Cp) /* No, I won't do * and ++. */
367 break;
368 }
369 if (!*pCp)
370 {
371 for (; pwszOld != pwsz; ++pwszOld)
372 *pwszOld = chReplacement;
373 ++cReplacements;
374 }
375 }
376 return cReplacements;
377}
378RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
379
380
381/**
382 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
383 *
384 * @returns iprt status code.
385 * @param pwsz The UTF-16 string.
386 * @param cwc The max length of the UTF-16 string to consider.
387 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
388 */
389static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
390{
391 int rc = VINF_SUCCESS;
392 size_t cch = 0;
393 while (cwc > 0)
394 {
395 RTUTF16 wc = *pwsz++; cwc--;
396 if (!wc)
397 break;
398 else if (wc < 0xd800 || wc > 0xdfff)
399 {
400 if (wc < 0x80)
401 cch++;
402 else if (wc < 0x800)
403 cch += 2;
404 else if (wc < 0xfffe)
405 cch += 3;
406 else
407 {
408 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
409 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
410 break;
411 }
412 }
413 else
414 {
415 if (wc >= 0xdc00)
416 {
417 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
418 rc = VERR_INVALID_UTF16_ENCODING;
419 break;
420 }
421 if (cwc <= 0)
422 {
423 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
424 rc = VERR_INVALID_UTF16_ENCODING;
425 break;
426 }
427 wc = *pwsz++; cwc--;
428 if (wc < 0xdc00 || wc > 0xdfff)
429 {
430 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
431 rc = VERR_INVALID_UTF16_ENCODING;
432 break;
433 }
434 cch += 4;
435 }
436 }
437
438
439 /* done */
440 *pcch = cch;
441 return rc;
442}
443
444
445/**
446 * Recodes an valid UTF-16 string as UTF-8.
447 *
448 * @returns iprt status code.
449 * @param pwsz The UTF-16 string.
450 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
451 * will stop when cwc or '\\0' is reached.
452 * @param psz Where to store the UTF-8 string.
453 * @param cch The size of the UTF-8 buffer, excluding the terminator.
454 * @param pcch Where to store the number of octets actually encoded.
455 */
456static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
457{
458 unsigned char *pwch = (unsigned char *)psz;
459 int rc = VINF_SUCCESS;
460 while (cwc > 0)
461 {
462 RTUTF16 wc = *pwsz++; cwc--;
463 if (!wc)
464 break;
465 else if (wc < 0xd800 || wc > 0xdfff)
466 {
467 if (wc < 0x80)
468 {
469 if (RT_UNLIKELY(cch < 1))
470 {
471 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
472 rc = VERR_BUFFER_OVERFLOW;
473 break;
474 }
475 cch--;
476 *pwch++ = (unsigned char)wc;
477 }
478 else if (wc < 0x800)
479 {
480 if (RT_UNLIKELY(cch < 2))
481 {
482 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
483 rc = VERR_BUFFER_OVERFLOW;
484 break;
485 }
486 cch -= 2;
487 *pwch++ = 0xc0 | (wc >> 6);
488 *pwch++ = 0x80 | (wc & 0x3f);
489 }
490 else if (wc < 0xfffe)
491 {
492 if (RT_UNLIKELY(cch < 3))
493 {
494 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
495 rc = VERR_BUFFER_OVERFLOW;
496 break;
497 }
498 cch -= 3;
499 *pwch++ = 0xe0 | (wc >> 12);
500 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
501 *pwch++ = 0x80 | (wc & 0x3f);
502 }
503 else
504 {
505 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
506 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
507 break;
508 }
509 }
510 else
511 {
512 if (wc >= 0xdc00)
513 {
514 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
515 rc = VERR_INVALID_UTF16_ENCODING;
516 break;
517 }
518 if (cwc <= 0)
519 {
520 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
521 rc = VERR_INVALID_UTF16_ENCODING;
522 break;
523 }
524 RTUTF16 wc2 = *pwsz++; cwc--;
525 if (wc2 < 0xdc00 || wc2 > 0xdfff)
526 {
527 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
528 rc = VERR_INVALID_UTF16_ENCODING;
529 break;
530 }
531 uint32_t CodePoint = 0x10000
532 + ( ((wc & 0x3ff) << 10)
533 | (wc2 & 0x3ff));
534 if (RT_UNLIKELY(cch < 4))
535 {
536 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
537 rc = VERR_BUFFER_OVERFLOW;
538 break;
539 }
540 cch -= 4;
541 *pwch++ = 0xf0 | (CodePoint >> 18);
542 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
543 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
544 *pwch++ = 0x80 | (CodePoint & 0x3f);
545 }
546 }
547
548 /* done */
549 *pwch = '\0';
550 *pcch = (char *)pwch - psz;
551 return rc;
552}
553
554
555
556RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
557{
558 /*
559 * Validate input.
560 */
561 Assert(VALID_PTR(ppszString));
562 Assert(VALID_PTR(pwszString));
563 *ppszString = NULL;
564
565 /*
566 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
567 */
568 size_t cch;
569 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
570 if (RT_SUCCESS(rc))
571 {
572 /*
573 * Allocate buffer and recode it.
574 */
575 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
576 if (pszResult)
577 {
578 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
579 if (RT_SUCCESS(rc))
580 {
581 *ppszString = pszResult;
582 return rc;
583 }
584
585 RTMemFree(pszResult);
586 }
587 else
588 rc = VERR_NO_STR_MEMORY;
589 }
590 return rc;
591}
592RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
593
594
595RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
596{
597 /*
598 * Validate input.
599 */
600 Assert(VALID_PTR(pwszString));
601 Assert(VALID_PTR(ppsz));
602 Assert(!pcch || VALID_PTR(pcch));
603
604 /*
605 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
606 */
607 size_t cchResult;
608 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
609 if (RT_SUCCESS(rc))
610 {
611 if (pcch)
612 *pcch = cchResult;
613
614 /*
615 * Check buffer size / Allocate buffer and recode it.
616 */
617 bool fShouldFree;
618 char *pszResult;
619 if (cch > 0 && *ppsz)
620 {
621 fShouldFree = false;
622 if (RT_UNLIKELY(cch <= cchResult))
623 return VERR_BUFFER_OVERFLOW;
624 pszResult = *ppsz;
625 }
626 else
627 {
628 *ppsz = NULL;
629 fShouldFree = true;
630 cch = RT_MAX(cch, cchResult + 1);
631 pszResult = (char *)RTStrAllocTag(cch, pszTag);
632 }
633 if (pszResult)
634 {
635 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
636 if (RT_SUCCESS(rc))
637 {
638 *ppsz = pszResult;
639 return rc;
640 }
641
642 if (fShouldFree)
643 RTStrFree(pszResult);
644 }
645 else
646 rc = VERR_NO_STR_MEMORY;
647 }
648 return rc;
649}
650RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
651
652
653RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
654{
655 size_t cch;
656 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
657 return RT_SUCCESS(rc) ? cch : 0;
658}
659RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
660
661
662RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
663{
664 size_t cch;
665 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
666 if (pcch)
667 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
668 return rc;
669}
670RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
671
672
673RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
674{
675 const RTUTF16 wc = *pwsz;
676
677 /* simple */
678 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
679 return wc;
680 if (wc < 0xfffe)
681 {
682 /* surrogate pair */
683 if (wc < 0xdc00)
684 {
685 const RTUTF16 wc2 = pwsz[1];
686 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
687 {
688 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
689 return uc;
690 }
691
692 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
693 }
694 else
695 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
696 }
697 else
698 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
699 return RTUNICP_INVALID;
700}
701RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
702
703
704RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
705{
706 const RTUTF16 wc = **ppwsz;
707
708 /* simple */
709 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
710 {
711 (*ppwsz)++;
712 *pCp = wc;
713 return VINF_SUCCESS;
714 }
715
716 int rc;
717 if (wc < 0xfffe)
718 {
719 /* surrogate pair */
720 if (wc < 0xdc00)
721 {
722 const RTUTF16 wc2 = (*ppwsz)[1];
723 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
724 {
725 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
726 *pCp = uc;
727 (*ppwsz) += 2;
728 return VINF_SUCCESS;
729 }
730
731 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
732 }
733 else
734 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
735 rc = VERR_INVALID_UTF16_ENCODING;
736 }
737 else
738 {
739 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
740 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
741 }
742 *pCp = RTUNICP_INVALID;
743 (*ppwsz)++;
744 return rc;
745}
746RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
747
748
749RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
750{
751 /* simple */
752 if ( CodePoint < 0xd800
753 || ( CodePoint > 0xdfff
754 && CodePoint < 0xfffe))
755 {
756 *pwsz++ = (RTUTF16)CodePoint;
757 return pwsz;
758 }
759
760 /* surrogate pair */
761 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
762 {
763 CodePoint -= 0x10000;
764 *pwsz++ = 0xd800 | (CodePoint >> 10);
765 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
766 return pwsz;
767 }
768
769 /* invalid code point. */
770 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
771 *pwsz++ = 0x7f;
772 return pwsz;
773}
774RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
775
776
777/**
778 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
779 *
780 * @returns iprt status code.
781 * @param pwsz The UTF-16 string.
782 * @param cwc The max length of the UTF-16 string to consider.
783 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
784 */
785static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
786{
787 int rc = VINF_SUCCESS;
788 size_t cch = 0;
789 while (cwc > 0)
790 {
791 RTUTF16 wc = *pwsz++; cwc--;
792 if (!wc)
793 break;
794 else if (RT_LIKELY(wc < 0x100))
795 ++cch;
796 else
797 {
798 if (wc < 0xd800 || wc > 0xdfff)
799 {
800 if (wc >= 0xfffe)
801 {
802 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
803 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
804 break;
805 }
806 }
807 else
808 {
809 if (wc >= 0xdc00)
810 {
811 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
812 rc = VERR_INVALID_UTF16_ENCODING;
813 break;
814 }
815 if (cwc <= 0)
816 {
817 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
818 rc = VERR_INVALID_UTF16_ENCODING;
819 break;
820 }
821 wc = *pwsz++; cwc--;
822 if (wc < 0xdc00 || wc > 0xdfff)
823 {
824 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
825 rc = VERR_INVALID_UTF16_ENCODING;
826 break;
827 }
828 }
829
830 rc = VERR_NO_TRANSLATION;
831 break;
832 }
833 }
834
835 /* done */
836 *pcch = cch;
837 return rc;
838}
839
840
841/**
842 * Recodes an valid UTF-16 string as Latin1.
843 *
844 * @returns iprt status code.
845 * @param pwsz The UTF-16 string.
846 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
847 * will stop when cwc or '\\0' is reached.
848 * @param psz Where to store the Latin1 string.
849 * @param cch The size of the Latin1 buffer, excluding the terminator.
850 */
851static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
852{
853 unsigned char *pch = (unsigned char *)psz;
854 int rc = VINF_SUCCESS;
855 while (cwc > 0)
856 {
857 RTUTF16 wc = *pwsz++; cwc--;
858 if (!wc)
859 break;
860 if (RT_LIKELY(wc < 0x100))
861 {
862 if (RT_UNLIKELY(cch < 1))
863 {
864 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
865 rc = VERR_BUFFER_OVERFLOW;
866 break;
867 }
868 cch--;
869 *pch++ = (unsigned char)wc;
870 }
871 else
872 {
873 if (wc < 0xd800 || wc > 0xdfff)
874 {
875 if (wc >= 0xfffe)
876 {
877 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
878 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
879 break;
880 }
881 }
882 else
883 {
884 if (wc >= 0xdc00)
885 {
886 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
887 rc = VERR_INVALID_UTF16_ENCODING;
888 break;
889 }
890 if (cwc <= 0)
891 {
892 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
893 rc = VERR_INVALID_UTF16_ENCODING;
894 break;
895 }
896 RTUTF16 wc2 = *pwsz++; cwc--;
897 if (wc2 < 0xdc00 || wc2 > 0xdfff)
898 {
899 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
900 rc = VERR_INVALID_UTF16_ENCODING;
901 break;
902 }
903 }
904
905 rc = VERR_NO_TRANSLATION;
906 break;
907 }
908 }
909
910 /* done */
911 *pch = '\0';
912 return rc;
913}
914
915
916RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
917{
918 /*
919 * Validate input.
920 */
921 Assert(VALID_PTR(ppszString));
922 Assert(VALID_PTR(pwszString));
923 *ppszString = NULL;
924
925 /*
926 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
927 */
928 size_t cch;
929 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
930 if (RT_SUCCESS(rc))
931 {
932 /*
933 * Allocate buffer and recode it.
934 */
935 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
936 if (pszResult)
937 {
938 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
939 if (RT_SUCCESS(rc))
940 {
941 *ppszString = pszResult;
942 return rc;
943 }
944
945 RTMemFree(pszResult);
946 }
947 else
948 rc = VERR_NO_STR_MEMORY;
949 }
950 return rc;
951}
952RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag);
953
954
955RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
956{
957 /*
958 * Validate input.
959 */
960 AssertPtr(pwszString);
961 AssertPtr(ppsz);
962 AssertPtrNull(pcch);
963
964 /*
965 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
966 */
967 size_t cchResult;
968 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
969 if (RT_SUCCESS(rc))
970 {
971 if (pcch)
972 *pcch = cchResult;
973
974 /*
975 * Check buffer size / Allocate buffer and recode it.
976 */
977 bool fShouldFree;
978 char *pszResult;
979 if (cch > 0 && *ppsz)
980 {
981 fShouldFree = false;
982 if (cch <= cchResult)
983 return VERR_BUFFER_OVERFLOW;
984 pszResult = *ppsz;
985 }
986 else
987 {
988 *ppsz = NULL;
989 fShouldFree = true;
990 cch = RT_MAX(cch, cchResult + 1);
991 pszResult = (char *)RTMemAllocTag(cch, pszTag);
992 }
993 if (pszResult)
994 {
995 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
996 if (RT_SUCCESS(rc))
997 {
998 *ppsz = pszResult;
999 return rc;
1000 }
1001
1002 if (fShouldFree)
1003 RTMemFree(pszResult);
1004 }
1005 else
1006 rc = VERR_NO_STR_MEMORY;
1007 }
1008 return rc;
1009}
1010RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag);
1011
1012
1013RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
1014{
1015 size_t cch;
1016 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
1017 return RT_SUCCESS(rc) ? cch : 0;
1018}
1019RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
1020
1021
1022RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1023{
1024 size_t cch;
1025 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
1026 if (pcch)
1027 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1028 return rc;
1029}
1030RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
1031
1032
1033/**
1034 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
1035 * original length, but the function saves us nasty comments to that effect
1036 * all over the place.
1037 *
1038 * @returns IPRT status code.
1039 * @param psz Pointer to the Latin1 string.
1040 * @param cch The max length of the string. (btw cch = cb)
1041 * Use RTSTR_MAX if all of the string is to be examined.s
1042 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
1043 */
1044static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
1045{
1046 *pcwc = RTStrNLen(psz, cch);
1047 return VINF_SUCCESS;
1048}
1049
1050
1051/**
1052 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
1053 * sixteen bits, as Unicode is a superset of Latin1.
1054 *
1055 * Since we know the input is valid, we do *not* perform length checks.
1056 *
1057 * @returns iprt status code.
1058 * @param psz The Latin1 string to recode.
1059 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
1060 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1061 * @param pwsz Where to store the UTF-16 string.
1062 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
1063 */
1064static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
1065{
1066 int rc = VINF_SUCCESS;
1067 const unsigned char *puch = (const unsigned char *)psz;
1068 PRTUTF16 pwc = pwsz;
1069 while (cch-- > 0)
1070 {
1071 /* read the next char and check for terminator. */
1072 const unsigned char uch = *puch;
1073 if (!uch)
1074 break;
1075
1076 /* check for output overflow */
1077 if (RT_UNLIKELY(cwc < 1))
1078 {
1079 rc = VERR_BUFFER_OVERFLOW;
1080 break;
1081 }
1082
1083 /* expand the code point */
1084 *pwc++ = uch;
1085 cwc--;
1086 puch++;
1087 }
1088
1089 /* done */
1090 *pwc = '\0';
1091 return rc;
1092}
1093
1094
1095RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1096{
1097 /*
1098 * Validate input.
1099 */
1100 Assert(VALID_PTR(ppwszString));
1101 Assert(VALID_PTR(pszString));
1102 *ppwszString = NULL;
1103
1104 /*
1105 * Validate the input and calculate the length of the UTF-16 string.
1106 */
1107 size_t cwc;
1108 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
1109 if (RT_SUCCESS(rc))
1110 {
1111 /*
1112 * Allocate buffer.
1113 */
1114 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1115 if (pwsz)
1116 {
1117 /*
1118 * Encode the UTF-16 string.
1119 */
1120 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1121 if (RT_SUCCESS(rc))
1122 {
1123 *ppwszString = pwsz;
1124 return rc;
1125 }
1126 RTMemFree(pwsz);
1127 }
1128 else
1129 rc = VERR_NO_UTF16_MEMORY;
1130 }
1131 return rc;
1132}
1133RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag);
1134
1135
1136RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString,
1137 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1138{
1139 /*
1140 * Validate input.
1141 */
1142 Assert(VALID_PTR(pszString));
1143 Assert(VALID_PTR(ppwsz));
1144 Assert(!pcwc || VALID_PTR(pcwc));
1145
1146 /*
1147 * Validate the input and calculate the length of the UTF-16 string.
1148 */
1149 size_t cwcResult;
1150 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1151 if (RT_SUCCESS(rc))
1152 {
1153 if (pcwc)
1154 *pcwc = cwcResult;
1155
1156 /*
1157 * Check buffer size / Allocate buffer.
1158 */
1159 bool fShouldFree;
1160 PRTUTF16 pwszResult;
1161 if (cwc > 0 && *ppwsz)
1162 {
1163 fShouldFree = false;
1164 if (cwc <= cwcResult)
1165 return VERR_BUFFER_OVERFLOW;
1166 pwszResult = *ppwsz;
1167 }
1168 else
1169 {
1170 *ppwsz = NULL;
1171 fShouldFree = true;
1172 cwc = RT_MAX(cwcResult + 1, cwc);
1173 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1174 }
1175 if (pwszResult)
1176 {
1177 /*
1178 * Encode the UTF-16 string.
1179 */
1180 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1181 if (RT_SUCCESS(rc))
1182 {
1183 *ppwsz = pwszResult;
1184 return rc;
1185 }
1186 if (fShouldFree)
1187 RTMemFree(pwszResult);
1188 }
1189 else
1190 rc = VERR_NO_UTF16_MEMORY;
1191 }
1192 return rc;
1193}
1194RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag);
1195
1196
1197RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1198{
1199 size_t cwc;
1200 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1201 return RT_SUCCESS(rc) ? cwc : 0;
1202}
1203RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1204
1205
1206RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1207{
1208 size_t cwc;
1209 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1210 if (pcwc)
1211 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1212 return rc;
1213}
1214RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette