VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 25240

Last change on this file since 25240 was 21740, checked in by vboxsync, 15 years ago

IPRT: Some latin-1 cleanup and some predictions in the recoding loops.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 29.8 KB
Line 
1/* $Id: utf-16.cpp 21740 2009-07-21 12:06:38Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include "internal/iprt.h"
37
38#include <iprt/uni.h>
39#include <iprt/alloc.h>
40#include <iprt/assert.h>
41#include <iprt/err.h>
42#include "internal/string.h"
43
44
45
46RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
47{
48 if (pwszString)
49 RTMemTmpFree(pwszString);
50}
51RT_EXPORT_SYMBOL(RTUtf16Free);
52
53
54RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
55{
56 Assert(pwszString);
57 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
58 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
59 if (pwsz)
60 memcpy(pwsz, pwszString, cb);
61 return pwsz;
62}
63RT_EXPORT_SYMBOL(RTUtf16Dup);
64
65
66RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
67{
68 Assert(pwszString);
69 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
70 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
71 if (pwsz)
72 {
73 memcpy(pwsz, pwszString, cb);
74 *ppwszString = pwsz;
75 return VINF_SUCCESS;
76 }
77 return VERR_NO_MEMORY;
78}
79RT_EXPORT_SYMBOL(RTUtf16DupEx);
80
81
82RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
83{
84 if (!pwszString)
85 return 0;
86
87 PCRTUTF16 pwsz = pwszString;
88 while (*pwsz)
89 pwsz++;
90 return pwsz - pwszString;
91}
92RT_EXPORT_SYMBOL(RTUtf16Len);
93
94
95RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
96{
97 if (pwsz1 == pwsz2)
98 return 0;
99 if (!pwsz1)
100 return -1;
101 if (!pwsz2)
102 return 1;
103
104 for (;;)
105 {
106 register RTUTF16 wcs = *pwsz1;
107 register int iDiff = wcs - *pwsz2;
108 if (iDiff || !wcs)
109 return iDiff;
110 pwsz1++;
111 pwsz2++;
112 }
113}
114RT_EXPORT_SYMBOL(RTUtf16Cmp);
115
116
117RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
118{
119 if (pwsz1 == pwsz2)
120 return 0;
121 if (!pwsz1)
122 return -1;
123 if (!pwsz2)
124 return 1;
125
126 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
127 for (;;)
128 {
129 register RTUTF16 wc1 = *pwsz1;
130 register RTUTF16 wc2 = *pwsz2;
131 register int iDiff = wc1 - wc2;
132 if (iDiff)
133 {
134 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
135 if ( wc1 < 0xd800
136 || wc2 < 0xd800
137 || wc1 > 0xdfff
138 || wc2 > 0xdfff)
139 {
140 /* simple UCS-2 char */
141 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
142 if (iDiff)
143 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
144 }
145 else
146 {
147 /* a damned pair */
148 RTUNICP uc1;
149 RTUNICP uc2;
150 if (wc1 >= 0xdc00)
151 {
152 if (pwsz1Start == pwsz1)
153 return iDiff;
154 uc1 = pwsz1[-1];
155 if (uc1 < 0xd800 || uc1 >= 0xdc00)
156 return iDiff;
157 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
158 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
159 }
160 else
161 {
162 uc1 = *++pwsz1;
163 if (uc1 < 0xdc00 || uc1 >= 0xe000)
164 return iDiff;
165 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
166 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
167 }
168 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
169 if (iDiff)
170 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
171 }
172 if (iDiff)
173 return iDiff;
174 }
175 if (!wc1)
176 return 0;
177 pwsz1++;
178 pwsz2++;
179 }
180}
181RT_EXPORT_SYMBOL(RTUtf16ICmp);
182
183
184RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
185{
186 PRTUTF16 pwc = pwsz;
187 for (;;)
188 {
189 RTUTF16 wc = *pwc;
190 if (!wc)
191 break;
192 if (wc < 0xd800 || wc >= 0xdc00)
193 {
194 RTUNICP ucFolded = RTUniCpToLower(wc);
195 if (ucFolded < 0x10000)
196 *pwc++ = RTUniCpToLower(wc);
197 }
198 else
199 {
200 /* surrogate */
201 RTUTF16 wc2 = pwc[1];
202 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
203 {
204 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
205 RTUNICP ucFolded = RTUniCpToLower(uc);
206 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
207 {
208 uc -= 0x10000;
209 *pwc++ = 0xd800 | (uc >> 10);
210 *pwc++ = 0xdc00 | (uc & 0x3ff);
211 }
212 }
213 else /* invalid encoding. */
214 pwc++;
215 }
216 }
217 return pwsz;
218}
219RT_EXPORT_SYMBOL(RTUtf16ToLower);
220
221
222RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
223{
224 PRTUTF16 pwc = pwsz;
225 for (;;)
226 {
227 RTUTF16 wc = *pwc;
228 if (!wc)
229 break;
230 if (wc < 0xd800 || wc >= 0xdc00)
231 *pwc++ = RTUniCpToUpper(wc);
232 else
233 {
234 /* surrogate */
235 RTUTF16 wc2 = pwc[1];
236 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
237 {
238 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
239 RTUNICP ucFolded = RTUniCpToUpper(uc);
240 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
241 {
242 uc -= 0x10000;
243 *pwc++ = 0xd800 | (uc >> 10);
244 *pwc++ = 0xdc00 | (uc & 0x3ff);
245 }
246 }
247 else /* invalid encoding. */
248 pwc++;
249 }
250 }
251 return pwsz;
252}
253RT_EXPORT_SYMBOL(RTUtf16ToUpper);
254
255
256/**
257 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
258 *
259 * @returns iprt status code.
260 * @param pwsz The UTF-16 string.
261 * @param cwc The max length of the UTF-16 string to consider.
262 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
263 */
264static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
265{
266 int rc = VINF_SUCCESS;
267 size_t cch = 0;
268 while (cwc > 0)
269 {
270 RTUTF16 wc = *pwsz++; cwc--;
271 if (!wc)
272 break;
273 else if (wc < 0xd800 || wc > 0xdfff)
274 {
275 if (wc < 0x80)
276 cch++;
277 else if (wc < 0x800)
278 cch += 2;
279 else if (wc < 0xfffe)
280 cch += 3;
281 else
282 {
283 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
284 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
285 break;
286 }
287 }
288 else
289 {
290 if (wc >= 0xdc00)
291 {
292 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
293 rc = VERR_INVALID_UTF16_ENCODING;
294 break;
295 }
296 if (cwc <= 0)
297 {
298 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
299 rc = VERR_INVALID_UTF16_ENCODING;
300 break;
301 }
302 wc = *pwsz++; cwc--;
303 if (wc < 0xdc00 || wc > 0xdfff)
304 {
305 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
306 rc = VERR_INVALID_UTF16_ENCODING;
307 break;
308 }
309 cch += 4;
310 }
311 }
312
313
314 /* done */
315 *pcch = cch;
316 return rc;
317}
318
319
320/**
321 * Recodes an valid UTF-16 string as UTF-8.
322 *
323 * @returns iprt status code.
324 * @param pwsz The UTF-16 string.
325 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
326 * will stop when cwc or '\\0' is reached.
327 * @param psz Where to store the UTF-8 string.
328 * @param cch The size of the UTF-8 buffer, excluding the terminator.
329 * @param pcch Where to store the number of octets actually encoded.
330 */
331static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
332{
333 unsigned char *pwch = (unsigned char *)psz;
334 int rc = VINF_SUCCESS;
335 while (cwc > 0)
336 {
337 RTUTF16 wc = *pwsz++; cwc--;
338 if (!wc)
339 break;
340 else if (wc < 0xd800 || wc > 0xdfff)
341 {
342 if (wc < 0x80)
343 {
344 if (RT_UNLIKELY(cch < 1))
345 {
346 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
347 rc = VERR_BUFFER_OVERFLOW;
348 break;
349 }
350 cch--;
351 *pwch++ = (unsigned char)wc;
352 }
353 else if (wc < 0x800)
354 {
355 if (RT_UNLIKELY(cch < 2))
356 {
357 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
358 rc = VERR_BUFFER_OVERFLOW;
359 break;
360 }
361 cch -= 2;
362 *pwch++ = 0xc0 | (wc >> 6);
363 *pwch++ = 0x80 | (wc & 0x3f);
364 }
365 else if (wc < 0xfffe)
366 {
367 if (RT_UNLIKELY(cch < 3))
368 {
369 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
370 rc = VERR_BUFFER_OVERFLOW;
371 break;
372 }
373 cch -= 3;
374 *pwch++ = 0xe0 | (wc >> 12);
375 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
376 *pwch++ = 0x80 | (wc & 0x3f);
377 }
378 else
379 {
380 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
381 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
382 break;
383 }
384 }
385 else
386 {
387 if (wc >= 0xdc00)
388 {
389 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
390 rc = VERR_INVALID_UTF16_ENCODING;
391 break;
392 }
393 if (cwc <= 0)
394 {
395 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
396 rc = VERR_INVALID_UTF16_ENCODING;
397 break;
398 }
399 RTUTF16 wc2 = *pwsz++; cwc--;
400 if (wc2 < 0xdc00 || wc2 > 0xdfff)
401 {
402 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
403 rc = VERR_INVALID_UTF16_ENCODING;
404 break;
405 }
406 uint32_t CodePoint = 0x10000
407 + ( ((wc & 0x3ff) << 10)
408 | (wc2 & 0x3ff));
409 if (RT_UNLIKELY(cch < 4))
410 {
411 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
412 rc = VERR_BUFFER_OVERFLOW;
413 break;
414 }
415 cch -= 4;
416 *pwch++ = 0xf0 | (CodePoint >> 18);
417 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
418 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
419 *pwch++ = 0x80 | (CodePoint & 0x3f);
420 }
421 }
422
423 /* done */
424 *pwch = '\0';
425 *pcch = (char *)pwch - psz;
426 return rc;
427}
428
429
430
431RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
432{
433 /*
434 * Validate input.
435 */
436 Assert(VALID_PTR(ppszString));
437 Assert(VALID_PTR(pwszString));
438 *ppszString = NULL;
439
440 /*
441 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
442 */
443 size_t cch;
444 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
445 if (RT_SUCCESS(rc))
446 {
447 /*
448 * Allocate buffer and recode it.
449 */
450 char *pszResult = (char *)RTMemAlloc(cch + 1);
451 if (pszResult)
452 {
453 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
454 if (RT_SUCCESS(rc))
455 {
456 *ppszString = pszResult;
457 return rc;
458 }
459
460 RTMemFree(pszResult);
461 }
462 else
463 rc = VERR_NO_STR_MEMORY;
464 }
465 return rc;
466}
467RT_EXPORT_SYMBOL(RTUtf16ToUtf8);
468
469
470RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
471{
472 /*
473 * Validate input.
474 */
475 Assert(VALID_PTR(pwszString));
476 Assert(VALID_PTR(ppsz));
477 Assert(!pcch || VALID_PTR(pcch));
478
479 /*
480 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
481 */
482 size_t cchResult;
483 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
484 if (RT_SUCCESS(rc))
485 {
486 if (pcch)
487 *pcch = cchResult;
488
489 /*
490 * Check buffer size / Allocate buffer and recode it.
491 */
492 bool fShouldFree;
493 char *pszResult;
494 if (cch > 0 && *ppsz)
495 {
496 fShouldFree = false;
497 if (RT_UNLIKELY(cch <= cchResult))
498 return VERR_BUFFER_OVERFLOW;
499 pszResult = *ppsz;
500 }
501 else
502 {
503 *ppsz = NULL;
504 fShouldFree = true;
505 cch = RT_MAX(cch, cchResult + 1);
506 pszResult = (char *)RTMemAlloc(cch);
507 }
508 if (pszResult)
509 {
510 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
511 if (RT_SUCCESS(rc))
512 {
513 *ppsz = pszResult;
514 return rc;
515 }
516
517 if (fShouldFree)
518 RTMemFree(pszResult);
519 }
520 else
521 rc = VERR_NO_STR_MEMORY;
522 }
523 return rc;
524}
525RT_EXPORT_SYMBOL(RTUtf16ToUtf8Ex);
526
527
528RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
529{
530 size_t cch;
531 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
532 return RT_SUCCESS(rc) ? cch : 0;
533}
534RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
535
536
537RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
538{
539 size_t cch;
540 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
541 if (pcch)
542 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
543 return rc;
544}
545RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
546
547
548RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
549{
550 const RTUTF16 wc = *pwsz;
551
552 /* simple */
553 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
554 return wc;
555 if (wc < 0xfffe)
556 {
557 /* surrogate pair */
558 if (wc < 0xdc00)
559 {
560 const RTUTF16 wc2 = pwsz[1];
561 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
562 {
563 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
564 return uc;
565 }
566
567 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
568 }
569 else
570 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
571 }
572 else
573 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
574 return RTUNICP_INVALID;
575}
576RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
577
578
579RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
580{
581 const RTUTF16 wc = **ppwsz;
582
583 /* simple */
584 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
585 {
586 (*ppwsz)++;
587 *pCp = wc;
588 return VINF_SUCCESS;
589 }
590
591 int rc;
592 if (wc < 0xfffe)
593 {
594 /* surrogate pair */
595 if (wc < 0xdc00)
596 {
597 const RTUTF16 wc2 = (*ppwsz)[1];
598 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
599 {
600 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
601 *pCp = uc;
602 (*ppwsz) += 2;
603 return VINF_SUCCESS;
604 }
605
606 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
607 }
608 else
609 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
610 rc = VERR_INVALID_UTF16_ENCODING;
611 }
612 else
613 {
614 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
615 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
616 }
617 *pCp = RTUNICP_INVALID;
618 (*ppwsz)++;
619 return rc;
620}
621RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
622
623
624RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
625{
626 /* simple */
627 if ( CodePoint < 0xd800
628 || ( CodePoint > 0xdfff
629 && CodePoint < 0xfffe))
630 {
631 *pwsz++ = (RTUTF16)CodePoint;
632 return pwsz;
633 }
634
635 /* surrogate pair */
636 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
637 {
638 CodePoint -= 0x10000;
639 *pwsz++ = 0xd800 | (CodePoint >> 10);
640 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
641 return pwsz;
642 }
643
644 /* invalid code point. */
645 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
646 *pwsz++ = 0x7f;
647 return pwsz;
648}
649RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
650
651
652/**
653 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
654 *
655 * @returns iprt status code.
656 * @param pwsz The UTF-16 string.
657 * @param cwc The max length of the UTF-16 string to consider.
658 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
659 */
660static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
661{
662 int rc = VINF_SUCCESS;
663 size_t cch = 0;
664 while (cwc > 0)
665 {
666 RTUTF16 wc = *pwsz++; cwc--;
667 if (!wc)
668 break;
669 else if (RT_LIKELY(wc < 0x100))
670 ++cch;
671 else
672 {
673 if (wc < 0xd800 || wc > 0xdfff)
674 {
675 if (wc >= 0xfffe)
676 {
677 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
678 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
679 break;
680 }
681 }
682 else
683 {
684 if (wc >= 0xdc00)
685 {
686 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
687 rc = VERR_INVALID_UTF16_ENCODING;
688 break;
689 }
690 if (cwc <= 0)
691 {
692 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
693 rc = VERR_INVALID_UTF16_ENCODING;
694 break;
695 }
696 wc = *pwsz++; cwc--;
697 if (wc < 0xdc00 || wc > 0xdfff)
698 {
699 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
700 rc = VERR_INVALID_UTF16_ENCODING;
701 break;
702 }
703 }
704
705 rc = VERR_NO_TRANSLATION;
706 break;
707 }
708 }
709
710 /* done */
711 *pcch = cch;
712 return rc;
713}
714
715
716/**
717 * Recodes an valid UTF-16 string as Latin1.
718 *
719 * @returns iprt status code.
720 * @param pwsz The UTF-16 string.
721 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
722 * will stop when cwc or '\\0' is reached.
723 * @param psz Where to store the Latin1 string.
724 * @param cch The size of the Latin1 buffer, excluding the terminator.
725 */
726static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
727{
728 unsigned char *pch = (unsigned char *)psz;
729 int rc = VINF_SUCCESS;
730 while (cwc > 0)
731 {
732 RTUTF16 wc = *pwsz++; cwc--;
733 if (!wc)
734 break;
735 if (RT_LIKELY(wc < 0x100))
736 {
737 if (RT_UNLIKELY(cch < 1))
738 {
739 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
740 rc = VERR_BUFFER_OVERFLOW;
741 break;
742 }
743 cch--;
744 *pch++ = (unsigned char)wc;
745 }
746 else
747 {
748 if (wc < 0xd800 || wc > 0xdfff)
749 {
750 if (wc >= 0xfffe)
751 {
752 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
753 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
754 break;
755 }
756 }
757 else
758 {
759 if (wc >= 0xdc00)
760 {
761 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
762 rc = VERR_INVALID_UTF16_ENCODING;
763 break;
764 }
765 if (cwc <= 0)
766 {
767 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
768 rc = VERR_INVALID_UTF16_ENCODING;
769 break;
770 }
771 RTUTF16 wc2 = *pwsz++; cwc--;
772 if (wc2 < 0xdc00 || wc2 > 0xdfff)
773 {
774 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
775 rc = VERR_INVALID_UTF16_ENCODING;
776 break;
777 }
778 }
779
780 rc = VERR_NO_TRANSLATION;
781 break;
782 }
783 }
784
785 /* done */
786 *pch = '\0';
787 return rc;
788}
789
790
791RTDECL(int) RTUtf16ToLatin1(PCRTUTF16 pwszString, char **ppszString)
792{
793 /*
794 * Validate input.
795 */
796 Assert(VALID_PTR(ppszString));
797 Assert(VALID_PTR(pwszString));
798 *ppszString = NULL;
799
800 /*
801 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
802 */
803 size_t cch;
804 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
805 if (RT_SUCCESS(rc))
806 {
807 /*
808 * Allocate buffer and recode it.
809 */
810 char *pszResult = (char *)RTMemAlloc(cch + 1);
811 if (pszResult)
812 {
813 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
814 if (RT_SUCCESS(rc))
815 {
816 *ppszString = pszResult;
817 return rc;
818 }
819
820 RTMemFree(pszResult);
821 }
822 else
823 rc = VERR_NO_STR_MEMORY;
824 }
825 return rc;
826}
827RT_EXPORT_SYMBOL(RTUtf16ToLatin1);
828
829
830RTDECL(int) RTUtf16ToLatin1Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
831{
832 /*
833 * Validate input.
834 */
835 AssertPtr(pwszString);
836 AssertPtr(ppsz);
837 AssertPtrNull(pcch);
838
839 /*
840 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
841 */
842 size_t cchResult;
843 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
844 if (RT_SUCCESS(rc))
845 {
846 if (pcch)
847 *pcch = cchResult;
848
849 /*
850 * Check buffer size / Allocate buffer and recode it.
851 */
852 bool fShouldFree;
853 char *pszResult;
854 if (cch > 0 && *ppsz)
855 {
856 fShouldFree = false;
857 if (cch <= cchResult)
858 return VERR_BUFFER_OVERFLOW;
859 pszResult = *ppsz;
860 }
861 else
862 {
863 *ppsz = NULL;
864 fShouldFree = true;
865 cch = RT_MAX(cch, cchResult + 1);
866 pszResult = (char *)RTMemAlloc(cch);
867 }
868 if (pszResult)
869 {
870 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
871 if (RT_SUCCESS(rc))
872 {
873 *ppsz = pszResult;
874 return rc;
875 }
876
877 if (fShouldFree)
878 RTMemFree(pszResult);
879 }
880 else
881 rc = VERR_NO_STR_MEMORY;
882 }
883 return rc;
884}
885RT_EXPORT_SYMBOL(RTUtf16ToLatin1Ex);
886
887
888RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
889{
890 size_t cch;
891 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
892 return RT_SUCCESS(rc) ? cch : 0;
893}
894RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
895
896
897RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
898{
899 size_t cch;
900 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
901 if (pcch)
902 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
903 return rc;
904}
905RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
906
907
908/**
909 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
910 * original length, but the function saves us nasty comments to that effect
911 * all over the place.
912 *
913 * @returns IPRT status code.
914 * @param psz Pointer to the Latin1 string.
915 * @param cch The max length of the string. (btw cch = cb)
916 * Use RTSTR_MAX if all of the string is to be examined.s
917 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
918 */
919static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
920{
921 *pcwc = RTStrNLen(psz, cch);
922 return VINF_SUCCESS;
923}
924
925
926/**
927 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
928 * sixteen bits, as Unicode is a superset of Latin1.
929 *
930 * Since we know the input is valid, we do *not* perform length checks.
931 *
932 * @returns iprt status code.
933 * @param psz The Latin1 string to recode.
934 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
935 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
936 * @param pwsz Where to store the UTF-16 string.
937 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
938 */
939static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
940{
941 int rc = VINF_SUCCESS;
942 const unsigned char *puch = (const unsigned char *)psz;
943 PRTUTF16 pwc = pwsz;
944 while (cch-- > 0)
945 {
946 /* read the next char and check for terminator. */
947 const unsigned char uch = *puch;
948 if (!uch)
949 break;
950
951 /* check for output overflow */
952 if (RT_UNLIKELY(cwc < 1))
953 {
954 rc = VERR_BUFFER_OVERFLOW;
955 break;
956 }
957
958 /* expand the code point */
959 *pwc++ = uch;
960 cwc--;
961 puch++;
962 }
963
964 /* done */
965 *pwc = '\0';
966 return rc;
967}
968
969
970RTDECL(int) RTLatin1ToUtf16(const char *pszString, PRTUTF16 *ppwszString)
971{
972 /*
973 * Validate input.
974 */
975 Assert(VALID_PTR(ppwszString));
976 Assert(VALID_PTR(pszString));
977 *ppwszString = NULL;
978
979 /*
980 * Validate the input and calculate the length of the UTF-16 string.
981 */
982 size_t cwc;
983 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
984 if (RT_SUCCESS(rc))
985 {
986 /*
987 * Allocate buffer.
988 */
989 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
990 if (pwsz)
991 {
992 /*
993 * Encode the UTF-16 string.
994 */
995 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
996 if (RT_SUCCESS(rc))
997 {
998 *ppwszString = pwsz;
999 return rc;
1000 }
1001 RTMemFree(pwsz);
1002 }
1003 else
1004 rc = VERR_NO_UTF16_MEMORY;
1005 }
1006 return rc;
1007}
1008RT_EXPORT_SYMBOL(RTLatin1ToUtf16);
1009
1010
1011RTDECL(int) RTLatin1ToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
1012{
1013 /*
1014 * Validate input.
1015 */
1016 Assert(VALID_PTR(pszString));
1017 Assert(VALID_PTR(ppwsz));
1018 Assert(!pcwc || VALID_PTR(pcwc));
1019
1020 /*
1021 * Validate the input and calculate the length of the UTF-16 string.
1022 */
1023 size_t cwcResult;
1024 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1025 if (RT_SUCCESS(rc))
1026 {
1027 if (pcwc)
1028 *pcwc = cwcResult;
1029
1030 /*
1031 * Check buffer size / Allocate buffer.
1032 */
1033 bool fShouldFree;
1034 PRTUTF16 pwszResult;
1035 if (cwc > 0 && *ppwsz)
1036 {
1037 fShouldFree = false;
1038 if (cwc <= cwcResult)
1039 return VERR_BUFFER_OVERFLOW;
1040 pwszResult = *ppwsz;
1041 }
1042 else
1043 {
1044 *ppwsz = NULL;
1045 fShouldFree = true;
1046 cwc = RT_MAX(cwcResult + 1, cwc);
1047 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
1048 }
1049 if (pwszResult)
1050 {
1051 /*
1052 * Encode the UTF-16 string.
1053 */
1054 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1055 if (RT_SUCCESS(rc))
1056 {
1057 *ppwsz = pwszResult;
1058 return rc;
1059 }
1060 if (fShouldFree)
1061 RTMemFree(pwszResult);
1062 }
1063 else
1064 rc = VERR_NO_UTF16_MEMORY;
1065 }
1066 return rc;
1067}
1068RT_EXPORT_SYMBOL(RTLatin1ToUtf16Ex);
1069
1070
1071RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1072{
1073 size_t cwc;
1074 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1075 return RT_SUCCESS(rc) ? cwc : 0;
1076}
1077RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1078
1079
1080RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1081{
1082 size_t cwc;
1083 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1084 if (pcwc)
1085 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1086 return rc;
1087}
1088RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette