VirtualBox

source: vbox/trunk/src/VBox/Runtime/utf-16.cpp@ 3672

Last change on this file since 3672 was 3160, checked in by vboxsync, 17 years ago

rtUtf16RecodeAsUtf8 takes a cch excluding the terminator. removed dead code.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 16.6 KB
Line 
1/* $Id: utf-16.cpp 3160 2007-06-19 17:26:15Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-16
4 */
5
6/*
7 * Copyright (C) 2006-2007 innotek GmbH
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License as published by the Free Software Foundation,
13 * in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
14 * distribution. VirtualBox OSE is distributed in the hope that it will
15 * be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * If you received this file as part of a commercial VirtualBox
18 * distribution, then only the terms of your commercial VirtualBox
19 * license agreement apply instead of the previous paragraph.
20 */
21
22
23/*******************************************************************************
24* Header Files *
25*******************************************************************************/
26#include <iprt/string.h>
27#include <iprt/uni.h>
28#include <iprt/alloc.h>
29#include <iprt/assert.h>
30#include <iprt/err.h>
31#include "internal/string.h"
32
33
34
35RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
36{
37 if (pwszString)
38 RTMemTmpFree(pwszString);
39}
40
41
42RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
43{
44 Assert(pwszString);
45 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
46 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
47 if (pwsz)
48 memcpy(pwsz, pwszString, cb);
49 return pwsz;
50}
51
52
53RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
54{
55 Assert(pwszString);
56 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
57 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
58 if (pwsz)
59 {
60 memcpy(pwsz, pwszString, cb);
61 *ppwszString = pwsz;
62 return VINF_SUCCESS;
63 }
64 return VERR_NO_MEMORY;
65}
66
67
68RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
69{
70 if (!pwszString)
71 return 0;
72
73 PCRTUTF16 pwsz = pwszString;
74 while (*pwsz)
75 pwsz++;
76 return pwsz - pwszString;
77}
78
79
80RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
81{
82 if (pwsz1 == pwsz2)
83 return 0;
84 if (!pwsz1)
85 return -1;
86 if (!pwsz2)
87 return 1;
88
89 for (;;)
90 {
91 register RTUTF16 wcs = *pwsz1;
92 register int iDiff = wcs - *pwsz2;
93 if (iDiff || !wcs)
94 return iDiff;
95 pwsz1++;
96 pwsz2++;
97 }
98}
99
100
101RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
102{
103 if (pwsz1 == pwsz2)
104 return 0;
105 if (!pwsz1)
106 return -1;
107 if (!pwsz2)
108 return 1;
109
110 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
111 for (;;)
112 {
113 register RTUTF16 wc1 = *pwsz1;
114 register RTUTF16 wc2 = *pwsz2;
115 register int iDiff = wc1 - wc2;
116 if (iDiff)
117 {
118 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
119 if ( wc1 < 0xd800
120 || wc2 < 0xd800
121 || wc1 > 0xdfff
122 || wc2 > 0xdfff)
123 {
124 /* simple UCS-2 char */
125 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
126 if (iDiff)
127 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
128 }
129 else
130 {
131 /* a damned pair */
132 RTUNICP uc1;
133 RTUNICP uc2;
134 if (wc1 >= 0xdc00)
135 {
136 if (pwsz1Start == pwsz1)
137 return iDiff;
138 uc1 = pwsz1[-1];
139 if (uc1 < 0xd800 || uc1 >= 0xdc00)
140 return iDiff;
141 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
142 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
143 }
144 else
145 {
146 uc1 = *++pwsz1;
147 if (uc1 < 0xdc00 || uc1 >= 0xe000)
148 return iDiff;
149 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
150 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
151 }
152 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
153 if (iDiff)
154 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
155 }
156 if (iDiff)
157 return iDiff;
158 }
159 if (!wc1)
160 return 0;
161 pwsz1++;
162 pwsz2++;
163 }
164}
165
166
167RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
168{
169 PRTUTF16 pwc = pwsz;
170 for (;;)
171 {
172 RTUTF16 wc = *pwc;
173 if (!wc)
174 break;
175 if (wc < 0xd800 || wc >= 0xdc00)
176 {
177 RTUNICP ucFolded = RTUniCpToLower(wc);
178 if (ucFolded < 0x10000)
179 *pwc++ = RTUniCpToLower(wc);
180 }
181 else
182 {
183 /* surrogate */
184 RTUTF16 wc2 = pwc[1];
185 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
186 {
187 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
188 RTUNICP ucFolded = RTUniCpToLower(uc);
189 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
190 {
191 uc -= 0x10000;
192 *pwc++ = 0xd800 | (uc >> 10);
193 *pwc++ = 0xdc00 | (uc & 0x3ff);
194 }
195 }
196 else /* invalid encoding. */
197 pwc++;
198 }
199 }
200 return pwsz;
201}
202
203
204RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
205{
206 PRTUTF16 pwc = pwsz;
207 for (;;)
208 {
209 RTUTF16 wc = *pwc;
210 if (!wc)
211 break;
212 if (wc < 0xd800 || wc >= 0xdc00)
213 *pwc++ = RTUniCpToUpper(wc);
214 else
215 {
216 /* surrogate */
217 RTUTF16 wc2 = pwc[1];
218 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
219 {
220 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
221 RTUNICP ucFolded = RTUniCpToUpper(uc);
222 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
223 {
224 uc -= 0x10000;
225 *pwc++ = 0xd800 | (uc >> 10);
226 *pwc++ = 0xdc00 | (uc & 0x3ff);
227 }
228 }
229 else /* invalid encoding. */
230 pwc++;
231 }
232 }
233 return pwsz;
234}
235
236
237/**
238 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
239 *
240 * @returns iprt status code.
241 * @param pwsz The UTF-16 string.
242 * @param cwc The max length of the UTF-16 string to consider.
243 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
244 */
245static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
246{
247 int rc = VINF_SUCCESS;
248 size_t cch = 0;
249 while (cwc > 0)
250 {
251 RTUTF16 wc = *pwsz++; cwc--;
252 if (!wc)
253 break;
254 else if (wc < 0xd800 || wc > 0xdfff)
255 {
256 if (wc < 0x80)
257 cch++;
258 else if (wc < 0x800)
259 cch += 2;
260 else if (wc < 0xfffe)
261 cch += 3;
262 else
263 {
264 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
265 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
266 break;
267 }
268 }
269 else
270 {
271 if (wc >= 0xdc00)
272 {
273 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
274 rc = VERR_INVALID_UTF16_ENCODING;
275 break;
276 }
277 if (cwc <= 0)
278 {
279 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
280 rc = VERR_INVALID_UTF16_ENCODING;
281 break;
282 }
283 wc = *pwsz++; cwc--;
284 if (wc < 0xdc00 || wc > 0xdfff)
285 {
286 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
287 rc = VERR_INVALID_UTF16_ENCODING;
288 break;
289 }
290 cch += 4;
291 }
292 }
293
294
295 /* done */
296 *pcch = cch;
297 return rc;
298}
299
300
301/**
302 * Recodes an valid UTF-16 string as UTF-8.
303 *
304 * @returns iprt status code.
305 * @param pwsz The UTF-16 string.
306 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
307 * will stop when cwc or '\\0' is reached.
308 * @param psz Where to store the UTF-8 string.
309 * @param cch The size of the UTF-8 buffer, excluding the terminator.
310 * @param pcch Where to store the number of octets actually encoded.
311 */
312static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
313{
314 unsigned char *pwch = (unsigned char *)psz;
315 int rc = VINF_SUCCESS;
316 while (cwc > 0)
317 {
318 RTUTF16 wc = *pwsz++; cwc--;
319 if (!wc)
320 break;
321 else if (wc < 0xd800 || wc > 0xdfff)
322 {
323 if (wc < 0x80)
324 {
325 if (cch < 1)
326 {
327 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
328 rc = VERR_BUFFER_OVERFLOW;
329 break;
330 }
331 cch--;
332 *pwch++ = (unsigned char)wc;
333 }
334 else if (wc < 0x800)
335 {
336 if (cch < 2)
337 {
338 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
339 rc = VERR_BUFFER_OVERFLOW;
340 break;
341 }
342 cch -= 2;
343 *pwch++ = 0xc0 | (wc >> 6);
344 *pwch++ = 0x80 | (wc & 0x3f);
345 }
346 else if (wc < 0xfffe)
347 {
348 if (cch < 3)
349 {
350 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
351 rc = VERR_BUFFER_OVERFLOW;
352 break;
353 }
354 cch -= 3;
355 *pwch++ = 0xe0 | (wc >> 12);
356 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
357 *pwch++ = 0x80 | (wc & 0x3f);
358 }
359 else
360 {
361 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
362 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
363 break;
364 }
365 }
366 else
367 {
368 if (wc >= 0xdc00)
369 {
370 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
371 rc = VERR_INVALID_UTF16_ENCODING;
372 break;
373 }
374 if (cwc <= 0)
375 {
376 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
377 rc = VERR_INVALID_UTF16_ENCODING;
378 break;
379 }
380 RTUTF16 wc2 = *pwsz++; cwc--;
381 if (wc2 < 0xdc00 || wc2 > 0xdfff)
382 {
383 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
384 rc = VERR_INVALID_UTF16_ENCODING;
385 break;
386 }
387 uint32_t CodePoint = 0x10000
388 + ( ((wc & 0x3ff) << 10)
389 | (wc2 & 0x3ff));
390 if (cch < 4)
391 {
392 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
393 rc = VERR_BUFFER_OVERFLOW;
394 break;
395 }
396 cch -= 4;
397 *pwch++ = 0xf0 | (CodePoint >> 18);
398 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
399 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
400 *pwch++ = 0x80 | (CodePoint & 0x3f);
401 }
402 }
403
404 /* done */
405 *pwch = '\0';
406 *pcch = (char *)pwch - psz;
407 return rc;
408}
409
410
411
412RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
413{
414 /*
415 * Validate input.
416 */
417 Assert(VALID_PTR(ppszString));
418 Assert(VALID_PTR(pwszString));
419 *ppszString = NULL;
420
421 /*
422 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
423 */
424 size_t cch;
425 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
426 if (RT_SUCCESS(rc))
427 {
428 /*
429 * Allocate buffer and recode it.
430 */
431 char *pszResult = (char *)RTMemAlloc(cch + 1);
432 if (pszResult)
433 {
434 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
435 if (RT_SUCCESS(rc))
436 {
437 *ppszString = pszResult;
438 return rc;
439 }
440
441 RTMemFree(pszResult);
442 }
443 else
444 rc = VERR_NO_STR_MEMORY;
445 }
446 return rc;
447}
448
449
450RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
451{
452 /*
453 * Validate input.
454 */
455 Assert(VALID_PTR(pwszString));
456 Assert(VALID_PTR(ppsz));
457 Assert(!pcch || VALID_PTR(pcch));
458
459 /*
460 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
461 */
462 size_t cchResult;
463 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
464 if (RT_SUCCESS(rc))
465 {
466 if (pcch)
467 *pcch = cchResult;
468
469 /*
470 * Check buffer size / Allocate buffer and recode it.
471 */
472 bool fShouldFree;
473 char *pszResult;
474 if (cch > 0 && *ppsz)
475 {
476 fShouldFree = false;
477 if (cch <= cchResult)
478 return VERR_BUFFER_OVERFLOW;
479 pszResult = *ppsz;
480 }
481 else
482 {
483 *ppsz = NULL;
484 fShouldFree = true;
485 cch = RT_MAX(cch, cchResult + 1);
486 pszResult = (char *)RTMemAlloc(cch);
487 }
488 if (pszResult)
489 {
490 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
491 if (RT_SUCCESS(rc))
492 {
493 *ppsz = pszResult;
494 return rc;
495 }
496
497 if (fShouldFree)
498 RTMemFree(pszResult);
499 }
500 else
501 rc = VERR_NO_STR_MEMORY;
502 }
503 return rc;
504}
505
506
507RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
508{
509 const RTUTF16 wc = *pwsz;
510
511 /* simple */
512 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
513 return wc;
514 if (wc < 0xfffe)
515 {
516 /* surrogate pair */
517 if (wc < 0xdc00)
518 {
519 const RTUTF16 wc2 = pwsz[1];
520 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
521 {
522 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
523 return uc;
524 }
525
526 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
527 }
528 else
529 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
530 }
531 else
532 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
533 return RTUNICP_INVALID;
534}
535
536
537RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
538{
539 const RTUTF16 wc = **ppwsz;
540
541 /* simple */
542 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
543 {
544 (*ppwsz)++;
545 *pCp = wc;
546 return VINF_SUCCESS;
547 }
548
549 int rc;
550 if (wc < 0xfffe)
551 {
552 /* surrogate pair */
553 if (wc < 0xdc00)
554 {
555 const RTUTF16 wc2 = (*ppwsz)[1];
556 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
557 {
558 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
559 *pCp = uc;
560 (*ppwsz) += 2;
561 return VINF_SUCCESS;
562 }
563
564 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
565 }
566 else
567 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
568 rc = VERR_INVALID_UTF16_ENCODING;
569 }
570 else
571 {
572 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
573 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
574 }
575 *pCp = RTUNICP_INVALID;
576 (*ppwsz)++;
577 return rc;
578}
579
580
581RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
582{
583 /* simple */
584 if ( CodePoint < 0xd800
585 || ( CodePoint > 0xdfff
586 && CodePoint < 0xfffe))
587 {
588 *pwsz++ = (RTUTF16)CodePoint;
589 return pwsz;
590 }
591
592 /* surrogate pair */
593 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
594 {
595 CodePoint -= 0x10000;
596 *pwsz++ = 0xd800 | (CodePoint >> 10);
597 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
598 return pwsz;
599 }
600
601 /* invalid code point. */
602 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
603 *pwsz++ = 0x7f;
604 return pwsz;
605}
606
607
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette