VirtualBox

source: vbox/trunk/src/VBox/Runtime/utf-16.cpp@ 4071

Last change on this file since 4071 was 4071, checked in by vboxsync, 17 years ago

Biggest check-in ever. New source code headers for all (C) innotek files.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 16.4 KB
Line 
1/* $Id: utf-16.cpp 4071 2007-08-07 17:07:59Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-16
4 */
5
6/*
7 * Copyright (C) 2006-2007 innotek GmbH
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License as published by the Free Software Foundation,
13 * in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
14 * distribution. VirtualBox OSE is distributed in the hope that it will
15 * be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*******************************************************************************
20* Header Files *
21*******************************************************************************/
22#include <iprt/string.h>
23#include <iprt/uni.h>
24#include <iprt/alloc.h>
25#include <iprt/assert.h>
26#include <iprt/err.h>
27#include "internal/string.h"
28
29
30
31RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
32{
33 if (pwszString)
34 RTMemTmpFree(pwszString);
35}
36
37
38RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
39{
40 Assert(pwszString);
41 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
42 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
43 if (pwsz)
44 memcpy(pwsz, pwszString, cb);
45 return pwsz;
46}
47
48
49RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
50{
51 Assert(pwszString);
52 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
53 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
54 if (pwsz)
55 {
56 memcpy(pwsz, pwszString, cb);
57 *ppwszString = pwsz;
58 return VINF_SUCCESS;
59 }
60 return VERR_NO_MEMORY;
61}
62
63
64RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
65{
66 if (!pwszString)
67 return 0;
68
69 PCRTUTF16 pwsz = pwszString;
70 while (*pwsz)
71 pwsz++;
72 return pwsz - pwszString;
73}
74
75
76RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
77{
78 if (pwsz1 == pwsz2)
79 return 0;
80 if (!pwsz1)
81 return -1;
82 if (!pwsz2)
83 return 1;
84
85 for (;;)
86 {
87 register RTUTF16 wcs = *pwsz1;
88 register int iDiff = wcs - *pwsz2;
89 if (iDiff || !wcs)
90 return iDiff;
91 pwsz1++;
92 pwsz2++;
93 }
94}
95
96
97RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
98{
99 if (pwsz1 == pwsz2)
100 return 0;
101 if (!pwsz1)
102 return -1;
103 if (!pwsz2)
104 return 1;
105
106 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
107 for (;;)
108 {
109 register RTUTF16 wc1 = *pwsz1;
110 register RTUTF16 wc2 = *pwsz2;
111 register int iDiff = wc1 - wc2;
112 if (iDiff)
113 {
114 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
115 if ( wc1 < 0xd800
116 || wc2 < 0xd800
117 || wc1 > 0xdfff
118 || wc2 > 0xdfff)
119 {
120 /* simple UCS-2 char */
121 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
122 if (iDiff)
123 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
124 }
125 else
126 {
127 /* a damned pair */
128 RTUNICP uc1;
129 RTUNICP uc2;
130 if (wc1 >= 0xdc00)
131 {
132 if (pwsz1Start == pwsz1)
133 return iDiff;
134 uc1 = pwsz1[-1];
135 if (uc1 < 0xd800 || uc1 >= 0xdc00)
136 return iDiff;
137 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
138 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
139 }
140 else
141 {
142 uc1 = *++pwsz1;
143 if (uc1 < 0xdc00 || uc1 >= 0xe000)
144 return iDiff;
145 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
146 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
147 }
148 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
149 if (iDiff)
150 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
151 }
152 if (iDiff)
153 return iDiff;
154 }
155 if (!wc1)
156 return 0;
157 pwsz1++;
158 pwsz2++;
159 }
160}
161
162
163RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
164{
165 PRTUTF16 pwc = pwsz;
166 for (;;)
167 {
168 RTUTF16 wc = *pwc;
169 if (!wc)
170 break;
171 if (wc < 0xd800 || wc >= 0xdc00)
172 {
173 RTUNICP ucFolded = RTUniCpToLower(wc);
174 if (ucFolded < 0x10000)
175 *pwc++ = RTUniCpToLower(wc);
176 }
177 else
178 {
179 /* surrogate */
180 RTUTF16 wc2 = pwc[1];
181 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
182 {
183 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
184 RTUNICP ucFolded = RTUniCpToLower(uc);
185 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
186 {
187 uc -= 0x10000;
188 *pwc++ = 0xd800 | (uc >> 10);
189 *pwc++ = 0xdc00 | (uc & 0x3ff);
190 }
191 }
192 else /* invalid encoding. */
193 pwc++;
194 }
195 }
196 return pwsz;
197}
198
199
200RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
201{
202 PRTUTF16 pwc = pwsz;
203 for (;;)
204 {
205 RTUTF16 wc = *pwc;
206 if (!wc)
207 break;
208 if (wc < 0xd800 || wc >= 0xdc00)
209 *pwc++ = RTUniCpToUpper(wc);
210 else
211 {
212 /* surrogate */
213 RTUTF16 wc2 = pwc[1];
214 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
215 {
216 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
217 RTUNICP ucFolded = RTUniCpToUpper(uc);
218 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
219 {
220 uc -= 0x10000;
221 *pwc++ = 0xd800 | (uc >> 10);
222 *pwc++ = 0xdc00 | (uc & 0x3ff);
223 }
224 }
225 else /* invalid encoding. */
226 pwc++;
227 }
228 }
229 return pwsz;
230}
231
232
233/**
234 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
235 *
236 * @returns iprt status code.
237 * @param pwsz The UTF-16 string.
238 * @param cwc The max length of the UTF-16 string to consider.
239 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
240 */
241static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
242{
243 int rc = VINF_SUCCESS;
244 size_t cch = 0;
245 while (cwc > 0)
246 {
247 RTUTF16 wc = *pwsz++; cwc--;
248 if (!wc)
249 break;
250 else if (wc < 0xd800 || wc > 0xdfff)
251 {
252 if (wc < 0x80)
253 cch++;
254 else if (wc < 0x800)
255 cch += 2;
256 else if (wc < 0xfffe)
257 cch += 3;
258 else
259 {
260 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
261 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
262 break;
263 }
264 }
265 else
266 {
267 if (wc >= 0xdc00)
268 {
269 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
270 rc = VERR_INVALID_UTF16_ENCODING;
271 break;
272 }
273 if (cwc <= 0)
274 {
275 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
276 rc = VERR_INVALID_UTF16_ENCODING;
277 break;
278 }
279 wc = *pwsz++; cwc--;
280 if (wc < 0xdc00 || wc > 0xdfff)
281 {
282 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
283 rc = VERR_INVALID_UTF16_ENCODING;
284 break;
285 }
286 cch += 4;
287 }
288 }
289
290
291 /* done */
292 *pcch = cch;
293 return rc;
294}
295
296
297/**
298 * Recodes an valid UTF-16 string as UTF-8.
299 *
300 * @returns iprt status code.
301 * @param pwsz The UTF-16 string.
302 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
303 * will stop when cwc or '\\0' is reached.
304 * @param psz Where to store the UTF-8 string.
305 * @param cch The size of the UTF-8 buffer, excluding the terminator.
306 * @param pcch Where to store the number of octets actually encoded.
307 */
308static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
309{
310 unsigned char *pwch = (unsigned char *)psz;
311 int rc = VINF_SUCCESS;
312 while (cwc > 0)
313 {
314 RTUTF16 wc = *pwsz++; cwc--;
315 if (!wc)
316 break;
317 else if (wc < 0xd800 || wc > 0xdfff)
318 {
319 if (wc < 0x80)
320 {
321 if (cch < 1)
322 {
323 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
324 rc = VERR_BUFFER_OVERFLOW;
325 break;
326 }
327 cch--;
328 *pwch++ = (unsigned char)wc;
329 }
330 else if (wc < 0x800)
331 {
332 if (cch < 2)
333 {
334 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
335 rc = VERR_BUFFER_OVERFLOW;
336 break;
337 }
338 cch -= 2;
339 *pwch++ = 0xc0 | (wc >> 6);
340 *pwch++ = 0x80 | (wc & 0x3f);
341 }
342 else if (wc < 0xfffe)
343 {
344 if (cch < 3)
345 {
346 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
347 rc = VERR_BUFFER_OVERFLOW;
348 break;
349 }
350 cch -= 3;
351 *pwch++ = 0xe0 | (wc >> 12);
352 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
353 *pwch++ = 0x80 | (wc & 0x3f);
354 }
355 else
356 {
357 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
358 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
359 break;
360 }
361 }
362 else
363 {
364 if (wc >= 0xdc00)
365 {
366 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
367 rc = VERR_INVALID_UTF16_ENCODING;
368 break;
369 }
370 if (cwc <= 0)
371 {
372 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
373 rc = VERR_INVALID_UTF16_ENCODING;
374 break;
375 }
376 RTUTF16 wc2 = *pwsz++; cwc--;
377 if (wc2 < 0xdc00 || wc2 > 0xdfff)
378 {
379 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
380 rc = VERR_INVALID_UTF16_ENCODING;
381 break;
382 }
383 uint32_t CodePoint = 0x10000
384 + ( ((wc & 0x3ff) << 10)
385 | (wc2 & 0x3ff));
386 if (cch < 4)
387 {
388 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
389 rc = VERR_BUFFER_OVERFLOW;
390 break;
391 }
392 cch -= 4;
393 *pwch++ = 0xf0 | (CodePoint >> 18);
394 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
395 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
396 *pwch++ = 0x80 | (CodePoint & 0x3f);
397 }
398 }
399
400 /* done */
401 *pwch = '\0';
402 *pcch = (char *)pwch - psz;
403 return rc;
404}
405
406
407
408RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
409{
410 /*
411 * Validate input.
412 */
413 Assert(VALID_PTR(ppszString));
414 Assert(VALID_PTR(pwszString));
415 *ppszString = NULL;
416
417 /*
418 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
419 */
420 size_t cch;
421 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
422 if (RT_SUCCESS(rc))
423 {
424 /*
425 * Allocate buffer and recode it.
426 */
427 char *pszResult = (char *)RTMemAlloc(cch + 1);
428 if (pszResult)
429 {
430 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
431 if (RT_SUCCESS(rc))
432 {
433 *ppszString = pszResult;
434 return rc;
435 }
436
437 RTMemFree(pszResult);
438 }
439 else
440 rc = VERR_NO_STR_MEMORY;
441 }
442 return rc;
443}
444
445
446RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
447{
448 /*
449 * Validate input.
450 */
451 Assert(VALID_PTR(pwszString));
452 Assert(VALID_PTR(ppsz));
453 Assert(!pcch || VALID_PTR(pcch));
454
455 /*
456 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
457 */
458 size_t cchResult;
459 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
460 if (RT_SUCCESS(rc))
461 {
462 if (pcch)
463 *pcch = cchResult;
464
465 /*
466 * Check buffer size / Allocate buffer and recode it.
467 */
468 bool fShouldFree;
469 char *pszResult;
470 if (cch > 0 && *ppsz)
471 {
472 fShouldFree = false;
473 if (cch <= cchResult)
474 return VERR_BUFFER_OVERFLOW;
475 pszResult = *ppsz;
476 }
477 else
478 {
479 *ppsz = NULL;
480 fShouldFree = true;
481 cch = RT_MAX(cch, cchResult + 1);
482 pszResult = (char *)RTMemAlloc(cch);
483 }
484 if (pszResult)
485 {
486 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
487 if (RT_SUCCESS(rc))
488 {
489 *ppsz = pszResult;
490 return rc;
491 }
492
493 if (fShouldFree)
494 RTMemFree(pszResult);
495 }
496 else
497 rc = VERR_NO_STR_MEMORY;
498 }
499 return rc;
500}
501
502
503RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
504{
505 const RTUTF16 wc = *pwsz;
506
507 /* simple */
508 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
509 return wc;
510 if (wc < 0xfffe)
511 {
512 /* surrogate pair */
513 if (wc < 0xdc00)
514 {
515 const RTUTF16 wc2 = pwsz[1];
516 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
517 {
518 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
519 return uc;
520 }
521
522 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
523 }
524 else
525 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
526 }
527 else
528 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
529 return RTUNICP_INVALID;
530}
531
532
533RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
534{
535 const RTUTF16 wc = **ppwsz;
536
537 /* simple */
538 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
539 {
540 (*ppwsz)++;
541 *pCp = wc;
542 return VINF_SUCCESS;
543 }
544
545 int rc;
546 if (wc < 0xfffe)
547 {
548 /* surrogate pair */
549 if (wc < 0xdc00)
550 {
551 const RTUTF16 wc2 = (*ppwsz)[1];
552 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
553 {
554 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
555 *pCp = uc;
556 (*ppwsz) += 2;
557 return VINF_SUCCESS;
558 }
559
560 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
561 }
562 else
563 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
564 rc = VERR_INVALID_UTF16_ENCODING;
565 }
566 else
567 {
568 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
569 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
570 }
571 *pCp = RTUNICP_INVALID;
572 (*ppwsz)++;
573 return rc;
574}
575
576
577RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
578{
579 /* simple */
580 if ( CodePoint < 0xd800
581 || ( CodePoint > 0xdfff
582 && CodePoint < 0xfffe))
583 {
584 *pwsz++ = (RTUTF16)CodePoint;
585 return pwsz;
586 }
587
588 /* surrogate pair */
589 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
590 {
591 CodePoint -= 0x10000;
592 *pwsz++ = 0xd800 | (CodePoint >> 10);
593 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
594 return pwsz;
595 }
596
597 /* invalid code point. */
598 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
599 *pwsz++ = 0x7f;
600 return pwsz;
601}
602
603
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette