VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 8155

Last change on this file since 8155 was 8155, checked in by vboxsync, 17 years ago

The Big Sun Rebranding Header Change

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 38.5 KB
Line 
1/* $Id: utf-8.cpp 8155 2008-04-18 15:16:47Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include <iprt/uni.h>
37#include <iprt/alloc.h>
38#include <iprt/assert.h>
39#include <iprt/err.h>
40#include "internal/string.h"
41
42
43
44/**
45 * Get get length in code points of a UTF-8 encoded string.
46 * The string is validated while doing this.
47 *
48 * @returns IPRT status code.
49 * @param psz Pointer to the UTF-8 string.
50 * @param cch The max length of the string. (btw cch = cb)
51 * Use RTSTR_MAX if all of the string is to be examined.s
52 * @param pcuc Where to store the length in unicode code points.
53 */
54static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66 unsigned cb;
67 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
68 cb = 2;
69 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
70 cb = 3;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
72 cb = 4;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
74 cb = 5;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
76 cb = 6;
77 else
78 {
79 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80 return VERR_INVALID_UTF8_ENCODING;
81 }
82
83 /* check length */
84 if (cb > cch)
85 {
86 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87 return VERR_INVALID_UTF8_ENCODING;
88 }
89
90 /* validate the rest */
91 switch (cb)
92 {
93 case 6:
94 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95 case 5:
96 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 4:
98 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 3:
100 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 2:
102 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 break;
104 }
105
106 /* validate the code point. */
107 RTUNICP uc;
108 switch (cb)
109 {
110 case 6:
111 uc = (puch[5] & 0x3f)
112 | ((RTUNICP)(puch[4] & 0x3f) << 6)
113 | ((RTUNICP)(puch[3] & 0x3f) << 12)
114 | ((RTUNICP)(puch[2] & 0x3f) << 18)
115 | ((RTUNICP)(puch[1] & 0x3f) << 24)
116 | ((RTUNICP)(uch & 0x01) << 30);
117 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119 break;
120 case 5:
121 uc = (puch[4] & 0x3f)
122 | ((RTUNICP)(puch[3] & 0x3f) << 6)
123 | ((RTUNICP)(puch[2] & 0x3f) << 12)
124 | ((RTUNICP)(puch[1] & 0x3f) << 18)
125 | ((RTUNICP)(uch & 0x03) << 24);
126 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128 break;
129 case 4:
130 uc = (puch[3] & 0x3f)
131 | ((RTUNICP)(puch[2] & 0x3f) << 6)
132 | ((RTUNICP)(puch[1] & 0x3f) << 12)
133 | ((RTUNICP)(uch & 0x07) << 18);
134 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136 break;
137 case 3:
138 uc = (puch[2] & 0x3f)
139 | ((RTUNICP)(puch[1] & 0x3f) << 6)
140 | ((RTUNICP)(uch & 0x0f) << 12);
141 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
145 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146 break;
147 case 2:
148 uc = (puch[1] & 0x3f)
149 | ((RTUNICP)(uch & 0x1f) << 6);
150 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152 break;
153 }
154
155 /* advance */
156 cch -= cb;
157 puch += cb;
158 }
159 else
160 {
161 /* one ASCII byte */
162 puch++;
163 cch--;
164 }
165 cCodePoints++;
166 }
167
168 /* done */
169 *pcuc = cCodePoints;
170 return VINF_SUCCESS;
171}
172
173
174/**
175 * Decodes and UTF-8 string into an array of unicode code point.
176 *
177 * Since we know the input is valid, we do *not* perform encoding or length checks.
178 *
179 * @returns iprt status code.
180 * @param psz The UTF-8 string to recode. This is a valid encoding.
181 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
182 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
183 * @param paCps Where to store the code points array.
184 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
185 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
186 */
187static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
188{
189 int rc = VINF_SUCCESS;
190 const unsigned char *puch = (const unsigned char *)psz;
191 const PRTUNICP pCpEnd = paCps + cCps;
192 PRTUNICP pCp = paCps;
193 Assert(pCpEnd >= pCp);
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (pCp >= pCpEnd)
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207
208 /* decode and recode the code point */
209 if (!(uch & RT_BIT(7)))
210 {
211 *pCp++ = uch;
212 puch++;
213 cch--;
214 }
215#ifdef RT_STRICT
216 else if (!(uch & RT_BIT(6)))
217 AssertMsgFailed(("Internal error!\n"));
218#endif
219 else if (!(uch & RT_BIT(5)))
220 {
221 *pCp++ = (puch[1] & 0x3f)
222 | ((uint16_t)(uch & 0x1f) << 6);
223 puch += 2;
224 cch -= 2;
225 }
226 else if (!(uch & RT_BIT(4)))
227 {
228 *pCp++ = (puch[2] & 0x3f)
229 | ((uint16_t)(puch[1] & 0x3f) << 6)
230 | ((uint16_t)(uch & 0x0f) << 12);
231 puch += 3;
232 cch -= 3;
233 }
234 else if (!(uch & RT_BIT(3)))
235 {
236 *pCp++ = (puch[3] & 0x3f)
237 | ((RTUNICP)(puch[2] & 0x3f) << 6)
238 | ((RTUNICP)(puch[1] & 0x3f) << 12)
239 | ((RTUNICP)(uch & 0x07) << 18);
240 puch += 4;
241 cch -= 4;
242 }
243 else if (!(uch & RT_BIT(2)))
244 {
245 *pCp++ = (puch[4] & 0x3f)
246 | ((RTUNICP)(puch[3] & 0x3f) << 6)
247 | ((RTUNICP)(puch[2] & 0x3f) << 12)
248 | ((RTUNICP)(puch[1] & 0x3f) << 18)
249 | ((RTUNICP)(uch & 0x03) << 24);
250 puch += 5;
251 cch -= 6;
252 }
253 else
254 {
255 Assert(!(uch & RT_BIT(1)));
256 *pCp++ = (puch[5] & 0x3f)
257 | ((RTUNICP)(puch[4] & 0x3f) << 6)
258 | ((RTUNICP)(puch[3] & 0x3f) << 12)
259 | ((RTUNICP)(puch[2] & 0x3f) << 18)
260 | ((RTUNICP)(puch[1] & 0x3f) << 24)
261 | ((RTUNICP)(uch & 0x01) << 30);
262 puch += 6;
263 cch -= 6;
264 }
265 }
266
267 /* done */
268 *pCp = 0;
269 *pcCps = pCp - paCps;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280
281
282RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
283{
284 size_t cCodePoints;
285 int rc = rtUtf8Length(psz, cch, &cCodePoints);
286 if (pcCps)
287 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288 return rc;
289}
290
291
292RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
293{
294 /*
295 * Validate input.
296 */
297 Assert(VALID_PTR(pszString));
298 Assert(VALID_PTR(ppaCps));
299 *ppaCps = NULL;
300
301 /*
302 * Validate the UTF-8 input and count its code points.
303 */
304 size_t cCps;
305 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
306 if (RT_SUCCESS(rc))
307 {
308 /*
309 * Allocate buffer.
310 */
311 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
312 if (paCps)
313 {
314 /*
315 * Decode the string.
316 */
317 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
318 if (RT_SUCCESS(rc))
319 {
320 *ppaCps = paCps;
321 return rc;
322 }
323 RTMemFree(paCps);
324 }
325 else
326 rc = VERR_NO_CODE_POINT_MEMORY;
327 }
328 return rc;
329}
330
331
332RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
333{
334 /*
335 * Validate input.
336 */
337 Assert(VALID_PTR(pszString));
338 Assert(VALID_PTR(ppaCps));
339 Assert(!pcCps || VALID_PTR(pcCps));
340
341 /*
342 * Validate the UTF-8 input and count the code points.
343 */
344 size_t cCpsResult;
345 int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
346 if (RT_SUCCESS(rc))
347 {
348 if (pcCps)
349 *pcCps = cCpsResult;
350
351 /*
352 * Check buffer size / Allocate buffer.
353 */
354 bool fShouldFree;
355 PRTUNICP paCpsResult;
356 if (cCps > 0 && *ppaCps)
357 {
358 fShouldFree = false;
359 if (cCps <= cCpsResult)
360 return VERR_BUFFER_OVERFLOW;
361 paCpsResult = *ppaCps;
362 }
363 else
364 {
365 *ppaCps = NULL;
366 fShouldFree = true;
367 cCps = RT_MAX(cCpsResult + 1, cCps);
368 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
369 }
370 if (paCpsResult)
371 {
372 /*
373 * Encode the UTF-16 string.
374 */
375 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
376 if (RT_SUCCESS(rc))
377 {
378 *ppaCps = paCpsResult;
379 return rc;
380 }
381 if (fShouldFree)
382 RTMemFree(paCpsResult);
383 }
384 else
385 rc = VERR_NO_CODE_POINT_MEMORY;
386 }
387 return rc;
388}
389
390
391/**
392 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
393 *
394 * @returns IPRT status code.
395 * @param psz Pointer to the UTF-8 string.
396 * @param cch The max length of the string. (btw cch = cb)
397 * Use RTSTR_MAX if all of the string is to be examined.s
398 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
399 */
400static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
401{
402 const unsigned char *puch = (const unsigned char *)psz;
403 size_t cwc = 0;
404 while (cch > 0)
405 {
406 const unsigned char uch = *puch;
407 if (!uch)
408 break;
409 if (!(uch & RT_BIT(7)))
410 {
411 /* one ASCII byte */
412 cwc++;
413 puch++;
414 cch--;
415 }
416 else
417 {
418 /* figure sequence length and validate the first byte */
419 unsigned cb;
420 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
421 cb = 2;
422 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
423 cb = 3;
424 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
425 cb = 4;
426 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
427 cb = 5;
428 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
429 cb = 6;
430 else
431 {
432 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
433 return VERR_INVALID_UTF8_ENCODING;
434 }
435
436 /* check length */
437 if (cb > cch)
438 {
439 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
440 return VERR_INVALID_UTF8_ENCODING;
441 }
442
443 /* validate the rest */
444 switch (cb)
445 {
446 case 6:
447 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
448 case 5:
449 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
450 case 4:
451 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
452 case 3:
453 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
454 case 2:
455 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
456 break;
457 }
458
459 /* validate the code point. */
460 RTUNICP uc;
461 switch (cb)
462 {
463 case 6:
464 uc = (puch[5] & 0x3f)
465 | ((RTUNICP)(puch[4] & 0x3f) << 6)
466 | ((RTUNICP)(puch[3] & 0x3f) << 12)
467 | ((RTUNICP)(puch[2] & 0x3f) << 18)
468 | ((RTUNICP)(puch[1] & 0x3f) << 24)
469 | ((RTUNICP)(uch & 0x01) << 30);
470 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
471 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
472 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
473 return VERR_CANT_RECODE_AS_UTF16;
474 case 5:
475 uc = (puch[4] & 0x3f)
476 | ((RTUNICP)(puch[3] & 0x3f) << 6)
477 | ((RTUNICP)(puch[2] & 0x3f) << 12)
478 | ((RTUNICP)(puch[1] & 0x3f) << 18)
479 | ((RTUNICP)(uch & 0x03) << 24);
480 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
481 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
482 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
483 return VERR_CANT_RECODE_AS_UTF16;
484 case 4:
485 uc = (puch[3] & 0x3f)
486 | ((RTUNICP)(puch[2] & 0x3f) << 6)
487 | ((RTUNICP)(puch[1] & 0x3f) << 12)
488 | ((RTUNICP)(uch & 0x07) << 18);
489 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
490 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
491 RTStrAssertMsgReturn(uc <= 0x0010ffff,
492 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
493 cwc++;
494 break;
495 case 3:
496 uc = (puch[2] & 0x3f)
497 | ((RTUNICP)(puch[1] & 0x3f) << 6)
498 | ((RTUNICP)(uch & 0x0f) << 12);
499 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
500 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
501 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
502 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
503 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
504 break;
505 case 2:
506 uc = (puch[1] & 0x3f)
507 | ((RTUNICP)(uch & 0x1f) << 6);
508 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
509 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
510 break;
511 }
512
513 /* advance */
514 cch -= cb;
515 puch += cb;
516 cwc++;
517 }
518 }
519
520 /* done */
521 *pcwc = cwc;
522 return VINF_SUCCESS;
523}
524
525
526/**
527 * Recodes a valid UTF-8 string as UTF-16.
528 *
529 * Since we know the input is valid, we do *not* perform encoding or length checks.
530 *
531 * @returns iprt status code.
532 * @param psz The UTF-8 string to recode. This is a valid encoding.
533 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
534 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
535 * @param pwsz Where to store the UTF-16 string.
536 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
537 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
538 */
539static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
540{
541 int rc = VINF_SUCCESS;
542 const unsigned char *puch = (const unsigned char *)psz;
543 const PRTUTF16 pwszEnd = pwsz + cwc;
544 PRTUTF16 pwc = pwsz;
545 Assert(pwszEnd >= pwc);
546 while (cch > 0)
547 {
548 /* read the next char and check for terminator. */
549 const unsigned char uch = *puch;
550 if (!uch)
551 break;
552
553 /* check for output overflow */
554 if (pwc >= pwszEnd)
555 {
556 rc = VERR_BUFFER_OVERFLOW;
557 break;
558 }
559
560 /* decode and recode the code point */
561 if (!(uch & RT_BIT(7)))
562 {
563 *pwc++ = uch;
564 puch++;
565 cch--;
566 }
567 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
568 {
569 uint16_t uc = (puch[1] & 0x3f)
570 | ((uint16_t)(uch & 0x1f) << 6);
571 *pwc++ = uc;
572 puch += 2;
573 cch -= 2;
574 }
575 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
576 {
577 uint16_t uc = (puch[2] & 0x3f)
578 | ((uint16_t)(puch[1] & 0x3f) << 6)
579 | ((uint16_t)(uch & 0x0f) << 12);
580 *pwc++ = uc;
581 puch += 3;
582 cch -= 3;
583 }
584 else
585 {
586 /* generate surrugate pair */
587 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
588 RTUNICP uc = (puch[3] & 0x3f)
589 | ((RTUNICP)(puch[2] & 0x3f) << 6)
590 | ((RTUNICP)(puch[1] & 0x3f) << 12)
591 | ((RTUNICP)(uch & 0x07) << 18);
592 if (pwc + 1 >= pwszEnd)
593 {
594 rc = VERR_BUFFER_OVERFLOW;
595 break;
596 }
597 uc -= 0x10000;
598 *pwc++ = 0xd800 | (uc >> 10);
599 *pwc++ = 0xdc00 | (uc & 0x3ff);
600 puch += 4;
601 cch -= 4;
602 }
603 }
604
605 /* done */
606 *pwc = '\0';
607 *pcwc = pwc - pwsz;
608 return rc;
609}
610
611
612RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
613{
614 /*
615 * Validate input.
616 */
617 Assert(VALID_PTR(ppwszString));
618 Assert(VALID_PTR(pszString));
619 *ppwszString = NULL;
620
621 /*
622 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
623 */
624 size_t cwc;
625 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
626 if (RT_SUCCESS(rc))
627 {
628 /*
629 * Allocate buffer.
630 */
631 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
632 if (pwsz)
633 {
634 /*
635 * Encode the UTF-16 string.
636 */
637 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
638 if (RT_SUCCESS(rc))
639 {
640 *ppwszString = pwsz;
641 return rc;
642 }
643 RTMemFree(pwsz);
644 }
645 else
646 rc = VERR_NO_UTF16_MEMORY;
647 }
648 return rc;
649}
650
651
652RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
653{
654 /*
655 * Validate input.
656 */
657 Assert(VALID_PTR(pszString));
658 Assert(VALID_PTR(ppwsz));
659 Assert(!pcwc || VALID_PTR(pcwc));
660
661 /*
662 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
663 */
664 size_t cwcResult;
665 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
666 if (RT_SUCCESS(rc))
667 {
668 if (pcwc)
669 *pcwc = cwcResult;
670
671 /*
672 * Check buffer size / Allocate buffer.
673 */
674 bool fShouldFree;
675 PRTUTF16 pwszResult;
676 if (cwc > 0 && *ppwsz)
677 {
678 fShouldFree = false;
679 if (cwc <= cwcResult)
680 return VERR_BUFFER_OVERFLOW;
681 pwszResult = *ppwsz;
682 }
683 else
684 {
685 *ppwsz = NULL;
686 fShouldFree = true;
687 cwc = RT_MAX(cwcResult + 1, cwc);
688 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
689 }
690 if (pwszResult)
691 {
692 /*
693 * Encode the UTF-16 string.
694 */
695 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
696 if (RT_SUCCESS(rc))
697 {
698 *ppwsz = pwszResult;
699 return rc;
700 }
701 if (fShouldFree)
702 RTMemFree(pwszResult);
703 }
704 else
705 rc = VERR_NO_UTF16_MEMORY;
706 }
707 return rc;
708}
709
710
711RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
712{
713 size_t cwc;
714 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
715 return RT_SUCCESS(rc) ? cwc : 0;
716}
717
718
719RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
720{
721 size_t cwc;
722 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
723 if (pcwc)
724 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
725 return rc;
726}
727
728
729/**
730 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
731 * @returns rc
732 * @param ppsz The pointer to the the string position point.
733 * @param pCp Where to store RTUNICP_INVALID.
734 * @param rc The iprt error code.
735 */
736static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
737{
738 /*
739 * Try find a valid encoding.
740 */
741 (*ppsz)++; /** @todo code this! */
742 *pCp = RTUNICP_INVALID;
743 return rc;
744}
745
746
747RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
748{
749 RTUNICP Cp;
750 RTStrGetCpExInternal(&psz, &Cp);
751 return Cp;
752}
753
754
755RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
756{
757 const unsigned char *puch = (const unsigned char *)*ppsz;
758 const unsigned char uch = *puch;
759 RTUNICP uc;
760
761 /* ASCII ? */
762 if (!(uch & RT_BIT(7)))
763 {
764 uc = uch;
765 puch++;
766 }
767 else if (uch & RT_BIT(6))
768 {
769 /* figure the length and validate the first octet. */
770 unsigned cb;
771 if (!(uch & RT_BIT(5)))
772 cb = 2;
773 else if (!(uch & RT_BIT(4)))
774 cb = 3;
775 else if (!(uch & RT_BIT(3)))
776 cb = 4;
777 else if (!(uch & RT_BIT(2)))
778 cb = 5;
779 else if (!(uch & RT_BIT(1)))
780 cb = 6;
781 else
782 {
783 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
784 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
785 }
786
787 /* validate the rest */
788 switch (cb)
789 {
790 case 6:
791 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
792 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
793 case 5:
794 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
795 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
796 case 4:
797 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
798 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
799 case 3:
800 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
801 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
802 case 2:
803 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
804 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
805 break;
806 }
807
808 /* get and validate the code point. */
809 switch (cb)
810 {
811 case 6:
812 uc = (puch[5] & 0x3f)
813 | ((RTUNICP)(puch[4] & 0x3f) << 6)
814 | ((RTUNICP)(puch[3] & 0x3f) << 12)
815 | ((RTUNICP)(puch[2] & 0x3f) << 18)
816 | ((RTUNICP)(puch[1] & 0x3f) << 24)
817 | ((RTUNICP)(uch & 0x01) << 30);
818 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
819 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
820 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
821 break;
822 case 5:
823 uc = (puch[4] & 0x3f)
824 | ((RTUNICP)(puch[3] & 0x3f) << 6)
825 | ((RTUNICP)(puch[2] & 0x3f) << 12)
826 | ((RTUNICP)(puch[1] & 0x3f) << 18)
827 | ((RTUNICP)(uch & 0x03) << 24);
828 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
829 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
830 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
831 break;
832 case 4:
833 uc = (puch[3] & 0x3f)
834 | ((RTUNICP)(puch[2] & 0x3f) << 6)
835 | ((RTUNICP)(puch[1] & 0x3f) << 12)
836 | ((RTUNICP)(uch & 0x07) << 18);
837 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
838 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
839 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
840 break;
841 case 3:
842 uc = (puch[2] & 0x3f)
843 | ((RTUNICP)(puch[1] & 0x3f) << 6)
844 | ((RTUNICP)(uch & 0x0f) << 12);
845 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
846 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
847 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
848 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
849 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
850 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
851 break;
852 case 2:
853 uc = (puch[1] & 0x3f)
854 | ((RTUNICP)(uch & 0x1f) << 6);
855 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
856 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
857 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
858 break;
859 default: /* impossible, but GCC is bitching. */
860 uc = RTUNICP_INVALID;
861 break;
862 }
863 puch += cb;
864 }
865 else
866 {
867 /* 6th bit is always set. */
868 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
869 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
870 }
871 *pCp = uc;
872 *ppsz = (const char *)puch;
873 return VINF_SUCCESS;
874}
875
876
877RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
878{
879 unsigned char *puch = (unsigned char *)psz;
880 if (uc < 0x80)
881 *puch++ = (unsigned char )uc;
882 else if (uc < 0x00000800)
883 {
884 *puch++ = 0xc0 | (uc >> 6);
885 *puch++ = 0x80 | (uc & 0x3f);
886 }
887 else if (uc < 0x00010000)
888 {
889 if ( uc < 0x0000d8000
890 || ( uc > 0x0000dfff
891 && uc < 0x0000fffe))
892 {
893 *puch++ = 0xe0 | (uc >> 12);
894 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
895 *puch++ = 0x80 | (uc & 0x3f);
896 }
897 else
898 {
899 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
900 *puch++ = 0x7f;
901 }
902 }
903 else if (uc < 0x00200000)
904 {
905 *puch++ = 0xf0 | (uc >> 18);
906 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
907 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
908 *puch++ = 0x80 | (uc & 0x3f);
909 }
910 else if (uc < 0x04000000)
911 {
912 *puch++ = 0xf1 | (uc >> 24);
913 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
914 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
915 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
916 *puch++ = 0x80 | (uc & 0x3f);
917 }
918 else if (uc <= 0x7fffffff)
919 {
920 *puch++ = 0xf3 | (uc >> 30);
921 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
922 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
923 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
924 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
925 *puch++ = 0x80 | (uc & 0x3f);
926 }
927 else
928 {
929 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
930 *puch++ = 0x7f;
931 }
932
933 return (char *)puch;
934}
935
936
937RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
938{
939 if (pszStart < psz)
940 {
941 /* simple char? */
942 const unsigned char *puch = (const unsigned char *)psz;
943 unsigned uch = *--puch;
944 if (!(uch & RT_BIT(7)))
945 return (char *)puch;
946 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
947
948 /* two or more. */
949 uint32_t uMask = 0xffffffc0;
950 while ( (const unsigned char *)pszStart < puch
951 && !(uMask & 1))
952 {
953 unsigned uch = *--puch;
954 if ((uch & 0xc0) != 0x80)
955 {
956 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
957 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
958 (char *)pszStart);
959 return (char *)puch;
960 }
961 uMask >>= 1;
962 }
963 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
964 }
965 return (char *)pszStart;
966}
967
968
969/**
970 * Performs a case sensitive string compare between two UTF-8 strings.
971 *
972 * Encoding errors are ignored by the current implementation. So, the only
973 * difference between this and the CRT strcmp function is the handling of
974 * NULL arguments.
975 *
976 * @returns < 0 if the first string less than the second string.
977 * @returns 0 if the first string identical to the second string.
978 * @returns > 0 if the first string greater than the second string.
979 * @param psz1 First UTF-8 string. Null is allowed.
980 * @param psz2 Second UTF-8 string. Null is allowed.
981 */
982RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
983{
984 if (psz1 == psz2)
985 return 0;
986 if (!psz1)
987 return -1;
988 if (!psz2)
989 return 1;
990
991 return strcmp(psz1, psz2);
992}
993
994
995/**
996 * Performs a case insensitive string compare between two UTF-8 strings.
997 *
998 * This is a simplified compare, as only the simplified lower/upper case folding
999 * specified by the unicode specs are used. It does not consider character pairs
1000 * as they are used in some languages, just simple upper & lower case compares.
1001 *
1002 * The result is the difference between the mismatching codepoints after they
1003 * both have been lower cased.
1004 *
1005 * If the string encoding is invalid the function will assert (strict builds)
1006 * and use RTStrCmp for the remainder of the string.
1007 *
1008 * @returns < 0 if the first string less than the second string.
1009 * @returns 0 if the first string identical to the second string.
1010 * @returns > 0 if the first string greater than the second string.
1011 * @param psz1 First UTF-8 string. Null is allowed.
1012 * @param psz2 Second UTF-8 string. Null is allowed.
1013 */
1014RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
1015{
1016 if (psz1 == psz2)
1017 return 0;
1018 if (!psz1)
1019 return -1;
1020 if (!psz2)
1021 return 1;
1022
1023#if 1 /* new */
1024 const char *pszStart1 = psz1;
1025 for (;;)
1026 {
1027 /* Get the codepoints */
1028 RTUNICP cp1;
1029 int rc = RTStrGetCpEx(&psz1, &cp1);
1030 if (RT_FAILURE(rc))
1031 {
1032 AssertRC(rc);
1033 psz1--;
1034 break;
1035 }
1036
1037 RTUNICP cp2;
1038 rc = RTStrGetCpEx(&psz2, &cp2);
1039 if (RT_FAILURE(rc))
1040 {
1041 AssertRC(rc);
1042 psz2--;
1043 psz1 = RTStrPrevCp(pszStart1, psz1);
1044 break;
1045 }
1046
1047 /* compare */
1048 int iDiff = cp1 - cp2;
1049 if (iDiff)
1050 {
1051 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1052 if (iDiff)
1053 {
1054 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1055 if (iDiff)
1056 return iDiff;
1057 }
1058 }
1059
1060 /* hit the terminator? */
1061 if (!cp1)
1062 return 0;
1063 }
1064
1065 /* Hit some bad encoding, continue in case insensitive mode. */
1066 return RTStrCmp(psz1, psz2);
1067#else /* old */
1068#ifdef RT_OS_WINDOWS
1069 return stricmp(psz1, psz2);
1070#else /* !RT_OS_WINDOWS */
1071 return strcasecmp(psz1, psz2);
1072#endif /* !RT_OS_WINDOWS */
1073#endif
1074}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette