utf-8.cpp@ 8155

Last change on this file since 8155 was 8155, checked in by vboxsync, 17 years ago
The Big Sun Rebranding Header Change
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 38.5 KB

Line
1	/* $Id: utf-8.cpp 8155 2008-04-18 15:16:47Z vboxsync $ */
2	/** @file
3	* innotek Portable Runtime - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2007 Sun Microsystems, Inc.
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*
26	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27	* Clara, CA 95054 USA or visit http://www.sun.com if you need
28	* additional information or have any questions.
29	*/
30
31
32	/*******************************************************************************
33	* Header Files *
34	*******************************************************************************/
35	#include <iprt/string.h>
36	#include <iprt/uni.h>
37	#include <iprt/alloc.h>
38	#include <iprt/assert.h>
39	#include <iprt/err.h>
40	#include "internal/string.h"
41
42
43
44	/**
45	* Get get length in code points of a UTF-8 encoded string.
46	* The string is validated while doing this.
47	*
48	* @returns IPRT status code.
49	* @param psz Pointer to the UTF-8 string.
50	* @param cch The max length of the string. (btw cch = cb)
51	* Use RTSTR_MAX if all of the string is to be examined.s
52	* @param pcuc Where to store the length in unicode code points.
53	*/
54	static int rtUtf8Length(const char psz, size_t cch, size_t pcuc)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	unsigned cb;
67	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
68	cb = 2;
69	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
70	cb = 3;
71	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
72	cb = 4;
73	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
74	cb = 5;
75	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
76	cb = 6;
77	else
78	{
79	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80	return VERR_INVALID_UTF8_ENCODING;
81	}
82
83	/* check length */
84	if (cb > cch)
85	{
86	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87	return VERR_INVALID_UTF8_ENCODING;
88	}
89
90	/* validate the rest */
91	switch (cb)
92	{
93	case 6:
94	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95	case 5:
96	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97	case 4:
98	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99	case 3:
100	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101	case 2:
102	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103	break;
104	}
105
106	/* validate the code point. */
107	RTUNICP uc;
108	switch (cb)
109	{
110	case 6:
111	uc = (puch[5] & 0x3f)
112	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
113	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
114	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
115	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
116	\| ((RTUNICP)(uch & 0x01) << 30);
117	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119	break;
120	case 5:
121	uc = (puch[4] & 0x3f)
122	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
123	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
124	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
125	\| ((RTUNICP)(uch & 0x03) << 24);
126	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128	break;
129	case 4:
130	uc = (puch[3] & 0x3f)
131	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
132	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
133	\| ((RTUNICP)(uch & 0x07) << 18);
134	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136	break;
137	case 3:
138	uc = (puch[2] & 0x3f)
139	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
140	\| ((RTUNICP)(uch & 0x0f) << 12);
141	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
145	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146	break;
147	case 2:
148	uc = (puch[1] & 0x3f)
149	\| ((RTUNICP)(uch & 0x1f) << 6);
150	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152	break;
153	}
154
155	/* advance */
156	cch -= cb;
157	puch += cb;
158	}
159	else
160	{
161	/* one ASCII byte */
162	puch++;
163	cch--;
164	}
165	cCodePoints++;
166	}
167
168	/* done */
169	*pcuc = cCodePoints;
170	return VINF_SUCCESS;
171	}
172
173
174	/**
175	* Decodes and UTF-8 string into an array of unicode code point.
176	*
177	* Since we know the input is valid, we do not perform encoding or length checks.
178	*
179	* @returns iprt status code.
180	* @param psz The UTF-8 string to recode. This is a valid encoding.
181	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
182	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
183	* @param paCps Where to store the code points array.
184	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
185	* @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
186	*/
187	static int rtUtf8Decode(const char psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t pcCps)
188	{
189	int rc = VINF_SUCCESS;
190	const unsigned char puch = (const unsigned char )psz;
191	const PRTUNICP pCpEnd = paCps + cCps;
192	PRTUNICP pCp = paCps;
193	Assert(pCpEnd >= pCp);
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (!uch)
199	break;
200
201	/* check for output overflow */
202	if (pCp >= pCpEnd)
203	{
204	rc = VERR_BUFFER_OVERFLOW;
205	break;
206	}
207
208	/* decode and recode the code point */
209	if (!(uch & RT_BIT(7)))
210	{
211	*pCp++ = uch;
212	puch++;
213	cch--;
214	}
215	#ifdef RT_STRICT
216	else if (!(uch & RT_BIT(6)))
217	AssertMsgFailed(("Internal error!\n"));
218	#endif
219	else if (!(uch & RT_BIT(5)))
220	{
221	*pCp++ = (puch[1] & 0x3f)
222	\| ((uint16_t)(uch & 0x1f) << 6);
223	puch += 2;
224	cch -= 2;
225	}
226	else if (!(uch & RT_BIT(4)))
227	{
228	*pCp++ = (puch[2] & 0x3f)
229	\| ((uint16_t)(puch[1] & 0x3f) << 6)
230	\| ((uint16_t)(uch & 0x0f) << 12);
231	puch += 3;
232	cch -= 3;
233	}
234	else if (!(uch & RT_BIT(3)))
235	{
236	*pCp++ = (puch[3] & 0x3f)
237	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
238	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
239	\| ((RTUNICP)(uch & 0x07) << 18);
240	puch += 4;
241	cch -= 4;
242	}
243	else if (!(uch & RT_BIT(2)))
244	{
245	*pCp++ = (puch[4] & 0x3f)
246	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
247	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
248	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
249	\| ((RTUNICP)(uch & 0x03) << 24);
250	puch += 5;
251	cch -= 6;
252	}
253	else
254	{
255	Assert(!(uch & RT_BIT(1)));
256	*pCp++ = (puch[5] & 0x3f)
257	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
258	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
259	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
260	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
261	\| ((RTUNICP)(uch & 0x01) << 30);
262	puch += 6;
263	cch -= 6;
264	}
265	}
266
267	/* done */
268	*pCp = 0;
269	*pcCps = pCp - paCps;
270	return rc;
271	}
272
273
274	RTDECL(size_t) RTStrUniLen(const char *psz)
275	{
276	size_t cCodePoints;
277	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
278	return RT_SUCCESS(rc) ? cCodePoints : 0;
279	}
280
281
282	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
283	{
284	size_t cCodePoints;
285	int rc = rtUtf8Length(psz, cch, &cCodePoints);
286	if (pcCps)
287	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288	return rc;
289	}
290
291
292	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
293	{
294	/*
295	* Validate input.
296	*/
297	Assert(VALID_PTR(pszString));
298	Assert(VALID_PTR(ppaCps));
299	*ppaCps = NULL;
300
301	/*
302	* Validate the UTF-8 input and count its code points.
303	*/
304	size_t cCps;
305	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
306	if (RT_SUCCESS(rc))
307	{
308	/*
309	* Allocate buffer.
310	*/
311	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
312	if (paCps)
313	{
314	/*
315	* Decode the string.
316	*/
317	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
318	if (RT_SUCCESS(rc))
319	{
320	*ppaCps = paCps;
321	return rc;
322	}
323	RTMemFree(paCps);
324	}
325	else
326	rc = VERR_NO_CODE_POINT_MEMORY;
327	}
328	return rc;
329	}
330
331
332	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
333	{
334	/*
335	* Validate input.
336	*/
337	Assert(VALID_PTR(pszString));
338	Assert(VALID_PTR(ppaCps));
339	Assert(!pcCps \|\| VALID_PTR(pcCps));
340
341	/*
342	* Validate the UTF-8 input and count the code points.
343	*/
344	size_t cCpsResult;
345	int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
346	if (RT_SUCCESS(rc))
347	{
348	if (pcCps)
349	*pcCps = cCpsResult;
350
351	/*
352	* Check buffer size / Allocate buffer.
353	*/
354	bool fShouldFree;
355	PRTUNICP paCpsResult;
356	if (cCps > 0 && *ppaCps)
357	{
358	fShouldFree = false;
359	if (cCps <= cCpsResult)
360	return VERR_BUFFER_OVERFLOW;
361	paCpsResult = *ppaCps;
362	}
363	else
364	{
365	*ppaCps = NULL;
366	fShouldFree = true;
367	cCps = RT_MAX(cCpsResult + 1, cCps);
368	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
369	}
370	if (paCpsResult)
371	{
372	/*
373	* Encode the UTF-16 string.
374	*/
375	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
376	if (RT_SUCCESS(rc))
377	{
378	*ppaCps = paCpsResult;
379	return rc;
380	}
381	if (fShouldFree)
382	RTMemFree(paCpsResult);
383	}
384	else
385	rc = VERR_NO_CODE_POINT_MEMORY;
386	}
387	return rc;
388	}
389
390
391	/**
392	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
393	*
394	* @returns IPRT status code.
395	* @param psz Pointer to the UTF-8 string.
396	* @param cch The max length of the string. (btw cch = cb)
397	* Use RTSTR_MAX if all of the string is to be examined.s
398	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
399	*/
400	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
401	{
402	const unsigned char puch = (const unsigned char )psz;
403	size_t cwc = 0;
404	while (cch > 0)
405	{
406	const unsigned char uch = *puch;
407	if (!uch)
408	break;
409	if (!(uch & RT_BIT(7)))
410	{
411	/* one ASCII byte */
412	cwc++;
413	puch++;
414	cch--;
415	}
416	else
417	{
418	/* figure sequence length and validate the first byte */
419	unsigned cb;
420	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
421	cb = 2;
422	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
423	cb = 3;
424	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
425	cb = 4;
426	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
427	cb = 5;
428	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
429	cb = 6;
430	else
431	{
432	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
433	return VERR_INVALID_UTF8_ENCODING;
434	}
435
436	/* check length */
437	if (cb > cch)
438	{
439	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
440	return VERR_INVALID_UTF8_ENCODING;
441	}
442
443	/* validate the rest */
444	switch (cb)
445	{
446	case 6:
447	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
448	case 5:
449	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
450	case 4:
451	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
452	case 3:
453	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
454	case 2:
455	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
456	break;
457	}
458
459	/* validate the code point. */
460	RTUNICP uc;
461	switch (cb)
462	{
463	case 6:
464	uc = (puch[5] & 0x3f)
465	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
466	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
467	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
468	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
469	\| ((RTUNICP)(uch & 0x01) << 30);
470	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
471	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
472	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
473	return VERR_CANT_RECODE_AS_UTF16;
474	case 5:
475	uc = (puch[4] & 0x3f)
476	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
477	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
478	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
479	\| ((RTUNICP)(uch & 0x03) << 24);
480	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
481	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
482	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
483	return VERR_CANT_RECODE_AS_UTF16;
484	case 4:
485	uc = (puch[3] & 0x3f)
486	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
487	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
488	\| ((RTUNICP)(uch & 0x07) << 18);
489	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
490	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
491	RTStrAssertMsgReturn(uc <= 0x0010ffff,
492	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
493	cwc++;
494	break;
495	case 3:
496	uc = (puch[2] & 0x3f)
497	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
498	\| ((RTUNICP)(uch & 0x0f) << 12);
499	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
500	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
501	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
502	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
503	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
504	break;
505	case 2:
506	uc = (puch[1] & 0x3f)
507	\| ((RTUNICP)(uch & 0x1f) << 6);
508	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
509	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
510	break;
511	}
512
513	/* advance */
514	cch -= cb;
515	puch += cb;
516	cwc++;
517	}
518	}
519
520	/* done */
521	*pcwc = cwc;
522	return VINF_SUCCESS;
523	}
524
525
526	/**
527	* Recodes a valid UTF-8 string as UTF-16.
528	*
529	* Since we know the input is valid, we do not perform encoding or length checks.
530	*
531	* @returns iprt status code.
532	* @param psz The UTF-8 string to recode. This is a valid encoding.
533	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
534	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
535	* @param pwsz Where to store the UTF-16 string.
536	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
537	* @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
538	*/
539	static int rtUtf8RecodeAsUtf16(const char psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t pcwc)
540	{
541	int rc = VINF_SUCCESS;
542	const unsigned char puch = (const unsigned char )psz;
543	const PRTUTF16 pwszEnd = pwsz + cwc;
544	PRTUTF16 pwc = pwsz;
545	Assert(pwszEnd >= pwc);
546	while (cch > 0)
547	{
548	/* read the next char and check for terminator. */
549	const unsigned char uch = *puch;
550	if (!uch)
551	break;
552
553	/* check for output overflow */
554	if (pwc >= pwszEnd)
555	{
556	rc = VERR_BUFFER_OVERFLOW;
557	break;
558	}
559
560	/* decode and recode the code point */
561	if (!(uch & RT_BIT(7)))
562	{
563	*pwc++ = uch;
564	puch++;
565	cch--;
566	}
567	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
568	{
569	uint16_t uc = (puch[1] & 0x3f)
570	\| ((uint16_t)(uch & 0x1f) << 6);
571	*pwc++ = uc;
572	puch += 2;
573	cch -= 2;
574	}
575	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
576	{
577	uint16_t uc = (puch[2] & 0x3f)
578	\| ((uint16_t)(puch[1] & 0x3f) << 6)
579	\| ((uint16_t)(uch & 0x0f) << 12);
580	*pwc++ = uc;
581	puch += 3;
582	cch -= 3;
583	}
584	else
585	{
586	/* generate surrugate pair */
587	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
588	RTUNICP uc = (puch[3] & 0x3f)
589	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
590	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
591	\| ((RTUNICP)(uch & 0x07) << 18);
592	if (pwc + 1 >= pwszEnd)
593	{
594	rc = VERR_BUFFER_OVERFLOW;
595	break;
596	}
597	uc -= 0x10000;
598	*pwc++ = 0xd800 \| (uc >> 10);
599	*pwc++ = 0xdc00 \| (uc & 0x3ff);
600	puch += 4;
601	cch -= 4;
602	}
603	}
604
605	/* done */
606	*pwc = '\0';
607	*pcwc = pwc - pwsz;
608	return rc;
609	}
610
611
612	RTDECL(int) RTStrToUtf16(const char pszString, PRTUTF16 ppwszString)
613	{
614	/*
615	* Validate input.
616	*/
617	Assert(VALID_PTR(ppwszString));
618	Assert(VALID_PTR(pszString));
619	*ppwszString = NULL;
620
621	/*
622	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
623	*/
624	size_t cwc;
625	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
626	if (RT_SUCCESS(rc))
627	{
628	/*
629	* Allocate buffer.
630	*/
631	PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
632	if (pwsz)
633	{
634	/*
635	* Encode the UTF-16 string.
636	*/
637	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
638	if (RT_SUCCESS(rc))
639	{
640	*ppwszString = pwsz;
641	return rc;
642	}
643	RTMemFree(pwsz);
644	}
645	else
646	rc = VERR_NO_UTF16_MEMORY;
647	}
648	return rc;
649	}
650
651
652	RTDECL(int) RTStrToUtf16Ex(const char pszString, size_t cchString, PRTUTF16 ppwsz, size_t cwc, size_t *pcwc)
653	{
654	/*
655	* Validate input.
656	*/
657	Assert(VALID_PTR(pszString));
658	Assert(VALID_PTR(ppwsz));
659	Assert(!pcwc \|\| VALID_PTR(pcwc));
660
661	/*
662	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
663	*/
664	size_t cwcResult;
665	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
666	if (RT_SUCCESS(rc))
667	{
668	if (pcwc)
669	*pcwc = cwcResult;
670
671	/*
672	* Check buffer size / Allocate buffer.
673	*/
674	bool fShouldFree;
675	PRTUTF16 pwszResult;
676	if (cwc > 0 && *ppwsz)
677	{
678	fShouldFree = false;
679	if (cwc <= cwcResult)
680	return VERR_BUFFER_OVERFLOW;
681	pwszResult = *ppwsz;
682	}
683	else
684	{
685	*ppwsz = NULL;
686	fShouldFree = true;
687	cwc = RT_MAX(cwcResult + 1, cwc);
688	pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
689	}
690	if (pwszResult)
691	{
692	/*
693	* Encode the UTF-16 string.
694	*/
695	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
696	if (RT_SUCCESS(rc))
697	{
698	*ppwsz = pwszResult;
699	return rc;
700	}
701	if (fShouldFree)
702	RTMemFree(pwszResult);
703	}
704	else
705	rc = VERR_NO_UTF16_MEMORY;
706	}
707	return rc;
708	}
709
710
711	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
712	{
713	size_t cwc;
714	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
715	return RT_SUCCESS(rc) ? cwc : 0;
716	}
717
718
719	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
720	{
721	size_t cwc;
722	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
723	if (pcwc)
724	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
725	return rc;
726	}
727
728
729	/**
730	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
731	* @returns rc
732	* @param ppsz The pointer to the the string position point.
733	* @param pCp Where to store RTUNICP_INVALID.
734	* @param rc The iprt error code.
735	*/
736	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
737	{
738	/*
739	* Try find a valid encoding.
740	*/
741	(ppsz)++; /* @todo code this! */
742	*pCp = RTUNICP_INVALID;
743	return rc;
744	}
745
746
747	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
748	{
749	RTUNICP Cp;
750	RTStrGetCpExInternal(&psz, &Cp);
751	return Cp;
752	}
753
754
755	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
756	{
757	const unsigned char puch = (const unsigned char )*ppsz;
758	const unsigned char uch = *puch;
759	RTUNICP uc;
760
761	/* ASCII ? */
762	if (!(uch & RT_BIT(7)))
763	{
764	uc = uch;
765	puch++;
766	}
767	else if (uch & RT_BIT(6))
768	{
769	/* figure the length and validate the first octet. */
770	unsigned cb;
771	if (!(uch & RT_BIT(5)))
772	cb = 2;
773	else if (!(uch & RT_BIT(4)))
774	cb = 3;
775	else if (!(uch & RT_BIT(3)))
776	cb = 4;
777	else if (!(uch & RT_BIT(2)))
778	cb = 5;
779	else if (!(uch & RT_BIT(1)))
780	cb = 6;
781	else
782	{
783	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
784	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
785	}
786
787	/* validate the rest */
788	switch (cb)
789	{
790	case 6:
791	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
792	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
793	case 5:
794	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
795	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
796	case 4:
797	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
798	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
799	case 3:
800	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
801	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
802	case 2:
803	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
804	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
805	break;
806	}
807
808	/* get and validate the code point. */
809	switch (cb)
810	{
811	case 6:
812	uc = (puch[5] & 0x3f)
813	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
814	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
815	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
816	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
817	\| ((RTUNICP)(uch & 0x01) << 30);
818	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
819	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
820	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
821	break;
822	case 5:
823	uc = (puch[4] & 0x3f)
824	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
825	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
826	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
827	\| ((RTUNICP)(uch & 0x03) << 24);
828	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
829	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
830	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
831	break;
832	case 4:
833	uc = (puch[3] & 0x3f)
834	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
835	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
836	\| ((RTUNICP)(uch & 0x07) << 18);
837	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
838	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
839	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
840	break;
841	case 3:
842	uc = (puch[2] & 0x3f)
843	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
844	\| ((RTUNICP)(uch & 0x0f) << 12);
845	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
846	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
847	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
848	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
849	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
850	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
851	break;
852	case 2:
853	uc = (puch[1] & 0x3f)
854	\| ((RTUNICP)(uch & 0x1f) << 6);
855	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
856	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
857	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
858	break;
859	default: /* impossible, but GCC is bitching. */
860	uc = RTUNICP_INVALID;
861	break;
862	}
863	puch += cb;
864	}
865	else
866	{
867	/* 6th bit is always set. */
868	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
869	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
870	}
871	*pCp = uc;
872	ppsz = (const char )puch;
873	return VINF_SUCCESS;
874	}
875
876
877	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
878	{
879	unsigned char puch = (unsigned char )psz;
880	if (uc < 0x80)
881	*puch++ = (unsigned char )uc;
882	else if (uc < 0x00000800)
883	{
884	*puch++ = 0xc0 \| (uc >> 6);
885	*puch++ = 0x80 \| (uc & 0x3f);
886	}
887	else if (uc < 0x00010000)
888	{
889	if ( uc < 0x0000d8000
890	\|\| ( uc > 0x0000dfff
891	&& uc < 0x0000fffe))
892	{
893	*puch++ = 0xe0 \| (uc >> 12);
894	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
895	*puch++ = 0x80 \| (uc & 0x3f);
896	}
897	else
898	{
899	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
900	*puch++ = 0x7f;
901	}
902	}
903	else if (uc < 0x00200000)
904	{
905	*puch++ = 0xf0 \| (uc >> 18);
906	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
907	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
908	*puch++ = 0x80 \| (uc & 0x3f);
909	}
910	else if (uc < 0x04000000)
911	{
912	*puch++ = 0xf1 \| (uc >> 24);
913	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
914	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
915	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
916	*puch++ = 0x80 \| (uc & 0x3f);
917	}
918	else if (uc <= 0x7fffffff)
919	{
920	*puch++ = 0xf3 \| (uc >> 30);
921	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
922	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
923	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
924	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
925	*puch++ = 0x80 \| (uc & 0x3f);
926	}
927	else
928	{
929	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
930	*puch++ = 0x7f;
931	}
932
933	return (char *)puch;
934	}
935
936
937	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
938	{
939	if (pszStart < psz)
940	{
941	/* simple char? */
942	const unsigned char puch = (const unsigned char )psz;
943	unsigned uch = *--puch;
944	if (!(uch & RT_BIT(7)))
945	return (char *)puch;
946	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
947
948	/* two or more. */
949	uint32_t uMask = 0xffffffc0;
950	while ( (const unsigned char *)pszStart < puch
951	&& !(uMask & 1))
952	{
953	unsigned uch = *--puch;
954	if ((uch & 0xc0) != 0x80)
955	{
956	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
957	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
958	(char *)pszStart);
959	return (char *)puch;
960	}
961	uMask >>= 1;
962	}
963	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
964	}
965	return (char *)pszStart;
966	}
967
968
969	/**
970	* Performs a case sensitive string compare between two UTF-8 strings.
971	*
972	* Encoding errors are ignored by the current implementation. So, the only
973	* difference between this and the CRT strcmp function is the handling of
974	* NULL arguments.
975	*
976	* @returns < 0 if the first string less than the second string.
977	* @returns 0 if the first string identical to the second string.
978	* @returns > 0 if the first string greater than the second string.
979	* @param psz1 First UTF-8 string. Null is allowed.
980	* @param psz2 Second UTF-8 string. Null is allowed.
981	*/
982	RTDECL(int) RTStrCmp(const char psz1, const char psz2)
983	{
984	if (psz1 == psz2)
985	return 0;
986	if (!psz1)
987	return -1;
988	if (!psz2)
989	return 1;
990
991	return strcmp(psz1, psz2);
992	}
993
994
995	/**
996	* Performs a case insensitive string compare between two UTF-8 strings.
997	*
998	* This is a simplified compare, as only the simplified lower/upper case folding
999	* specified by the unicode specs are used. It does not consider character pairs
1000	* as they are used in some languages, just simple upper & lower case compares.
1001	*
1002	* The result is the difference between the mismatching codepoints after they
1003	* both have been lower cased.
1004	*
1005	* If the string encoding is invalid the function will assert (strict builds)
1006	* and use RTStrCmp for the remainder of the string.
1007	*
1008	* @returns < 0 if the first string less than the second string.
1009	* @returns 0 if the first string identical to the second string.
1010	* @returns > 0 if the first string greater than the second string.
1011	* @param psz1 First UTF-8 string. Null is allowed.
1012	* @param psz2 Second UTF-8 string. Null is allowed.
1013	*/
1014	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
1015	{
1016	if (psz1 == psz2)
1017	return 0;
1018	if (!psz1)
1019	return -1;
1020	if (!psz2)
1021	return 1;
1022
1023	#if 1 /* new */
1024	const char *pszStart1 = psz1;
1025	for (;;)
1026	{
1027	/* Get the codepoints */
1028	RTUNICP cp1;
1029	int rc = RTStrGetCpEx(&psz1, &cp1);
1030	if (RT_FAILURE(rc))
1031	{
1032	AssertRC(rc);
1033	psz1--;
1034	break;
1035	}
1036
1037	RTUNICP cp2;
1038	rc = RTStrGetCpEx(&psz2, &cp2);
1039	if (RT_FAILURE(rc))
1040	{
1041	AssertRC(rc);
1042	psz2--;
1043	psz1 = RTStrPrevCp(pszStart1, psz1);
1044	break;
1045	}
1046
1047	/* compare */
1048	int iDiff = cp1 - cp2;
1049	if (iDiff)
1050	{
1051	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1052	if (iDiff)
1053	{
1054	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1055	if (iDiff)
1056	return iDiff;
1057	}
1058	}
1059
1060	/* hit the terminator? */
1061	if (!cp1)
1062	return 0;
1063	}
1064
1065	/* Hit some bad encoding, continue in case insensitive mode. */
1066	return RTStrCmp(psz1, psz2);
1067	#else /* old */
1068	#ifdef RT_OS_WINDOWS
1069	return stricmp(psz1, psz2);
1070	#else /* !RT_OS_WINDOWS */
1071	return strcasecmp(psz1, psz2);
1072	#endif /* !RT_OS_WINDOWS */
1073	#endif
1074	}

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 8155

Download in other formats: