utf-8.cpp@ 4968

Last change on this file since 4968 was 4071, checked in by vboxsync, 17 years ago
Biggest check-in ever. New source code headers for all (C) innotek files.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 35.3 KB

Line
1	/* $Id: utf-8.cpp 4071 2007-08-07 17:07:59Z vboxsync $ */
2	/** @file
3	* innotek Portable Runtime - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2007 innotek GmbH
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License as published by the Free Software Foundation,
13	* in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
14	* distribution. VirtualBox OSE is distributed in the hope that it will
15	* be useful, but WITHOUT ANY WARRANTY of any kind.
16	*/
17
18
19	/*******************************************************************************
20	* Header Files *
21	*******************************************************************************/
22	#include <iprt/string.h>
23	#include <iprt/uni.h>
24	#include <iprt/alloc.h>
25	#include <iprt/assert.h>
26	#include <iprt/err.h>
27	#include "internal/string.h"
28
29
30
31	/**
32	* Get get length in code points of a UTF-8 encoded string.
33	* The string is validated while doing this.
34	*
35	* @returns IPRT status code.
36	* @param psz Pointer to the UTF-8 string.
37	* @param cch The max length of the string. (btw cch = cb)
38	* Use RTSTR_MAX if all of the string is to be examined.s
39	* @param pcuc Where to store the length in unicode code points.
40	*/
41	static int rtUtf8Length(const char psz, size_t cch, size_t pcuc)
42	{
43	const unsigned char puch = (const unsigned char )psz;
44	size_t cCodePoints = 0;
45	while (cch > 0)
46	{
47	const unsigned char uch = *puch;
48	if (!uch)
49	break;
50	if (uch & BIT(7))
51	{
52	/* figure sequence length and validate the first byte */
53	unsigned cb;
54	if ((uch & (BIT(7) \| BIT(6) \| BIT(5))) == (BIT(7) \| BIT(6)))
55	cb = 2;
56	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4))) == (BIT(7) \| BIT(6) \| BIT(5)))
57	cb = 3;
58	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4)))
59	cb = 4;
60	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3) \| BIT(2))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3)))
61	cb = 5;
62	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3) \| BIT(2) \| BIT(1))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3) \| BIT(2)))
63	cb = 6;
64	else
65	{
66	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
67	return VERR_INVALID_UTF8_ENCODING;
68	}
69
70	/* check length */
71	if (cb > cch)
72	{
73	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
74	return VERR_INVALID_UTF8_ENCODING;
75	}
76
77	/* validate the rest */
78	switch (cb)
79	{
80	case 6:
81	RTStrAssertMsgReturn((puch[5] & (BIT(7) \| BIT(6))) == BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
82	case 5:
83	RTStrAssertMsgReturn((puch[4] & (BIT(7) \| BIT(6))) == BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
84	case 4:
85	RTStrAssertMsgReturn((puch[3] & (BIT(7) \| BIT(6))) == BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
86	case 3:
87	RTStrAssertMsgReturn((puch[2] & (BIT(7) \| BIT(6))) == BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
88	case 2:
89	RTStrAssertMsgReturn((puch[1] & (BIT(7) \| BIT(6))) == BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
90	break;
91	}
92
93	/* validate the code point. */
94	RTUNICP uc;
95	switch (cb)
96	{
97	case 6:
98	uc = (puch[5] & 0x3f)
99	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
100	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
101	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
102	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
103	\| ((RTUNICP)(uch & 0x01) << 30);
104	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
105	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
106	break;
107	case 5:
108	uc = (puch[4] & 0x3f)
109	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
110	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
111	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
112	\| ((RTUNICP)(uch & 0x03) << 24);
113	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
114	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
115	break;
116	case 4:
117	uc = (puch[3] & 0x3f)
118	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
119	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
120	\| ((RTUNICP)(uch & 0x07) << 18);
121	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
122	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
123	break;
124	case 3:
125	uc = (puch[2] & 0x3f)
126	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
127	\| ((RTUNICP)(uch & 0x0f) << 12);
128	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
129	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
130	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
131	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
132	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
133	break;
134	case 2:
135	uc = (puch[1] & 0x3f)
136	\| ((RTUNICP)(uch & 0x1f) << 6);
137	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
138	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
139	break;
140	}
141
142	/* advance */
143	cch -= cb;
144	puch += cb;
145	}
146	else
147	{
148	/* one ASCII byte */
149	puch++;
150	cch--;
151	}
152	cCodePoints++;
153	}
154
155	/* done */
156	*pcuc = cCodePoints;
157	return VINF_SUCCESS;
158	}
159
160
161	/**
162	* Decodes and UTF-8 string into an array of unicode code point.
163	*
164	* Since we know the input is valid, we do not perform encoding or length checks.
165	*
166	* @returns iprt status code.
167	* @param psz The UTF-8 string to recode. This is a valid encoding.
168	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
169	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
170	* @param paCps Where to store the code points array.
171	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
172	* @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
173	*/
174	static int rtUtf8Decode(const char psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t pcCps)
175	{
176	int rc = VINF_SUCCESS;
177	const unsigned char puch = (const unsigned char )psz;
178	const PRTUNICP pCpEnd = paCps + cCps;
179	PRTUNICP pCp = paCps;
180	Assert(pCpEnd >= pCp);
181	while (cch > 0)
182	{
183	/* read the next char and check for terminator. */
184	const unsigned char uch = *puch;
185	if (!uch)
186	break;
187
188	/* check for output overflow */
189	if (pCp >= pCpEnd)
190	{
191	rc = VERR_BUFFER_OVERFLOW;
192	break;
193	}
194
195	/* decode and recode the code point */
196	if (!(uch & BIT(7)))
197	{
198	*pCp++ = uch;
199	puch++;
200	cch--;
201	}
202	#ifdef RT_STRICT
203	else if (!(uch & BIT(6)))
204	AssertMsgFailed(("Internal error!\n"));
205	#endif
206	else if (!(uch & BIT(5)))
207	{
208	*pCp++ = (puch[1] & 0x3f)
209	\| ((uint16_t)(uch & 0x1f) << 6);
210	puch += 2;
211	cch -= 2;
212	}
213	else if (!(uch & BIT(4)))
214	{
215	*pCp++ = (puch[2] & 0x3f)
216	\| ((uint16_t)(puch[1] & 0x3f) << 6)
217	\| ((uint16_t)(uch & 0x0f) << 12);
218	puch += 3;
219	cch -= 3;
220	}
221	else if (!(uch & BIT(3)))
222	{
223	*pCp++ = (puch[3] & 0x3f)
224	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
225	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
226	\| ((RTUNICP)(uch & 0x07) << 18);
227	puch += 4;
228	cch -= 4;
229	}
230	else if (!(uch & BIT(2)))
231	{
232	*pCp++ = (puch[4] & 0x3f)
233	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
234	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
235	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
236	\| ((RTUNICP)(uch & 0x03) << 24);
237	puch += 5;
238	cch -= 6;
239	}
240	else
241	{
242	Assert(!(uch & BIT(1)));
243	*pCp++ = (puch[5] & 0x3f)
244	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
245	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
246	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
247	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
248	\| ((RTUNICP)(uch & 0x01) << 30);
249	puch += 6;
250	cch -= 6;
251	}
252	}
253
254	/* done */
255	*pCp = 0;
256	*pcCps = pCp - paCps;
257	return rc;
258	}
259
260
261	RTDECL(size_t) RTStrUniLen(const char *psz)
262	{
263	size_t cCodePoints;
264	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
265	return RT_SUCCESS(rc) ? cCodePoints : 0;
266	}
267
268
269	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
270	{
271	size_t cCodePoints;
272	int rc = rtUtf8Length(psz, cch, &cCodePoints);
273	if (pcCps)
274	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
275	return rc;
276	}
277
278
279	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
280	{
281	/*
282	* Validate input.
283	*/
284	Assert(VALID_PTR(pszString));
285	Assert(VALID_PTR(ppaCps));
286	*ppaCps = NULL;
287
288	/*
289	* Validate the UTF-8 input and count its code points.
290	*/
291	size_t cCps;
292	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
293	if (RT_SUCCESS(rc))
294	{
295	/*
296	* Allocate buffer.
297	*/
298	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
299	if (paCps)
300	{
301	/*
302	* Decode the string.
303	*/
304	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
305	if (RT_SUCCESS(rc))
306	{
307	*ppaCps = paCps;
308	return rc;
309	}
310	RTMemFree(paCps);
311	}
312	else
313	rc = VERR_NO_CODE_POINT_MEMORY;
314	}
315	return rc;
316	}
317
318
319	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
320	{
321	/*
322	* Validate input.
323	*/
324	Assert(VALID_PTR(pszString));
325	Assert(VALID_PTR(ppaCps));
326	Assert(!pcCps \|\| VALID_PTR(pcCps));
327
328	/*
329	* Validate the UTF-8 input and count the code points.
330	*/
331	size_t cCpsResult;
332	int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
333	if (RT_SUCCESS(rc))
334	{
335	if (pcCps)
336	*pcCps = cCpsResult;
337
338	/*
339	* Check buffer size / Allocate buffer.
340	*/
341	bool fShouldFree;
342	PRTUNICP paCpsResult;
343	if (cCps > 0 && *ppaCps)
344	{
345	fShouldFree = false;
346	if (cCps <= cCpsResult)
347	return VERR_BUFFER_OVERFLOW;
348	paCpsResult = *ppaCps;
349	}
350	else
351	{
352	*ppaCps = NULL;
353	fShouldFree = true;
354	cCps = RT_MAX(cCpsResult + 1, cCps);
355	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
356	}
357	if (paCpsResult)
358	{
359	/*
360	* Encode the UTF-16 string.
361	*/
362	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
363	if (RT_SUCCESS(rc))
364	{
365	*ppaCps = paCpsResult;
366	return rc;
367	}
368	if (fShouldFree)
369	RTMemFree(paCpsResult);
370	}
371	else
372	rc = VERR_NO_CODE_POINT_MEMORY;
373	}
374	return rc;
375	}
376
377
378	/**
379	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
380	*
381	* @returns IPRT status code.
382	* @param psz Pointer to the UTF-8 string.
383	* @param cch The max length of the string. (btw cch = cb)
384	* Use RTSTR_MAX if all of the string is to be examined.s
385	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
386	*/
387	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
388	{
389	const unsigned char puch = (const unsigned char )psz;
390	size_t cwc = 0;
391	while (cch > 0)
392	{
393	const unsigned char uch = *puch;
394	if (!uch)
395	break;
396	if (!(uch & BIT(7)))
397	{
398	/* one ASCII byte */
399	cwc++;
400	puch++;
401	cch--;
402	}
403	else
404	{
405	/* figure sequence length and validate the first byte */
406	unsigned cb;
407	if ((uch & (BIT(7) \| BIT(6) \| BIT(5))) == (BIT(7) \| BIT(6)))
408	cb = 2;
409	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4))) == (BIT(7) \| BIT(6) \| BIT(5)))
410	cb = 3;
411	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4)))
412	cb = 4;
413	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3) \| BIT(2))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3)))
414	cb = 5;
415	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3) \| BIT(2) \| BIT(1))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3) \| BIT(2)))
416	cb = 6;
417	else
418	{
419	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
420	return VERR_INVALID_UTF8_ENCODING;
421	}
422
423	/* check length */
424	if (cb > cch)
425	{
426	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
427	return VERR_INVALID_UTF8_ENCODING;
428	}
429
430	/* validate the rest */
431	switch (cb)
432	{
433	case 6:
434	RTStrAssertMsgReturn((puch[5] & (BIT(7) \| BIT(6))) == BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
435	case 5:
436	RTStrAssertMsgReturn((puch[4] & (BIT(7) \| BIT(6))) == BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
437	case 4:
438	RTStrAssertMsgReturn((puch[3] & (BIT(7) \| BIT(6))) == BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
439	case 3:
440	RTStrAssertMsgReturn((puch[2] & (BIT(7) \| BIT(6))) == BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
441	case 2:
442	RTStrAssertMsgReturn((puch[1] & (BIT(7) \| BIT(6))) == BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
443	break;
444	}
445
446	/* validate the code point. */
447	RTUNICP uc;
448	switch (cb)
449	{
450	case 6:
451	uc = (puch[5] & 0x3f)
452	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
453	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
454	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
455	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
456	\| ((RTUNICP)(uch & 0x01) << 30);
457	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
458	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
459	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
460	return VERR_CANT_RECODE_AS_UTF16;
461	case 5:
462	uc = (puch[4] & 0x3f)
463	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
464	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
465	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
466	\| ((RTUNICP)(uch & 0x03) << 24);
467	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
468	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
469	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
470	return VERR_CANT_RECODE_AS_UTF16;
471	case 4:
472	uc = (puch[3] & 0x3f)
473	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
474	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
475	\| ((RTUNICP)(uch & 0x07) << 18);
476	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
477	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
478	RTStrAssertMsgReturn(uc <= 0x0010ffff,
479	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
480	cwc++;
481	break;
482	case 3:
483	uc = (puch[2] & 0x3f)
484	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
485	\| ((RTUNICP)(uch & 0x0f) << 12);
486	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
487	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
488	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
489	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
490	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
491	break;
492	case 2:
493	uc = (puch[1] & 0x3f)
494	\| ((RTUNICP)(uch & 0x1f) << 6);
495	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
496	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497	break;
498	}
499
500	/* advance */
501	cch -= cb;
502	puch += cb;
503	cwc++;
504	}
505	}
506
507	/* done */
508	*pcwc = cwc;
509	return VINF_SUCCESS;
510	}
511
512
513	/**
514	* Recodes a valid UTF-8 string as UTF-16.
515	*
516	* Since we know the input is valid, we do not perform encoding or length checks.
517	*
518	* @returns iprt status code.
519	* @param psz The UTF-8 string to recode. This is a valid encoding.
520	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
521	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
522	* @param pwsz Where to store the UTF-16 string.
523	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
524	* @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
525	*/
526	static int rtUtf8RecodeAsUtf16(const char psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t pcwc)
527	{
528	int rc = VINF_SUCCESS;
529	const unsigned char puch = (const unsigned char )psz;
530	const PRTUTF16 pwszEnd = pwsz + cwc;
531	PRTUTF16 pwc = pwsz;
532	Assert(pwszEnd >= pwc);
533	while (cch > 0)
534	{
535	/* read the next char and check for terminator. */
536	const unsigned char uch = *puch;
537	if (!uch)
538	break;
539
540	/* check for output overflow */
541	if (pwc >= pwszEnd)
542	{
543	rc = VERR_BUFFER_OVERFLOW;
544	break;
545	}
546
547	/* decode and recode the code point */
548	if (!(uch & BIT(7)))
549	{
550	*pwc++ = uch;
551	puch++;
552	cch--;
553	}
554	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5))) == (BIT(7) \| BIT(6)))
555	{
556	uint16_t uc = (puch[1] & 0x3f)
557	\| ((uint16_t)(uch & 0x1f) << 6);
558	*pwc++ = uc;
559	puch += 2;
560	cch -= 2;
561	}
562	else if ((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4))) == (BIT(7) \| BIT(6) \| BIT(5)))
563	{
564	uint16_t uc = (puch[2] & 0x3f)
565	\| ((uint16_t)(puch[1] & 0x3f) << 6)
566	\| ((uint16_t)(uch & 0x0f) << 12);
567	*pwc++ = uc;
568	puch += 3;
569	cch -= 3;
570	}
571	else
572	{
573	/* generate surrugate pair */
574	Assert((uch & (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4) \| BIT(3))) == (BIT(7) \| BIT(6) \| BIT(5) \| BIT(4)));
575	RTUNICP uc = (puch[3] & 0x3f)
576	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
577	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
578	\| ((RTUNICP)(uch & 0x07) << 18);
579	if (pwc + 1 >= pwszEnd)
580	{
581	rc = VERR_BUFFER_OVERFLOW;
582	break;
583	}
584	uc -= 0x10000;
585	*pwc++ = 0xd800 \| (uc >> 10);
586	*pwc++ = 0xdc00 \| (uc & 0x3ff);
587	puch += 4;
588	cch -= 4;
589	}
590	}
591
592	/* done */
593	*pwc = '\0';
594	*pcwc = pwc - pwsz;
595	return rc;
596	}
597
598
599	RTDECL(int) RTStrToUtf16(const char pszString, PRTUTF16 ppwszString)
600	{
601	/*
602	* Validate input.
603	*/
604	Assert(VALID_PTR(ppwszString));
605	Assert(VALID_PTR(pszString));
606	*ppwszString = NULL;
607
608	/*
609	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
610	*/
611	size_t cwc;
612	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
613	if (RT_SUCCESS(rc))
614	{
615	/*
616	* Allocate buffer.
617	*/
618	PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
619	if (pwsz)
620	{
621	/*
622	* Encode the UTF-16 string.
623	*/
624	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
625	if (RT_SUCCESS(rc))
626	{
627	*ppwszString = pwsz;
628	return rc;
629	}
630	RTMemFree(pwsz);
631	}
632	else
633	rc = VERR_NO_UTF16_MEMORY;
634	}
635	return rc;
636	}
637
638
639	RTDECL(int) RTStrToUtf16Ex(const char pszString, size_t cchString, PRTUTF16 ppwsz, size_t cwc, size_t *pcwc)
640	{
641	/*
642	* Validate input.
643	*/
644	Assert(VALID_PTR(pszString));
645	Assert(VALID_PTR(ppwsz));
646	Assert(!pcwc \|\| VALID_PTR(pcwc));
647
648	/*
649	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
650	*/
651	size_t cwcResult;
652	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
653	if (RT_SUCCESS(rc))
654	{
655	if (pcwc)
656	*pcwc = cwcResult;
657
658	/*
659	* Check buffer size / Allocate buffer.
660	*/
661	bool fShouldFree;
662	PRTUTF16 pwszResult;
663	if (cwc > 0 && *ppwsz)
664	{
665	fShouldFree = false;
666	if (cwc <= cwcResult)
667	return VERR_BUFFER_OVERFLOW;
668	pwszResult = *ppwsz;
669	}
670	else
671	{
672	*ppwsz = NULL;
673	fShouldFree = true;
674	cwc = RT_MAX(cwcResult + 1, cwc);
675	pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
676	}
677	if (pwszResult)
678	{
679	/*
680	* Encode the UTF-16 string.
681	*/
682	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
683	if (RT_SUCCESS(rc))
684	{
685	*ppwsz = pwszResult;
686	return rc;
687	}
688	if (fShouldFree)
689	RTMemFree(pwszResult);
690	}
691	else
692	rc = VERR_NO_UTF16_MEMORY;
693	}
694	return rc;
695	}
696
697
698	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
699	{
700	size_t cwc;
701	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
702	return RT_SUCCESS(rc) ? cwc : 0;
703	}
704
705
706	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
707	{
708	size_t cwc;
709	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
710	if (pcwc)
711	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
712	return rc;
713	}
714
715
716	/**
717	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
718	* @returns rc
719	* @param ppsz The pointer to the the string position point.
720	* @param pCp Where to store RTUNICP_INVALID.
721	* @param rc The iprt error code.
722	*/
723	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
724	{
725	/*
726	* Try find a valid encoding.
727	*/
728	(ppsz)++; /* @todo code this! */
729	*pCp = RTUNICP_INVALID;
730	return rc;
731	}
732
733
734	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
735	{
736	RTUNICP Cp;
737	RTStrGetCpExInternal(&psz, &Cp);
738	return Cp;
739	}
740
741
742	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
743	{
744	const unsigned char puch = (const unsigned char )*ppsz;
745	const unsigned char uch = *puch;
746	RTUNICP uc;
747
748	/* ASCII ? */
749	if (!(uch & BIT(7)))
750	{
751	uc = uch;
752	puch++;
753	}
754	else if (uch & BIT(6))
755	{
756	/* figure the length and validate the first octet. */
757	unsigned cb;
758	if (!(uch & BIT(5)))
759	cb = 2;
760	else if (!(uch & BIT(4)))
761	cb = 3;
762	else if (!(uch & BIT(3)))
763	cb = 4;
764	else if (!(uch & BIT(2)))
765	cb = 5;
766	else if (!(uch & BIT(1)))
767	cb = 6;
768	else
769	{
770	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
771	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
772	}
773
774	/* validate the rest */
775	switch (cb)
776	{
777	case 6:
778	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
779	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
780	case 5:
781	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
782	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
783	case 4:
784	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
785	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
786	case 3:
787	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
788	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
789	case 2:
790	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
791	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
792	break;
793	}
794
795	/* get and validate the code point. */
796	switch (cb)
797	{
798	case 6:
799	uc = (puch[5] & 0x3f)
800	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
801	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
802	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
803	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
804	\| ((RTUNICP)(uch & 0x01) << 30);
805	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
806	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
807	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
808	break;
809	case 5:
810	uc = (puch[4] & 0x3f)
811	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
812	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
813	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
814	\| ((RTUNICP)(uch & 0x03) << 24);
815	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
816	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
817	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
818	break;
819	case 4:
820	uc = (puch[3] & 0x3f)
821	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
822	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
823	\| ((RTUNICP)(uch & 0x07) << 18);
824	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
825	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
826	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
827	break;
828	case 3:
829	uc = (puch[2] & 0x3f)
830	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
831	\| ((RTUNICP)(uch & 0x0f) << 12);
832	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
833	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
834	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
835	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
836	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
837	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
838	break;
839	case 2:
840	uc = (puch[1] & 0x3f)
841	\| ((RTUNICP)(uch & 0x1f) << 6);
842	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
843	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
844	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
845	break;
846	default: /* impossible, but GCC is bitching. */
847	uc = RTUNICP_INVALID;
848	break;
849	}
850	puch += cb;
851	}
852	else
853	{
854	/* 6th bit is always set. */
855	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
856	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
857	}
858	*pCp = uc;
859	ppsz = (const char )puch;
860	return VINF_SUCCESS;
861	}
862
863
864	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
865	{
866	unsigned char puch = (unsigned char )psz;
867	if (uc < 0x80)
868	*puch++ = (unsigned char )uc;
869	else if (uc < 0x00000800)
870	{
871	*puch++ = 0xc0 \| (uc >> 6);
872	*puch++ = 0x80 \| (uc & 0x3f);
873	}
874	else if (uc < 0x00010000)
875	{
876	if ( uc < 0x0000d8000
877	\|\| ( uc > 0x0000dfff
878	&& uc < 0x0000fffe))
879	{
880	*puch++ = 0xe0 \| (uc >> 12);
881	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
882	*puch++ = 0x80 \| (uc & 0x3f);
883	}
884	else
885	{
886	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
887	*puch++ = 0x7f;
888	}
889	}
890	else if (uc < 0x00200000)
891	{
892	*puch++ = 0xf0 \| (uc >> 18);
893	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
894	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
895	*puch++ = 0x80 \| (uc & 0x3f);
896	}
897	else if (uc < 0x04000000)
898	{
899	*puch++ = 0xf1 \| (uc >> 24);
900	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
901	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
902	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
903	*puch++ = 0x80 \| (uc & 0x3f);
904	}
905	else if (uc <= 0x7fffffff)
906	{
907	*puch++ = 0xf3 \| (uc >> 30);
908	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
909	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
910	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
911	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
912	*puch++ = 0x80 \| (uc & 0x3f);
913	}
914	else
915	{
916	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
917	*puch++ = 0x7f;
918	}
919
920	return (char *)puch;
921	}
922
923
924	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
925	{
926	if (pszStart < psz)
927	{
928	/* simple char? */
929	const unsigned char puch = (const unsigned char )psz;
930	unsigned uch = *--puch;
931	if (!(uch & BIT(7)))
932	return (char *)puch;
933	RTStrAssertMsgReturn(!(uch & BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
934
935	/* two or more. */
936	uint32_t uMask = 0xffffffc0;
937	while ( (const unsigned char *)pszStart < puch
938	&& !(uMask & 1))
939	{
940	unsigned uch = *--puch;
941	if ((uch & 0xc0) != 0x80)
942	{
943	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
944	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
945	(char *)pszStart);
946	return (char *)puch;
947	}
948	uMask >>= 1;
949	}
950	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
951	}
952	return (char *)pszStart;
953	}
954
955
956	/**
957	* Performs a case insensitive string compare between two UTF-8 strings.
958	*
959	* This is a simplified compare, as only the simplified lower/upper case folding
960	* specified by the unicode specs are used. It does not consider character pairs
961	* as they are used in some languages, just simple upper & lower case compares.
962	*
963	* @returns < 0 if the first string less than the second string.
964	* @returns 0 if the first string identical to the second string.
965	* @returns > 0 if the first string greater than the second string.
966	* @param psz1 First UTF-8 string.
967	* @param psz2 Second UTF-8 string.
968	*/
969	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
970	{
971	/** @todo implement proper UTF-8 case-insensitive string comparison. */
972	#ifdef RT_OS_WINDOWS
973	return stricmp(psz1, psz2);
974	#else /* !RT_OS_WINDOWS */
975	return strcasecmp(psz1, psz2);
976	#endif /* !RT_OS_WINDOWS */
977	}

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/utf-8.cpp@ 4968

Download in other formats: