utf-8.cpp@ 57358

Last change on this file since 57358 was 57358, checked in by vboxsync, 9 years ago
*: scm cleanup run.
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 55.1 KB

Line
1	/* $Id: utf-8.cpp 57358 2015-08-14 15:16:38Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2015 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Get get length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
45	*
46	* @returns IPRT status code.
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	/** @todo RT_USE_RTC_3629 */
67	unsigned cb;
68	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
69	cb = 2;
70	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
71	cb = 3;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
73	cb = 4;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
75	cb = 5;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
77	cb = 6;
78	else
79	{
80	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81	return VERR_INVALID_UTF8_ENCODING;
82	}
83
84	/* check length */
85	if (cb > cch)
86	{
87	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88	return VERR_INVALID_UTF8_ENCODING;
89	}
90
91	/* validate the rest */
92	switch (cb)
93	{
94	case 6:
95	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96	case 5:
97	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98	case 4:
99	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100	case 3:
101	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102	case 2:
103	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	break;
105	}
106
107	/* validate the code point. */
108	RTUNICP uc;
109	switch (cb)
110	{
111	case 6:
112	uc = (puch[5] & 0x3f)
113	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
114	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
115	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
116	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
117	\| ((RTUNICP)(uch & 0x01) << 30);
118	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	case 5:
122	uc = (puch[4] & 0x3f)
123	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
124	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
125	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
126	\| ((RTUNICP)(uch & 0x03) << 24);
127	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129	break;
130	case 4:
131	uc = (puch[3] & 0x3f)
132	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
133	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
134	\| ((RTUNICP)(uch & 0x07) << 18);
135	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137	break;
138	case 3:
139	uc = (puch[2] & 0x3f)
140	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
141	\| ((RTUNICP)(uch & 0x0f) << 12);
142	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147	break;
148	case 2:
149	uc = (puch[1] & 0x3f)
150	\| ((RTUNICP)(uch & 0x1f) << 6);
151	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	}
155
156	/* advance */
157	cch -= cb;
158	puch += cb;
159	}
160	else
161	{
162	/* one ASCII byte */
163	puch++;
164	cch--;
165	}
166	cCodePoints++;
167	}
168
169	/* done */
170	*pcuc = cCodePoints;
171	if (pcchActual)
172	pcchActual = puch - (unsigned char const )psz;
173	return VINF_SUCCESS;
174	}
175
176
177	/**
178	* Decodes and UTF-8 string into an array of unicode code point.
179	*
180	* Since we know the input is valid, we do not perform encoding or length checks.
181	*
182	* @returns iprt status code.
183	* @param psz The UTF-8 string to recode. This is a valid encoding.
184	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186	* @param paCps Where to store the code points array.
187	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188	*/
189	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190	{
191	int rc = VINF_SUCCESS;
192	const unsigned char puch = (const unsigned char )psz;
193	PRTUNICP pCp = paCps;
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (!uch)
199	break;
200
201	/* check for output overflow */
202	if (RT_UNLIKELY(cCps < 1))
203	{
204	rc = VERR_BUFFER_OVERFLOW;
205	break;
206	}
207	cCps--;
208
209	/* decode and recode the code point */
210	if (!(uch & RT_BIT(7)))
211	{
212	*pCp++ = uch;
213	puch++;
214	cch--;
215	}
216	#ifdef RT_STRICT
217	else if (!(uch & RT_BIT(6)))
218	AssertMsgFailed(("Internal error!\n"));
219	#endif
220	else if (!(uch & RT_BIT(5)))
221	{
222	*pCp++ = (puch[1] & 0x3f)
223	\| ((uint16_t)(uch & 0x1f) << 6);
224	puch += 2;
225	cch -= 2;
226	}
227	else if (!(uch & RT_BIT(4)))
228	{
229	*pCp++ = (puch[2] & 0x3f)
230	\| ((uint16_t)(puch[1] & 0x3f) << 6)
231	\| ((uint16_t)(uch & 0x0f) << 12);
232	puch += 3;
233	cch -= 3;
234	}
235	else if (!(uch & RT_BIT(3)))
236	{
237	*pCp++ = (puch[3] & 0x3f)
238	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
239	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
240	\| ((RTUNICP)(uch & 0x07) << 18);
241	puch += 4;
242	cch -= 4;
243	}
244	else if (!(uch & RT_BIT(2)))
245	{
246	*pCp++ = (puch[4] & 0x3f)
247	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
248	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
249	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
250	\| ((RTUNICP)(uch & 0x03) << 24);
251	puch += 5;
252	cch -= 6;
253	}
254	else
255	{
256	Assert(!(uch & RT_BIT(1)));
257	*pCp++ = (puch[5] & 0x3f)
258	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
259	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
260	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
261	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
262	\| ((RTUNICP)(uch & 0x01) << 30);
263	puch += 6;
264	cch -= 6;
265	}
266	}
267
268	/* done */
269	*pCp = 0;
270	return rc;
271	}
272
273
274	RTDECL(size_t) RTStrUniLen(const char *psz)
275	{
276	size_t cCodePoints;
277	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278	return RT_SUCCESS(rc) ? cCodePoints : 0;
279	}
280	RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
284	{
285	size_t cCodePoints;
286	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287	if (pcCps)
288	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289	return rc;
290	}
291	RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294	RTDECL(int) RTStrValidateEncoding(const char *psz)
295	{
296	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297	}
298	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302	{
303	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED \| RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
304	VERR_INVALID_PARAMETER);
305	AssertPtr(psz);
306
307	/*
308	* Use rtUtf8Length for the job.
309	*/
310	size_t cchActual;
311	size_t cCpsIgnored;
312	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
313	if (RT_SUCCESS(rc))
314	{
315	if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
316	{
317	if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
318	cchActual++;
319	if (cchActual == cch)
320	rc = VINF_SUCCESS;
321	else if (cchActual < cch)
322	rc = VERR_BUFFER_UNDERFLOW;
323	else
324	rc = VERR_BUFFER_OVERFLOW;
325	}
326	else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
327	&& cchActual >= cch)
328	rc = VERR_BUFFER_OVERFLOW;
329	}
330	return rc;
331	}
332	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
333
334
335	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
336	{
337	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
338	return RT_SUCCESS(rc);
339	}
340	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
341
342
343	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
344	{
345	size_t cErrors = 0;
346	for (;;)
347	{
348	RTUNICP Cp;
349	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
350	if (RT_SUCCESS(rc))
351	{
352	if (!Cp)
353	break;
354	}
355	else
356	{
357	psz[-1] = '?';
358	cErrors++;
359	}
360	}
361	return cErrors;
362	}
363	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
364
365
366	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
367	{
368	size_t cReplacements = 0;
369	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
370	for (;;)
371	{
372	RTUNICP Cp;
373	PCRTUNICP pCp;
374	char *pszOld = psz;
375	if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
376	return -1;
377	if (!Cp)
378	break;
379	for (pCp = puszValidSet; *pCp; pCp += 2)
380	{
381	AssertReturn(*(pCp + 1), -1);
382	if (pCp <= Cp && (pCp + 1) >= Cp) /* No, I won't do * and ++. */
383	break;
384	}
385	if (!*pCp)
386	{
387	for (; pszOld != psz; ++pszOld)
388	*pszOld = chReplacement;
389	++cReplacements;
390	}
391	}
392	return cReplacements;
393	}
394	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
395
396
397	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
398	{
399	/*
400	* Validate input.
401	*/
402	Assert(VALID_PTR(pszString));
403	Assert(VALID_PTR(ppaCps));
404	*ppaCps = NULL;
405
406	/*
407	* Validate the UTF-8 input and count its code points.
408	*/
409	size_t cCps;
410	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
411	if (RT_SUCCESS(rc))
412	{
413	/*
414	* Allocate buffer.
415	*/
416	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
417	if (paCps)
418	{
419	/*
420	* Decode the string.
421	*/
422	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
423	if (RT_SUCCESS(rc))
424	{
425	*ppaCps = paCps;
426	return rc;
427	}
428	RTMemFree(paCps);
429	}
430	else
431	rc = VERR_NO_CODE_POINT_MEMORY;
432	}
433	return rc;
434	}
435	RT_EXPORT_SYMBOL(RTStrToUni);
436
437
438	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
439	{
440	/*
441	* Validate input.
442	*/
443	Assert(VALID_PTR(pszString));
444	Assert(VALID_PTR(ppaCps));
445	Assert(!pcCps \|\| VALID_PTR(pcCps));
446
447	/*
448	* Validate the UTF-8 input and count the code points.
449	*/
450	size_t cCpsResult;
451	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
452	if (RT_SUCCESS(rc))
453	{
454	if (pcCps)
455	*pcCps = cCpsResult;
456
457	/*
458	* Check buffer size / Allocate buffer.
459	*/
460	bool fShouldFree;
461	PRTUNICP paCpsResult;
462	if (cCps > 0 && *ppaCps)
463	{
464	fShouldFree = false;
465	if (cCps <= cCpsResult)
466	return VERR_BUFFER_OVERFLOW;
467	paCpsResult = *ppaCps;
468	}
469	else
470	{
471	*ppaCps = NULL;
472	fShouldFree = true;
473	cCps = RT_MAX(cCpsResult + 1, cCps);
474	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
475	}
476	if (paCpsResult)
477	{
478	/*
479	* Encode the UTF-16 string.
480	*/
481	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
482	if (RT_SUCCESS(rc))
483	{
484	*ppaCps = paCpsResult;
485	return rc;
486	}
487	if (fShouldFree)
488	RTMemFree(paCpsResult);
489	}
490	else
491	rc = VERR_NO_CODE_POINT_MEMORY;
492	}
493	return rc;
494	}
495	RT_EXPORT_SYMBOL(RTStrToUniEx);
496
497
498	/**
499	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
500	*
501	* @returns IPRT status code.
502	* @param psz Pointer to the UTF-8 string.
503	* @param cch The max length of the string. (btw cch = cb)
504	* Use RTSTR_MAX if all of the string is to be examined.
505	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
506	*/
507	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
508	{
509	const unsigned char puch = (const unsigned char )psz;
510	size_t cwc = 0;
511	while (cch > 0)
512	{
513	const unsigned char uch = *puch;
514	if (!uch)
515	break;
516	if (!(uch & RT_BIT(7)))
517	{
518	/* one ASCII byte */
519	cwc++;
520	puch++;
521	cch--;
522	}
523	else
524	{
525	/* figure sequence length and validate the first byte */
526	unsigned cb;
527	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
528	cb = 2;
529	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
530	cb = 3;
531	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
532	cb = 4;
533	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
534	cb = 5;
535	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
536	cb = 6;
537	else
538	{
539	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
540	return VERR_INVALID_UTF8_ENCODING;
541	}
542
543	/* check length */
544	if (cb > cch)
545	{
546	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
547	return VERR_INVALID_UTF8_ENCODING;
548	}
549
550	/* validate the rest */
551	switch (cb)
552	{
553	case 6:
554	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
555	case 5:
556	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
557	case 4:
558	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
559	case 3:
560	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
561	case 2:
562	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
563	break;
564	}
565
566	/* validate the code point. */
567	RTUNICP uc;
568	switch (cb)
569	{
570	case 6:
571	uc = (puch[5] & 0x3f)
572	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
573	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
574	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
575	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
576	\| ((RTUNICP)(uch & 0x01) << 30);
577	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
578	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
579	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
580	return VERR_CANT_RECODE_AS_UTF16;
581	case 5:
582	uc = (puch[4] & 0x3f)
583	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
584	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
585	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
586	\| ((RTUNICP)(uch & 0x03) << 24);
587	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
588	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
589	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
590	return VERR_CANT_RECODE_AS_UTF16;
591	case 4:
592	uc = (puch[3] & 0x3f)
593	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
594	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
595	\| ((RTUNICP)(uch & 0x07) << 18);
596	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
597	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
598	RTStrAssertMsgReturn(uc <= 0x0010ffff,
599	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
600	cwc++;
601	break;
602	case 3:
603	uc = (puch[2] & 0x3f)
604	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
605	\| ((RTUNICP)(uch & 0x0f) << 12);
606	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
607	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
608	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
609	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
610	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
611	break;
612	case 2:
613	uc = (puch[1] & 0x3f)
614	\| ((RTUNICP)(uch & 0x1f) << 6);
615	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
616	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
617	break;
618	}
619
620	/* advance */
621	cch -= cb;
622	puch += cb;
623	cwc++;
624	}
625	}
626
627	/* done */
628	*pcwc = cwc;
629	return VINF_SUCCESS;
630	}
631
632
633	/**
634	* Recodes a valid UTF-8 string as UTF-16.
635	*
636	* Since we know the input is valid, we do not perform encoding or length checks.
637	*
638	* @returns iprt status code.
639	* @param psz The UTF-8 string to recode. This is a valid encoding.
640	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
641	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
642	* @param pwsz Where to store the UTF-16 string.
643	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
644	*/
645	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
646	{
647	int rc = VINF_SUCCESS;
648	const unsigned char puch = (const unsigned char )psz;
649	PRTUTF16 pwc = pwsz;
650	while (cch > 0)
651	{
652	/* read the next char and check for terminator. */
653	const unsigned char uch = *puch;
654	if (!uch)
655	break;
656
657	/* check for output overflow */
658	if (RT_UNLIKELY(cwc < 1))
659	{
660	rc = VERR_BUFFER_OVERFLOW;
661	break;
662	}
663	cwc--;
664
665	/* decode and recode the code point */
666	if (!(uch & RT_BIT(7)))
667	{
668	*pwc++ = uch;
669	puch++;
670	cch--;
671	}
672	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
673	{
674	uint16_t uc = (puch[1] & 0x3f)
675	\| ((uint16_t)(uch & 0x1f) << 6);
676	*pwc++ = uc;
677	puch += 2;
678	cch -= 2;
679	}
680	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
681	{
682	uint16_t uc = (puch[2] & 0x3f)
683	\| ((uint16_t)(puch[1] & 0x3f) << 6)
684	\| ((uint16_t)(uch & 0x0f) << 12);
685	*pwc++ = uc;
686	puch += 3;
687	cch -= 3;
688	}
689	else
690	{
691	/* generate surrogate pair */
692	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
693	RTUNICP uc = (puch[3] & 0x3f)
694	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
695	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
696	\| ((RTUNICP)(uch & 0x07) << 18);
697	if (RT_UNLIKELY(cwc < 1))
698	{
699	rc = VERR_BUFFER_OVERFLOW;
700	break;
701	}
702	cwc--;
703
704	uc -= 0x10000;
705	*pwc++ = 0xd800 \| (uc >> 10);
706	*pwc++ = 0xdc00 \| (uc & 0x3ff);
707	puch += 4;
708	cch -= 4;
709	}
710	}
711
712	/* done */
713	*pwc = '\0';
714	return rc;
715	}
716
717
718	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
719	{
720	/*
721	* Validate input.
722	*/
723	Assert(VALID_PTR(ppwszString));
724	Assert(VALID_PTR(pszString));
725	*ppwszString = NULL;
726
727	/*
728	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
729	*/
730	size_t cwc;
731	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
732	if (RT_SUCCESS(rc))
733	{
734	/*
735	* Allocate buffer.
736	*/
737	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
738	if (pwsz)
739	{
740	/*
741	* Encode the UTF-16 string.
742	*/
743	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
744	if (RT_SUCCESS(rc))
745	{
746	*ppwszString = pwsz;
747	return rc;
748	}
749	RTMemFree(pwsz);
750	}
751	else
752	rc = VERR_NO_UTF16_MEMORY;
753	}
754	return rc;
755	}
756	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
757
758
759	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
760	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
761	{
762	/*
763	* Validate input.
764	*/
765	Assert(VALID_PTR(pszString));
766	Assert(VALID_PTR(ppwsz));
767	Assert(!pcwc \|\| VALID_PTR(pcwc));
768
769	/*
770	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
771	*/
772	size_t cwcResult;
773	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
774	if (RT_SUCCESS(rc))
775	{
776	if (pcwc)
777	*pcwc = cwcResult;
778
779	/*
780	* Check buffer size / Allocate buffer.
781	*/
782	bool fShouldFree;
783	PRTUTF16 pwszResult;
784	if (cwc > 0 && *ppwsz)
785	{
786	fShouldFree = false;
787	if (cwc <= cwcResult)
788	return VERR_BUFFER_OVERFLOW;
789	pwszResult = *ppwsz;
790	}
791	else
792	{
793	*ppwsz = NULL;
794	fShouldFree = true;
795	cwc = RT_MAX(cwcResult + 1, cwc);
796	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
797	}
798	if (pwszResult)
799	{
800	/*
801	* Encode the UTF-16 string.
802	*/
803	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
804	if (RT_SUCCESS(rc))
805	{
806	*ppwsz = pwszResult;
807	return rc;
808	}
809	if (fShouldFree)
810	RTMemFree(pwszResult);
811	}
812	else
813	rc = VERR_NO_UTF16_MEMORY;
814	}
815	return rc;
816	}
817	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
818
819
820	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
821	{
822	size_t cwc;
823	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
824	return RT_SUCCESS(rc) ? cwc : 0;
825	}
826	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
827
828
829	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
830	{
831	size_t cwc;
832	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
833	if (pcwc)
834	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
835	return rc;
836	}
837	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
838
839
840	/**
841	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
842	*
843	* @returns iprt status code.
844	* @param psz The Latin-1 string.
845	* @param cchIn The max length of the Latin-1 string to consider.
846	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
847	*/
848	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
849	{
850	size_t cch = 0;
851	for (;;)
852	{
853	RTUNICP Cp;
854	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
855	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
856	break;
857	if (RT_FAILURE(rc))
858	return rc;
859	cch += RTStrCpSize(Cp); /* cannot fail */
860	}
861
862	/* done */
863	*pcch = cch;
864	return VINF_SUCCESS;
865	}
866
867
868	/**
869	* Recodes a Latin-1 string as UTF-8.
870	*
871	* @returns iprt status code.
872	* @param psz The Latin-1 string.
873	* @param cchIn The number of characters to process from psz. The recoding
874	* will stop when cch or '\\0' is reached.
875	* @param psz Where to store the UTF-8 string.
876	* @param cch The size of the UTF-8 buffer, excluding the terminator.
877	*/
878	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
879	{
880	int rc;
881	for (;;)
882	{
883	RTUNICP Cp;
884	size_t cchCp;
885	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
886	if (Cp == 0 \|\| RT_FAILURE(rc))
887	break;
888	cchCp = RTStrCpSize(Cp);
889	if (RT_UNLIKELY(cch < cchCp))
890	{
891	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
892	rc = VERR_BUFFER_OVERFLOW;
893	break;
894	}
895	cch -= cchCp;
896	psz = RTStrPutCp(psz, Cp);
897	}
898
899	/* done */
900	if (rc == VERR_END_OF_STRING)
901	rc = VINF_SUCCESS;
902	*psz = '\0';
903	return rc;
904	}
905
906
907
908	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
909	{
910	/*
911	* Validate input.
912	*/
913	Assert(VALID_PTR(ppszString));
914	Assert(VALID_PTR(pszString));
915	*ppszString = NULL;
916
917	/*
918	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
919	*/
920	size_t cch;
921	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
922	if (RT_SUCCESS(rc))
923	{
924	/*
925	* Allocate buffer and recode it.
926	*/
927	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
928	if (pszResult)
929	{
930	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
931	if (RT_SUCCESS(rc))
932	{
933	*ppszString = pszResult;
934	return rc;
935	}
936
937	RTMemFree(pszResult);
938	}
939	else
940	rc = VERR_NO_STR_MEMORY;
941	}
942	return rc;
943	}
944	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
945
946
947	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
948	{
949	/*
950	* Validate input.
951	*/
952	Assert(VALID_PTR(pszString));
953	Assert(VALID_PTR(ppsz));
954	Assert(!pcch \|\| VALID_PTR(pcch));
955
956	/*
957	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
958	*/
959	size_t cchResult;
960	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
961	if (RT_SUCCESS(rc))
962	{
963	if (pcch)
964	*pcch = cchResult;
965
966	/*
967	* Check buffer size / Allocate buffer and recode it.
968	*/
969	bool fShouldFree;
970	char *pszResult;
971	if (cch > 0 && *ppsz)
972	{
973	fShouldFree = false;
974	if (RT_UNLIKELY(cch <= cchResult))
975	return VERR_BUFFER_OVERFLOW;
976	pszResult = *ppsz;
977	}
978	else
979	{
980	*ppsz = NULL;
981	fShouldFree = true;
982	cch = RT_MAX(cch, cchResult + 1);
983	pszResult = (char *)RTStrAllocTag(cch, pszTag);
984	}
985	if (pszResult)
986	{
987	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
988	if (RT_SUCCESS(rc))
989	{
990	*ppsz = pszResult;
991	return rc;
992	}
993
994	if (fShouldFree)
995	RTStrFree(pszResult);
996	}
997	else
998	rc = VERR_NO_STR_MEMORY;
999	}
1000	return rc;
1001	}
1002	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1003
1004
1005	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1006	{
1007	size_t cch;
1008	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1009	return RT_SUCCESS(rc) ? cch : 0;
1010	}
1011	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1012
1013
1014	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1015	{
1016	size_t cch;
1017	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1018	if (pcch)
1019	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1020	return rc;
1021	}
1022	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1023
1024
1025	/**
1026	* Calculates the Latin-1 length of a string, validating the encoding while
1027	* doing so.
1028	*
1029	* @returns IPRT status code.
1030	* @param psz Pointer to the UTF-8 string.
1031	* @param cchIn The max length of the string. (btw cch = cb)
1032	* Use RTSTR_MAX if all of the string is to be examined.
1033	* @param pcch Where to store the length of the Latin-1 string in bytes.
1034	*/
1035	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1036	{
1037	size_t cch = 0;
1038	for (;;)
1039	{
1040	RTUNICP Cp;
1041	size_t cchCp;
1042	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1043	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1044	break;
1045	if (RT_FAILURE(rc))
1046	return rc;
1047	cchCp = RTLatin1CpSize(Cp);
1048	if (cchCp == 0)
1049	return VERR_NO_TRANSLATION;
1050	cch += cchCp;
1051	}
1052
1053	/* done */
1054	*pcch = cch;
1055	return VINF_SUCCESS;
1056	}
1057
1058
1059	/**
1060	* Recodes a valid UTF-8 string as Latin-1.
1061	*
1062	* Since we know the input is valid, we do not perform encoding or length checks.
1063	*
1064	* @returns iprt status code.
1065	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1066	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1067	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1068	* @param psz Where to store the Latin-1 string.
1069	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1070	*/
1071	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1072	{
1073	int rc;
1074	for (;;)
1075	{
1076	RTUNICP Cp;
1077	size_t cchCp;
1078	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1079	if (Cp == 0 \|\| RT_FAILURE(rc))
1080	break;
1081	cchCp = RTLatin1CpSize(Cp);
1082	if (RT_UNLIKELY(cch < cchCp))
1083	{
1084	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1085	rc = VERR_BUFFER_OVERFLOW;
1086	break;
1087	}
1088	cch -= cchCp;
1089	psz = RTLatin1PutCp(psz, Cp);
1090	}
1091
1092	/* done */
1093	if (rc == VERR_END_OF_STRING)
1094	rc = VINF_SUCCESS;
1095	*psz = '\0';
1096	return rc;
1097	}
1098
1099
1100
1101	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1102	{
1103	/*
1104	* Validate input.
1105	*/
1106	Assert(VALID_PTR(ppszString));
1107	Assert(VALID_PTR(pszString));
1108	*ppszString = NULL;
1109
1110	/*
1111	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1112	*/
1113	size_t cch;
1114	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1115	if (RT_SUCCESS(rc))
1116	{
1117	/*
1118	* Allocate buffer.
1119	*/
1120	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1121	if (psz)
1122	{
1123	/*
1124	* Encode the UTF-16 string.
1125	*/
1126	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1127	if (RT_SUCCESS(rc))
1128	{
1129	*ppszString = psz;
1130	return rc;
1131	}
1132	RTMemFree(psz);
1133	}
1134	else
1135	rc = VERR_NO_STR_MEMORY;
1136	}
1137	return rc;
1138	}
1139	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1140
1141
1142	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1143	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1144	{
1145	/*
1146	* Validate input.
1147	*/
1148	Assert(VALID_PTR(pszString));
1149	Assert(VALID_PTR(ppsz));
1150	Assert(!pcch \|\| VALID_PTR(pcch));
1151
1152	/*
1153	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1154	*/
1155	size_t cchResult;
1156	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1157	if (RT_SUCCESS(rc))
1158	{
1159	if (pcch)
1160	*pcch = cchResult;
1161
1162	/*
1163	* Check buffer size / Allocate buffer.
1164	*/
1165	bool fShouldFree;
1166	char *pszResult;
1167	if (cch > 0 && *ppsz)
1168	{
1169	fShouldFree = false;
1170	if (cch <= cchResult)
1171	return VERR_BUFFER_OVERFLOW;
1172	pszResult = *ppsz;
1173	}
1174	else
1175	{
1176	*ppsz = NULL;
1177	fShouldFree = true;
1178	cch = RT_MAX(cchResult + 1, cch);
1179	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1180	}
1181	if (pszResult)
1182	{
1183	/*
1184	* Encode the Latin-1 string.
1185	*/
1186	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1187	if (RT_SUCCESS(rc))
1188	{
1189	*ppsz = pszResult;
1190	return rc;
1191	}
1192	if (fShouldFree)
1193	RTMemFree(pszResult);
1194	}
1195	else
1196	rc = VERR_NO_STR_MEMORY;
1197	}
1198	return rc;
1199	}
1200	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1201
1202
1203	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1204	{
1205	size_t cch;
1206	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1207	return RT_SUCCESS(rc) ? cch : 0;
1208	}
1209	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1210
1211
1212	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1213	{
1214	size_t cch;
1215	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1216	if (pcch)
1217	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1218	return rc;
1219	}
1220	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1221
1222
1223	/**
1224	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1225	* @returns rc
1226	* @param ppsz The pointer to the string position point.
1227	* @param pCp Where to store RTUNICP_INVALID.
1228	* @param rc The iprt error code.
1229	*/
1230	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1231	{
1232	/*
1233	* Try find a valid encoding.
1234	*/
1235	(ppsz)++; /* @todo code this! */
1236	*pCp = RTUNICP_INVALID;
1237	return rc;
1238	}
1239
1240
1241	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1242	{
1243	RTUNICP Cp;
1244	RTStrGetCpExInternal(&psz, &Cp);
1245	return Cp;
1246	}
1247	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1248
1249
1250	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1251	{
1252	const unsigned char puch = (const unsigned char )*ppsz;
1253	const unsigned char uch = *puch;
1254	RTUNICP uc;
1255
1256	/* ASCII ? */
1257	if (!(uch & RT_BIT(7)))
1258	{
1259	uc = uch;
1260	puch++;
1261	}
1262	else if (uch & RT_BIT(6))
1263	{
1264	/* figure the length and validate the first octet. */
1265	/** @todo RT_USE_RTC_3629 */
1266	unsigned cb;
1267	if (!(uch & RT_BIT(5)))
1268	cb = 2;
1269	else if (!(uch & RT_BIT(4)))
1270	cb = 3;
1271	else if (!(uch & RT_BIT(3)))
1272	cb = 4;
1273	else if (!(uch & RT_BIT(2)))
1274	cb = 5;
1275	else if (!(uch & RT_BIT(1)))
1276	cb = 6;
1277	else
1278	{
1279	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1280	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1281	}
1282
1283	/* validate the rest */
1284	switch (cb)
1285	{
1286	case 6:
1287	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1288	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1289	case 5:
1290	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1291	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1292	case 4:
1293	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1294	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1295	case 3:
1296	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1297	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1298	case 2:
1299	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1300	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1301	break;
1302	}
1303
1304	/* get and validate the code point. */
1305	switch (cb)
1306	{
1307	case 6:
1308	uc = (puch[5] & 0x3f)
1309	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1310	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1311	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1312	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1313	\| ((RTUNICP)(uch & 0x01) << 30);
1314	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1315	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1316	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1317	break;
1318	case 5:
1319	uc = (puch[4] & 0x3f)
1320	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1321	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1322	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1323	\| ((RTUNICP)(uch & 0x03) << 24);
1324	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1325	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1326	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1327	break;
1328	case 4:
1329	uc = (puch[3] & 0x3f)
1330	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1331	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1332	\| ((RTUNICP)(uch & 0x07) << 18);
1333	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1334	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1335	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1336	break;
1337	case 3:
1338	uc = (puch[2] & 0x3f)
1339	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1340	\| ((RTUNICP)(uch & 0x0f) << 12);
1341	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1342	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1343	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1344	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1345	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1346	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1347	break;
1348	case 2:
1349	uc = (puch[1] & 0x3f)
1350	\| ((RTUNICP)(uch & 0x1f) << 6);
1351	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1352	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1353	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1354	break;
1355	default: /* impossible, but GCC is bitching. */
1356	uc = RTUNICP_INVALID;
1357	break;
1358	}
1359	puch += cb;
1360	}
1361	else
1362	{
1363	/* 6th bit is always set. */
1364	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1365	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1366	}
1367	*pCp = uc;
1368	ppsz = (const char )puch;
1369	return VINF_SUCCESS;
1370	}
1371	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1372
1373
1374	/**
1375	* Handle invalid encodings passed to RTStrGetCpNEx().
1376	* @returns rc
1377	* @param ppsz The pointer to the string position point.
1378	* @param pcch Pointer to the string length.
1379	* @param pCp Where to store RTUNICP_INVALID.
1380	* @param rc The iprt error code.
1381	*/
1382	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1383	{
1384	/*
1385	* Try find a valid encoding.
1386	*/
1387	(ppsz)++; /* @todo code this! */
1388	(*pcch)--;
1389	*pCp = RTUNICP_INVALID;
1390	return rc;
1391	}
1392
1393
1394	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1395	{
1396	const unsigned char puch = (const unsigned char )*ppsz;
1397	const unsigned char uch = *puch;
1398	size_t cch = *pcch;
1399	RTUNICP uc;
1400
1401	if (cch == 0)
1402	{
1403	*pCp = RTUNICP_INVALID;
1404	return VERR_END_OF_STRING;
1405	}
1406
1407	/* ASCII ? */
1408	if (!(uch & RT_BIT(7)))
1409	{
1410	uc = uch;
1411	puch++;
1412	cch--;
1413	}
1414	else if (uch & RT_BIT(6))
1415	{
1416	/* figure the length and validate the first octet. */
1417	/** @todo RT_USE_RTC_3629 */
1418	unsigned cb;
1419	if (!(uch & RT_BIT(5)))
1420	cb = 2;
1421	else if (!(uch & RT_BIT(4)))
1422	cb = 3;
1423	else if (!(uch & RT_BIT(3)))
1424	cb = 4;
1425	else if (!(uch & RT_BIT(2)))
1426	cb = 5;
1427	else if (!(uch & RT_BIT(1)))
1428	cb = 6;
1429	else
1430	{
1431	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1432	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1433	}
1434
1435	if (cb > cch)
1436	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1437
1438	/* validate the rest */
1439	switch (cb)
1440	{
1441	case 6:
1442	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1443	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1444	case 5:
1445	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1446	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1447	case 4:
1448	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1449	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1450	case 3:
1451	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1452	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1453	case 2:
1454	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1455	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1456	break;
1457	}
1458
1459	/* get and validate the code point. */
1460	switch (cb)
1461	{
1462	case 6:
1463	uc = (puch[5] & 0x3f)
1464	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1465	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1466	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1467	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1468	\| ((RTUNICP)(uch & 0x01) << 30);
1469	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1470	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1471	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1472	break;
1473	case 5:
1474	uc = (puch[4] & 0x3f)
1475	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1476	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1477	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1478	\| ((RTUNICP)(uch & 0x03) << 24);
1479	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1480	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1481	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1482	break;
1483	case 4:
1484	uc = (puch[3] & 0x3f)
1485	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1486	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1487	\| ((RTUNICP)(uch & 0x07) << 18);
1488	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1489	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1490	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1491	break;
1492	case 3:
1493	uc = (puch[2] & 0x3f)
1494	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1495	\| ((RTUNICP)(uch & 0x0f) << 12);
1496	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1497	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1498	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1499	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1500	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1501	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1502	break;
1503	case 2:
1504	uc = (puch[1] & 0x3f)
1505	\| ((RTUNICP)(uch & 0x1f) << 6);
1506	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1507	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1508	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1509	break;
1510	default: /* impossible, but GCC is bitching. */
1511	uc = RTUNICP_INVALID;
1512	break;
1513	}
1514	puch += cb;
1515	cch -= cb;
1516	}
1517	else
1518	{
1519	/* 6th bit is always set. */
1520	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1521	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1522	}
1523	*pCp = uc;
1524	ppsz = (const char )puch;
1525	(*pcch) = cch;
1526	return VINF_SUCCESS;
1527	}
1528	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1529
1530
1531	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1532	{
1533	unsigned char puch = (unsigned char )psz;
1534	if (uc < 0x80)
1535	*puch++ = (unsigned char )uc;
1536	else if (uc < 0x00000800)
1537	{
1538	*puch++ = 0xc0 \| (uc >> 6);
1539	*puch++ = 0x80 \| (uc & 0x3f);
1540	}
1541	else if (uc < 0x00010000)
1542	{
1543	/** @todo RT_USE_RTC_3629 */
1544	if ( uc < 0x0000d8000
1545	\|\| ( uc > 0x0000dfff
1546	&& uc < 0x0000fffe))
1547	{
1548	*puch++ = 0xe0 \| (uc >> 12);
1549	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1550	*puch++ = 0x80 \| (uc & 0x3f);
1551	}
1552	else
1553	{
1554	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1555	*puch++ = 0x7f;
1556	}
1557	}
1558	/** @todo RT_USE_RTC_3629 */
1559	else if (uc < 0x00200000)
1560	{
1561	*puch++ = 0xf0 \| (uc >> 18);
1562	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1563	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1564	*puch++ = 0x80 \| (uc & 0x3f);
1565	}
1566	else if (uc < 0x04000000)
1567	{
1568	*puch++ = 0xf8 \| (uc >> 24);
1569	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1570	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1571	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1572	*puch++ = 0x80 \| (uc & 0x3f);
1573	}
1574	else if (uc <= 0x7fffffff)
1575	{
1576	*puch++ = 0xfc \| (uc >> 30);
1577	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1578	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1579	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1580	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1581	*puch++ = 0x80 \| (uc & 0x3f);
1582	}
1583	else
1584	{
1585	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1586	*puch++ = 0x7f;
1587	}
1588
1589	return (char *)puch;
1590	}
1591	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1592
1593
1594	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1595	{
1596	if (pszStart < psz)
1597	{
1598	/* simple char? */
1599	const unsigned char puch = (const unsigned char )psz;
1600	unsigned uch = *--puch;
1601	if (!(uch & RT_BIT(7)))
1602	return (char *)puch;
1603	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1604
1605	/* two or more. */
1606	uint32_t uMask = 0xffffffc0;
1607	while ( (const unsigned char *)pszStart < puch
1608	&& !(uMask & 1))
1609	{
1610	uch = *--puch;
1611	if ((uch & 0xc0) != 0x80)
1612	{
1613	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1614	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1615	(char *)pszStart);
1616	return (char *)puch;
1617	}
1618	uMask >>= 1;
1619	}
1620	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1621	}
1622	return (char *)pszStart;
1623	}
1624	RT_EXPORT_SYMBOL(RTStrPrevCp);
1625

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 57358

Download in other formats: