utf-8.cpp@ 81106

Last change on this file since 81106 was 76553, checked in by vboxsync, 6 years ago
scm --update-copyright-year
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 70.7 KB

Line
1	/* $Id: utf-8.cpp 76553 2019-01-01 01:45:53Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2019 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#include <iprt/string.h>
32	#include <iprt/latin1.h>
33	#include "internal/iprt.h"
34
35	#include <iprt/uni.h>
36	#include <iprt/asm.h>
37	#include <iprt/alloc.h>
38	#include <iprt/assert.h>
39	#include <iprt/err.h>
40	#include "internal/string.h"
41
42
43
44	/**
45	* Get get length in code points of a UTF-8 encoded string.
46	* The string is validated while doing this.
47	*
48	* @returns IPRT status code.
49	* @param psz Pointer to the UTF-8 string.
50	* @param cch The max length of the string. (btw cch = cb)
51	* Use RTSTR_MAX if all of the string is to be examined.
52	* @param pcuc Where to store the length in unicode code points.
53	* @param pcchActual Where to store the actual size of the UTF-8 string
54	* on success (cch = cb again). Optional.
55	*/
56	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
57	{
58	const unsigned char puch = (const unsigned char )psz;
59	size_t cCodePoints = 0;
60	while (cch > 0)
61	{
62	const unsigned char uch = *puch;
63	if (!uch)
64	break;
65	if (uch & RT_BIT(7))
66	{
67	/* figure sequence length and validate the first byte */
68	/** @todo RT_USE_RTC_3629 */
69	unsigned cb;
70	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
71	cb = 2;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
73	cb = 3;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
75	cb = 4;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
77	cb = 5;
78	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
79	cb = 6;
80	else
81	{
82	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
83	return VERR_INVALID_UTF8_ENCODING;
84	}
85
86	/* check length */
87	if (cb > cch)
88	{
89	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
90	return VERR_INVALID_UTF8_ENCODING;
91	}
92
93	/* validate the rest */
94	switch (cb)
95	{
96	case 6:
97	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98	RT_FALL_THRU();
99	case 5:
100	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101	RT_FALL_THRU();
102	case 4:
103	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	RT_FALL_THRU();
105	case 3:
106	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
107	RT_FALL_THRU();
108	case 2:
109	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
110	break;
111	}
112
113	/* validate the code point. */
114	RTUNICP uc;
115	switch (cb)
116	{
117	case 6:
118	uc = (puch[5] & 0x3f)
119	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
120	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
121	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
122	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
123	\| ((RTUNICP)(uch & 0x01) << 30);
124	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
125	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
126	break;
127	case 5:
128	uc = (puch[4] & 0x3f)
129	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
130	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
131	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
132	\| ((RTUNICP)(uch & 0x03) << 24);
133	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
134	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
135	break;
136	case 4:
137	uc = (puch[3] & 0x3f)
138	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
139	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
140	\| ((RTUNICP)(uch & 0x07) << 18);
141	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
142	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
143	break;
144	case 3:
145	uc = (puch[2] & 0x3f)
146	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
147	\| ((RTUNICP)(uch & 0x0f) << 12);
148	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
149	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
150	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
151	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
153	break;
154	case 2:
155	uc = (puch[1] & 0x3f)
156	\| ((RTUNICP)(uch & 0x1f) << 6);
157	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
158	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
159	break;
160	}
161
162	/* advance */
163	cch -= cb;
164	puch += cb;
165	}
166	else
167	{
168	/* one ASCII byte */
169	puch++;
170	cch--;
171	}
172	cCodePoints++;
173	}
174
175	/* done */
176	*pcuc = cCodePoints;
177	if (pcchActual)
178	pcchActual = puch - (unsigned char const )psz;
179	return VINF_SUCCESS;
180	}
181
182
183	/**
184	* Decodes and UTF-8 string into an array of unicode code point.
185	*
186	* Since we know the input is valid, we do not perform encoding or length checks.
187	*
188	* @returns iprt status code.
189	* @param psz The UTF-8 string to recode. This is a valid encoding.
190	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
191	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
192	* @param paCps Where to store the code points array.
193	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
194	*/
195	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
196	{
197	int rc = VINF_SUCCESS;
198	const unsigned char puch = (const unsigned char )psz;
199	PRTUNICP pCp = paCps;
200	while (cch > 0)
201	{
202	/* read the next char and check for terminator. */
203	const unsigned char uch = *puch;
204	if (uch)
205	{ /* we only break once, so consider this the likely branch. */ }
206	else
207	break;
208
209	/* check for output overflow */
210	if (RT_LIKELY(cCps >= 1))
211	{ /* likely */ }
212	else
213	{
214	rc = VERR_BUFFER_OVERFLOW;
215	break;
216	}
217	cCps--;
218
219	/* decode and recode the code point */
220	if (!(uch & RT_BIT(7)))
221	{
222	*pCp++ = uch;
223	puch++;
224	cch--;
225	}
226	#ifdef RT_STRICT
227	else if (!(uch & RT_BIT(6)))
228	AssertMsgFailed(("Internal error!\n"));
229	#endif
230	else if (!(uch & RT_BIT(5)))
231	{
232	*pCp++ = (puch[1] & 0x3f)
233	\| ((uint16_t)(uch & 0x1f) << 6);
234	puch += 2;
235	cch -= 2;
236	}
237	else if (!(uch & RT_BIT(4)))
238	{
239	*pCp++ = (puch[2] & 0x3f)
240	\| ((uint16_t)(puch[1] & 0x3f) << 6)
241	\| ((uint16_t)(uch & 0x0f) << 12);
242	puch += 3;
243	cch -= 3;
244	}
245	else if (!(uch & RT_BIT(3)))
246	{
247	*pCp++ = (puch[3] & 0x3f)
248	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
249	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
250	\| ((RTUNICP)(uch & 0x07) << 18);
251	puch += 4;
252	cch -= 4;
253	}
254	else if (!(uch & RT_BIT(2)))
255	{
256	*pCp++ = (puch[4] & 0x3f)
257	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
258	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
259	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
260	\| ((RTUNICP)(uch & 0x03) << 24);
261	puch += 5;
262	cch -= 6;
263	}
264	else
265	{
266	Assert(!(uch & RT_BIT(1)));
267	*pCp++ = (puch[5] & 0x3f)
268	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
269	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
270	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
271	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
272	\| ((RTUNICP)(uch & 0x01) << 30);
273	puch += 6;
274	cch -= 6;
275	}
276	}
277
278	/* done */
279	*pCp = 0;
280	return rc;
281	}
282
283
284	RTDECL(size_t) RTStrUniLen(const char *psz)
285	{
286	size_t cCodePoints;
287	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
288	return RT_SUCCESS(rc) ? cCodePoints : 0;
289	}
290	RT_EXPORT_SYMBOL(RTStrUniLen);
291
292
293	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
294	{
295	size_t cCodePoints;
296	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
297	if (pcCps)
298	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
299	return rc;
300	}
301	RT_EXPORT_SYMBOL(RTStrUniLenEx);
302
303
304	RTDECL(int) RTStrValidateEncoding(const char *psz)
305	{
306	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
307	}
308	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
309
310
311	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
312	{
313	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED \| RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
314	VERR_INVALID_PARAMETER);
315	AssertPtr(psz);
316
317	/*
318	* Use rtUtf8Length for the job.
319	*/
320	size_t cchActual;
321	size_t cCpsIgnored;
322	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
323	if (RT_SUCCESS(rc))
324	{
325	if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
326	{
327	if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
328	cchActual++;
329	if (cchActual == cch)
330	rc = VINF_SUCCESS;
331	else if (cchActual < cch)
332	rc = VERR_BUFFER_UNDERFLOW;
333	else
334	rc = VERR_BUFFER_OVERFLOW;
335	}
336	else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
337	&& cchActual >= cch)
338	rc = VERR_BUFFER_OVERFLOW;
339	}
340	return rc;
341	}
342	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
343
344
345	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
346	{
347	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
348	return RT_SUCCESS(rc);
349	}
350	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
351
352
353	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
354	{
355	size_t cErrors = 0;
356	for (;;)
357	{
358	RTUNICP Cp;
359	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
360	if (RT_SUCCESS(rc))
361	{
362	if (!Cp)
363	break;
364	}
365	else
366	{
367	psz[-1] = '?';
368	cErrors++;
369	}
370	}
371	return cErrors;
372	}
373	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
374
375
376	/**
377	* Helper for RTStrPurgeComplementSet.
378	*
379	* @returns true if @a Cp is valid, false if not.
380	* @param Cp The code point to validate.
381	* @param puszValidPairs Pair of valid code point sets.
382	* @param cValidPairs Number of pairs.
383	*/
384	DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
385	{
386	while (cValidPairs-- > 0)
387	{
388	if ( Cp >= puszValidPairs[0]
389	&& Cp <= puszValidPairs[1])
390	return true;
391	puszValidPairs += 2;
392	}
393	return false;
394	}
395
396
397	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
398	{
399	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
400
401	/*
402	* Calc valid pairs and check that we've got an even number.
403	*/
404	uint32_t cValidPairs = 0;
405	while (puszValidPairs[cValidPairs * 2])
406	{
407	AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
408	AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
409	("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
410	cValidPairs++;
411	}
412
413	/*
414	* Do the replacing.
415	*/
416	ssize_t cReplacements = 0;
417	for (;;)
418	{
419	char *pszCur = psz;
420	RTUNICP Cp;
421	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
422	if (RT_SUCCESS(rc))
423	{
424	if (Cp)
425	{
426	if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
427	{
428	for (; pszCur != psz; ++pszCur)
429	*pszCur = chReplacement;
430	++cReplacements;
431	}
432	}
433	else
434	break;
435	}
436	else
437	return -1;
438	}
439	return cReplacements;
440	}
441	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
442
443
444	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
445	{
446	/*
447	* Validate input.
448	*/
449	Assert(VALID_PTR(pszString));
450	Assert(VALID_PTR(ppaCps));
451	*ppaCps = NULL;
452
453	/*
454	* Validate the UTF-8 input and count its code points.
455	*/
456	size_t cCps;
457	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
458	if (RT_SUCCESS(rc))
459	{
460	/*
461	* Allocate buffer.
462	*/
463	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
464	if (paCps)
465	{
466	/*
467	* Decode the string.
468	*/
469	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
470	if (RT_SUCCESS(rc))
471	{
472	*ppaCps = paCps;
473	return rc;
474	}
475	RTMemFree(paCps);
476	}
477	else
478	rc = VERR_NO_CODE_POINT_MEMORY;
479	}
480	return rc;
481	}
482	RT_EXPORT_SYMBOL(RTStrToUni);
483
484
485	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
486	{
487	/*
488	* Validate input.
489	*/
490	Assert(VALID_PTR(pszString));
491	Assert(VALID_PTR(ppaCps));
492	Assert(!pcCps \|\| VALID_PTR(pcCps));
493
494	/*
495	* Validate the UTF-8 input and count the code points.
496	*/
497	size_t cCpsResult;
498	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
499	if (RT_SUCCESS(rc))
500	{
501	if (pcCps)
502	*pcCps = cCpsResult;
503
504	/*
505	* Check buffer size / Allocate buffer.
506	*/
507	bool fShouldFree;
508	PRTUNICP paCpsResult;
509	if (cCps > 0 && *ppaCps)
510	{
511	fShouldFree = false;
512	if (cCps <= cCpsResult)
513	return VERR_BUFFER_OVERFLOW;
514	paCpsResult = *ppaCps;
515	}
516	else
517	{
518	*ppaCps = NULL;
519	fShouldFree = true;
520	cCps = RT_MAX(cCpsResult + 1, cCps);
521	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
522	}
523	if (paCpsResult)
524	{
525	/*
526	* Encode the UTF-16 string.
527	*/
528	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
529	if (RT_SUCCESS(rc))
530	{
531	*ppaCps = paCpsResult;
532	return rc;
533	}
534	if (fShouldFree)
535	RTMemFree(paCpsResult);
536	}
537	else
538	rc = VERR_NO_CODE_POINT_MEMORY;
539	}
540	return rc;
541	}
542	RT_EXPORT_SYMBOL(RTStrToUniEx);
543
544
545	/**
546	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
547	*
548	* @returns IPRT status code.
549	* @param psz Pointer to the UTF-8 string.
550	* @param cch The max length of the string. (btw cch = cb)
551	* @param pcwc Where to store the length of the UTF-16 string as a number
552	* of RTUTF16 characters.
553	* @sa rtUtf8CalcUtf16Length
554	*/
555	static int rtUtf8CalcUtf16LengthN(const char psz, size_t cch, size_t pcwc)
556	{
557	const unsigned char puch = (const unsigned char )psz;
558	size_t cwc = 0;
559	while (cch > 0)
560	{
561	const unsigned char uch = *puch;
562	if (!(uch & RT_BIT(7)))
563	{
564	/* one ASCII byte */
565	if (uch)
566	{
567	cwc++;
568	puch++;
569	cch--;
570	}
571	else
572	break;
573	}
574	else
575	{
576	/*
577	* Multibyte sequence is more complicated when we have length
578	* restrictions on the input.
579	*/
580	/* figure sequence length and validate the first byte */
581	unsigned cb;
582	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
583	cb = 2;
584	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
585	cb = 3;
586	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
587	cb = 4;
588	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
589	cb = 5;
590	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
591	cb = 6;
592	else
593	{
594	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
595	return VERR_INVALID_UTF8_ENCODING;
596	}
597
598	/* check length */
599	if (cb > cch)
600	{
601	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
602	return VERR_INVALID_UTF8_ENCODING;
603	}
604
605	/* validate the rest */
606	switch (cb)
607	{
608	case 6:
609	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
610	RT_FALL_THRU();
611	case 5:
612	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
613	RT_FALL_THRU();
614	case 4:
615	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
616	RT_FALL_THRU();
617	case 3:
618	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
619	RT_FALL_THRU();
620	case 2:
621	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
622	break;
623	}
624
625	/* validate the code point. */
626	RTUNICP uc;
627	switch (cb)
628	{
629	case 6:
630	uc = (puch[5] & 0x3f)
631	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
632	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
633	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
634	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
635	\| ((RTUNICP)(uch & 0x01) << 30);
636	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
637	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
638	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
639	return VERR_CANT_RECODE_AS_UTF16;
640	case 5:
641	uc = (puch[4] & 0x3f)
642	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
643	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
644	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
645	\| ((RTUNICP)(uch & 0x03) << 24);
646	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
647	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
648	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
649	return VERR_CANT_RECODE_AS_UTF16;
650	case 4:
651	uc = (puch[3] & 0x3f)
652	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
653	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
654	\| ((RTUNICP)(uch & 0x07) << 18);
655	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
656	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
657	RTStrAssertMsgReturn(uc <= 0x0010ffff,
658	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
659	cwc++;
660	break;
661	case 3:
662	uc = (puch[2] & 0x3f)
663	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
664	\| ((RTUNICP)(uch & 0x0f) << 12);
665	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
666	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
667	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
668	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
669	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
670	break;
671	case 2:
672	uc = (puch[1] & 0x3f)
673	\| ((RTUNICP)(uch & 0x1f) << 6);
674	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
675	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
676	break;
677	}
678
679	/* advance */
680	cch -= cb;
681	puch += cb;
682	cwc++;
683	}
684	}
685
686	/* done */
687	*pcwc = cwc;
688	return VINF_SUCCESS;
689	}
690
691
692	/**
693	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
694	*
695	* @returns IPRT status code.
696	* @param psz Pointer to the UTF-8 string.
697	* @param pcwc Where to store the length of the UTF-16 string as a number
698	* of RTUTF16 characters.
699	* @sa rtUtf8CalcUtf16LengthN
700	*/
701	static int rtUtf8CalcUtf16Length(const char psz, size_t pcwc)
702	{
703	const unsigned char puch = (const unsigned char )psz;
704	size_t cwc = 0;
705	for (;;)
706	{
707	const unsigned char uch = *puch;
708	if (!(uch & RT_BIT(7)))
709	{
710	/* one ASCII byte */
711	if (uch)
712	{
713	cwc++;
714	puch++;
715	}
716	else
717	break;
718	}
719	else
720	{
721	/*
722	* Figure sequence length, implicitly validate the first byte.
723	* Then validate the additional bytes.
724	* Finally validate the code point.
725	*/
726	unsigned cb;
727	RTUNICP uc;
728	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
729	{
730	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
731	uc = (puch[1] & 0x3f)
732	\| ((RTUNICP)(uch & 0x1f) << 6);
733	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
734	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
735	cb = 2;
736	}
737	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
738	{
739	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
740	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
741	uc = (puch[2] & 0x3f)
742	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
743	\| ((RTUNICP)(uch & 0x0f) << 12);
744	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
745	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
746	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
747	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
748	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
749	cb = 3;
750	}
751	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
752	{
753	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
754	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
755	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
756	uc = (puch[3] & 0x3f)
757	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
758	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
759	\| ((RTUNICP)(uch & 0x07) << 18);
760	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
761	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
762	RTStrAssertMsgReturn(uc <= 0x0010ffff,
763	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
764	cwc++;
765	cb = 4;
766	}
767	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
768	{
769	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
770	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
771	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
772	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
773	uc = (puch[4] & 0x3f)
774	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
775	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
776	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
777	\| ((RTUNICP)(uch & 0x03) << 24);
778	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
779	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
780	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
781	return VERR_CANT_RECODE_AS_UTF16;
782	//cb = 5;
783	}
784	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
785	{
786	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
787	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
788	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
789	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
790	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
791	uc = (puch[5] & 0x3f)
792	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
793	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
794	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
795	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
796	\| ((RTUNICP)(uch & 0x01) << 30);
797	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
798	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
799	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
800	return VERR_CANT_RECODE_AS_UTF16;
801	//cb = 6;
802	}
803	else
804	{
805	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
806	return VERR_INVALID_UTF8_ENCODING;
807	}
808
809	/* advance */
810	puch += cb;
811	cwc++;
812	}
813	}
814
815	/* done */
816	*pcwc = cwc;
817	return VINF_SUCCESS;
818	}
819
820
821
822	/**
823	* Recodes a valid UTF-8 string as UTF-16.
824	*
825	* Since we know the input is valid, we do not perform encoding or length checks.
826	*
827	* @returns iprt status code.
828	* @param psz The UTF-8 string to recode. This is a valid encoding.
829	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
830	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
831	* @param pwsz Where to store the UTF-16 string.
832	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
833	*
834	* @note rtUtf8RecodeAsUtf16Big is a duplicate with RT_H2BE_U16 applied.
835	*/
836	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
837	{
838	int rc = VINF_SUCCESS;
839	const unsigned char puch = (const unsigned char )psz;
840	PRTUTF16 pwc = pwsz;
841	while (cch > 0)
842	{
843	/* read the next char and check for terminator. */
844	const unsigned char uch = *puch;
845	if (uch)
846	{ /* we only break once, so consider this the likely branch. */ }
847	else
848	break;
849
850	/* check for output overflow */
851	if (RT_LIKELY(cwc >= 1))
852	{ /* likely */ }
853	else
854	{
855	rc = VERR_BUFFER_OVERFLOW;
856	break;
857	}
858	cwc--;
859
860	/* decode and recode the code point */
861	if (!(uch & RT_BIT(7)))
862	{
863	*pwc++ = uch;
864	puch++;
865	cch--;
866	}
867	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
868	{
869	uint16_t uc = (puch[1] & 0x3f)
870	\| ((uint16_t)(uch & 0x1f) << 6);
871	*pwc++ = uc;
872	puch += 2;
873	cch -= 2;
874	}
875	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
876	{
877	uint16_t uc = (puch[2] & 0x3f)
878	\| ((uint16_t)(puch[1] & 0x3f) << 6)
879	\| ((uint16_t)(uch & 0x0f) << 12);
880	*pwc++ = uc;
881	puch += 3;
882	cch -= 3;
883	}
884	else
885	{
886	/* generate surrogate pair */
887	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
888	RTUNICP uc = (puch[3] & 0x3f)
889	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
890	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
891	\| ((RTUNICP)(uch & 0x07) << 18);
892	if (RT_UNLIKELY(cwc < 1))
893	{
894	rc = VERR_BUFFER_OVERFLOW;
895	break;
896	}
897	cwc--;
898
899	uc -= 0x10000;
900	*pwc++ = 0xd800 \| (uc >> 10);
901	*pwc++ = 0xdc00 \| (uc & 0x3ff);
902	puch += 4;
903	cch -= 4;
904	}
905	}
906
907	/* done */
908	*pwc = '\0';
909	return rc;
910	}
911
912
913	/**
914	* Recodes a valid UTF-8 string as UTF-16BE.
915	*
916	* Since we know the input is valid, we do not perform encoding or length checks.
917	*
918	* @returns iprt status code.
919	* @param psz The UTF-8 string to recode. This is a valid encoding.
920	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
921	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
922	* @param pwsz Where to store the UTF-16BE string.
923	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
924	*
925	* @note This is a copy of rtUtf8RecodeAsUtf16 with RT_H2BE_U16 applied.
926	*/
927	static int rtUtf8RecodeAsUtf16Big(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
928	{
929	int rc = VINF_SUCCESS;
930	const unsigned char puch = (const unsigned char )psz;
931	PRTUTF16 pwc = pwsz;
932	while (cch > 0)
933	{
934	/* read the next char and check for terminator. */
935	const unsigned char uch = *puch;
936	if (uch)
937	{ /* we only break once, so consider this the likely branch. */ }
938	else
939	break;
940
941	/* check for output overflow */
942	if (RT_LIKELY(cwc >= 1))
943	{ /* likely */ }
944	else
945	{
946	rc = VERR_BUFFER_OVERFLOW;
947	break;
948	}
949	cwc--;
950
951	/* decode and recode the code point */
952	if (!(uch & RT_BIT(7)))
953	{
954	*pwc++ = RT_H2BE_U16((RTUTF16)uch);
955	puch++;
956	cch--;
957	}
958	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
959	{
960	uint16_t uc = (puch[1] & 0x3f)
961	\| ((uint16_t)(uch & 0x1f) << 6);
962	*pwc++ = RT_H2BE_U16(uc);
963	puch += 2;
964	cch -= 2;
965	}
966	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
967	{
968	uint16_t uc = (puch[2] & 0x3f)
969	\| ((uint16_t)(puch[1] & 0x3f) << 6)
970	\| ((uint16_t)(uch & 0x0f) << 12);
971	*pwc++ = RT_H2BE_U16(uc);
972	puch += 3;
973	cch -= 3;
974	}
975	else
976	{
977	/* generate surrogate pair */
978	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
979	RTUNICP uc = (puch[3] & 0x3f)
980	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
981	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
982	\| ((RTUNICP)(uch & 0x07) << 18);
983	if (RT_UNLIKELY(cwc < 1))
984	{
985	rc = VERR_BUFFER_OVERFLOW;
986	break;
987	}
988	cwc--;
989
990	uc -= 0x10000;
991	*pwc++ = RT_H2BE_U16(0xd800 \| (uc >> 10));
992	*pwc++ = RT_H2BE_U16(0xdc00 \| (uc & 0x3ff));
993	puch += 4;
994	cch -= 4;
995	}
996	}
997
998	/* done */
999	*pwc = '\0';
1000	return rc;
1001	}
1002
1003
1004	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
1005	{
1006	/*
1007	* Validate input.
1008	*/
1009	Assert(VALID_PTR(ppwszString));
1010	Assert(VALID_PTR(pszString));
1011	*ppwszString = NULL;
1012
1013	/*
1014	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1015	*/
1016	size_t cwc;
1017	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1018	if (RT_SUCCESS(rc))
1019	{
1020	/*
1021	* Allocate buffer.
1022	*/
1023	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1024	if (pwsz)
1025	{
1026	/*
1027	* Encode the UTF-16 string.
1028	*/
1029	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1030	if (RT_SUCCESS(rc))
1031	{
1032	*ppwszString = pwsz;
1033	return rc;
1034	}
1035	RTMemFree(pwsz);
1036	}
1037	else
1038	rc = VERR_NO_UTF16_MEMORY;
1039	}
1040	return rc;
1041	}
1042	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
1043
1044
1045	RTDECL(int) RTStrToUtf16BigTag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
1046	{
1047	/*
1048	* Validate input.
1049	*/
1050	Assert(VALID_PTR(ppwszString));
1051	Assert(VALID_PTR(pszString));
1052	*ppwszString = NULL;
1053
1054	/*
1055	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1056	*/
1057	size_t cwc;
1058	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1059	if (RT_SUCCESS(rc))
1060	{
1061	/*
1062	* Allocate buffer.
1063	*/
1064	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1065	if (pwsz)
1066	{
1067	/*
1068	* Encode the UTF-16 string.
1069	*/
1070	rc = rtUtf8RecodeAsUtf16Big(pszString, RTSTR_MAX, pwsz, cwc);
1071	if (RT_SUCCESS(rc))
1072	{
1073	*ppwszString = pwsz;
1074	return rc;
1075	}
1076	RTMemFree(pwsz);
1077	}
1078	else
1079	rc = VERR_NO_UTF16_MEMORY;
1080	}
1081	return rc;
1082	}
1083	RT_EXPORT_SYMBOL(RTStrToUtf16BigTag);
1084
1085
1086	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
1087	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
1088	{
1089	/*
1090	* Validate input.
1091	*/
1092	Assert(VALID_PTR(pszString));
1093	Assert(VALID_PTR(ppwsz));
1094	Assert(!pcwc \|\| VALID_PTR(pcwc));
1095
1096	/*
1097	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1098	*/
1099	size_t cwcResult;
1100	int rc;
1101	if (cchString != RTSTR_MAX)
1102	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1103	else
1104	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1105	if (RT_SUCCESS(rc))
1106	{
1107	if (pcwc)
1108	*pcwc = cwcResult;
1109
1110	/*
1111	* Check buffer size / Allocate buffer.
1112	*/
1113	bool fShouldFree;
1114	PRTUTF16 pwszResult;
1115	if (cwc > 0 && *ppwsz)
1116	{
1117	fShouldFree = false;
1118	if (cwc <= cwcResult)
1119	return VERR_BUFFER_OVERFLOW;
1120	pwszResult = *ppwsz;
1121	}
1122	else
1123	{
1124	*ppwsz = NULL;
1125	fShouldFree = true;
1126	cwc = RT_MAX(cwcResult + 1, cwc);
1127	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1128	}
1129	if (pwszResult)
1130	{
1131	/*
1132	* Encode the UTF-16 string.
1133	*/
1134	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1135	if (RT_SUCCESS(rc))
1136	{
1137	*ppwsz = pwszResult;
1138	return rc;
1139	}
1140	if (fShouldFree)
1141	RTMemFree(pwszResult);
1142	}
1143	else
1144	rc = VERR_NO_UTF16_MEMORY;
1145	}
1146	return rc;
1147	}
1148	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1149
1150
1151	RTDECL(int) RTStrToUtf16BigExTag(const char *pszString, size_t cchString,
1152	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
1153	{
1154	/*
1155	* Validate input.
1156	*/
1157	Assert(VALID_PTR(pszString));
1158	Assert(VALID_PTR(ppwsz));
1159	Assert(!pcwc \|\| VALID_PTR(pcwc));
1160
1161	/*
1162	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1163	*/
1164	size_t cwcResult;
1165	int rc;
1166	if (cchString != RTSTR_MAX)
1167	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1168	else
1169	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1170	if (RT_SUCCESS(rc))
1171	{
1172	if (pcwc)
1173	*pcwc = cwcResult;
1174
1175	/*
1176	* Check buffer size / Allocate buffer.
1177	*/
1178	bool fShouldFree;
1179	PRTUTF16 pwszResult;
1180	if (cwc > 0 && *ppwsz)
1181	{
1182	fShouldFree = false;
1183	if (cwc <= cwcResult)
1184	return VERR_BUFFER_OVERFLOW;
1185	pwszResult = *ppwsz;
1186	}
1187	else
1188	{
1189	*ppwsz = NULL;
1190	fShouldFree = true;
1191	cwc = RT_MAX(cwcResult + 1, cwc);
1192	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1193	}
1194	if (pwszResult)
1195	{
1196	/*
1197	* Encode the UTF-16BE string.
1198	*/
1199	rc = rtUtf8RecodeAsUtf16Big(pszString, cchString, pwszResult, cwc - 1);
1200	if (RT_SUCCESS(rc))
1201	{
1202	*ppwsz = pwszResult;
1203	return rc;
1204	}
1205	if (fShouldFree)
1206	RTMemFree(pwszResult);
1207	}
1208	else
1209	rc = VERR_NO_UTF16_MEMORY;
1210	}
1211	return rc;
1212	}
1213	RT_EXPORT_SYMBOL(RTStrToUtf16BigExTag);
1214
1215
1216	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1217	{
1218	size_t cwc;
1219	int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1220	return RT_SUCCESS(rc) ? cwc : 0;
1221	}
1222	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1223
1224
1225	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
1226	{
1227	size_t cwc;
1228	int rc;
1229	if (cch != RTSTR_MAX)
1230	rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1231	else
1232	rc = rtUtf8CalcUtf16Length(psz, &cwc);
1233	if (pcwc)
1234	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1235	return rc;
1236	}
1237	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1238
1239
1240	/**
1241	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
1242	*
1243	* @returns iprt status code.
1244	* @param psz The Latin-1 string.
1245	* @param cchIn The max length of the Latin-1 string to consider.
1246	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1247	*/
1248	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
1249	{
1250	size_t cch = 0;
1251	for (;;)
1252	{
1253	RTUNICP Cp;
1254	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1255	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1256	break;
1257	if (RT_FAILURE(rc))
1258	return rc;
1259	cch += RTStrCpSize(Cp); /* cannot fail */
1260	}
1261
1262	/* done */
1263	*pcch = cch;
1264	return VINF_SUCCESS;
1265	}
1266
1267
1268	/**
1269	* Recodes a Latin-1 string as UTF-8.
1270	*
1271	* @returns iprt status code.
1272	* @param pszIn The Latin-1 string.
1273	* @param cchIn The number of characters to process from psz. The recoding
1274	* will stop when cch or '\\0' is reached.
1275	* @param psz Where to store the UTF-8 string.
1276	* @param cch The size of the UTF-8 buffer, excluding the terminator.
1277	*/
1278	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
1279	{
1280	int rc;
1281	for (;;)
1282	{
1283	RTUNICP Cp;
1284	size_t cchCp;
1285	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1286	if (Cp == 0 \|\| RT_FAILURE(rc))
1287	break;
1288	cchCp = RTStrCpSize(Cp);
1289	if (RT_UNLIKELY(cch < cchCp))
1290	{
1291	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1292	rc = VERR_BUFFER_OVERFLOW;
1293	break;
1294	}
1295	cch -= cchCp;
1296	psz = RTStrPutCp(psz, Cp);
1297	}
1298
1299	/* done */
1300	if (rc == VERR_END_OF_STRING)
1301	rc = VINF_SUCCESS;
1302	*psz = '\0';
1303	return rc;
1304	}
1305
1306
1307
1308	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
1309	{
1310	/*
1311	* Validate input.
1312	*/
1313	Assert(VALID_PTR(ppszString));
1314	Assert(VALID_PTR(pszString));
1315	*ppszString = NULL;
1316
1317	/*
1318	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1319	*/
1320	size_t cch;
1321	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1322	if (RT_SUCCESS(rc))
1323	{
1324	/*
1325	* Allocate buffer and recode it.
1326	*/
1327	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
1328	if (pszResult)
1329	{
1330	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1331	if (RT_SUCCESS(rc))
1332	{
1333	*ppszString = pszResult;
1334	return rc;
1335	}
1336
1337	RTMemFree(pszResult);
1338	}
1339	else
1340	rc = VERR_NO_STR_MEMORY;
1341	}
1342	return rc;
1343	}
1344	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1345
1346
1347	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
1348	{
1349	/*
1350	* Validate input.
1351	*/
1352	Assert(VALID_PTR(pszString));
1353	Assert(VALID_PTR(ppsz));
1354	Assert(!pcch \|\| VALID_PTR(pcch));
1355
1356	/*
1357	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1358	*/
1359	size_t cchResult;
1360	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1361	if (RT_SUCCESS(rc))
1362	{
1363	if (pcch)
1364	*pcch = cchResult;
1365
1366	/*
1367	* Check buffer size / Allocate buffer and recode it.
1368	*/
1369	bool fShouldFree;
1370	char *pszResult;
1371	if (cch > 0 && *ppsz)
1372	{
1373	fShouldFree = false;
1374	if (RT_UNLIKELY(cch <= cchResult))
1375	return VERR_BUFFER_OVERFLOW;
1376	pszResult = *ppsz;
1377	}
1378	else
1379	{
1380	*ppsz = NULL;
1381	fShouldFree = true;
1382	cch = RT_MAX(cch, cchResult + 1);
1383	pszResult = (char *)RTStrAllocTag(cch, pszTag);
1384	}
1385	if (pszResult)
1386	{
1387	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1388	if (RT_SUCCESS(rc))
1389	{
1390	*ppsz = pszResult;
1391	return rc;
1392	}
1393
1394	if (fShouldFree)
1395	RTStrFree(pszResult);
1396	}
1397	else
1398	rc = VERR_NO_STR_MEMORY;
1399	}
1400	return rc;
1401	}
1402	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1403
1404
1405	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1406	{
1407	size_t cch;
1408	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1409	return RT_SUCCESS(rc) ? cch : 0;
1410	}
1411	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1412
1413
1414	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1415	{
1416	size_t cch;
1417	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1418	if (pcch)
1419	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1420	return rc;
1421	}
1422	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1423
1424
1425	/**
1426	* Calculates the Latin-1 length of a string, validating the encoding while
1427	* doing so.
1428	*
1429	* @returns IPRT status code.
1430	* @param psz Pointer to the UTF-8 string.
1431	* @param cchIn The max length of the string. (btw cch = cb)
1432	* Use RTSTR_MAX if all of the string is to be examined.
1433	* @param pcch Where to store the length of the Latin-1 string in bytes.
1434	*/
1435	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1436	{
1437	size_t cch = 0;
1438	for (;;)
1439	{
1440	RTUNICP Cp;
1441	size_t cchCp;
1442	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1443	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1444	break;
1445	if (RT_FAILURE(rc))
1446	return rc;
1447	cchCp = RTLatin1CpSize(Cp);
1448	if (cchCp == 0)
1449	return VERR_NO_TRANSLATION;
1450	cch += cchCp;
1451	}
1452
1453	/* done */
1454	*pcch = cch;
1455	return VINF_SUCCESS;
1456	}
1457
1458
1459	/**
1460	* Recodes a valid UTF-8 string as Latin-1.
1461	*
1462	* Since we know the input is valid, we do not perform encoding or length checks.
1463	*
1464	* @returns iprt status code.
1465	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1466	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1467	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1468	* @param psz Where to store the Latin-1 string.
1469	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1470	*/
1471	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1472	{
1473	int rc;
1474	for (;;)
1475	{
1476	RTUNICP Cp;
1477	size_t cchCp;
1478	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1479	if (Cp == 0 \|\| RT_FAILURE(rc))
1480	break;
1481	cchCp = RTLatin1CpSize(Cp);
1482	if (RT_UNLIKELY(cch < cchCp))
1483	{
1484	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1485	rc = VERR_BUFFER_OVERFLOW;
1486	break;
1487	}
1488	cch -= cchCp;
1489	psz = RTLatin1PutCp(psz, Cp);
1490	}
1491
1492	/* done */
1493	if (rc == VERR_END_OF_STRING)
1494	rc = VINF_SUCCESS;
1495	*psz = '\0';
1496	return rc;
1497	}
1498
1499
1500
1501	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1502	{
1503	/*
1504	* Validate input.
1505	*/
1506	Assert(VALID_PTR(ppszString));
1507	Assert(VALID_PTR(pszString));
1508	*ppszString = NULL;
1509
1510	/*
1511	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1512	*/
1513	size_t cch;
1514	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1515	if (RT_SUCCESS(rc))
1516	{
1517	/*
1518	* Allocate buffer.
1519	*/
1520	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1521	if (psz)
1522	{
1523	/*
1524	* Encode the UTF-16 string.
1525	*/
1526	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1527	if (RT_SUCCESS(rc))
1528	{
1529	*ppszString = psz;
1530	return rc;
1531	}
1532	RTMemFree(psz);
1533	}
1534	else
1535	rc = VERR_NO_STR_MEMORY;
1536	}
1537	return rc;
1538	}
1539	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1540
1541
1542	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1543	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1544	{
1545	/*
1546	* Validate input.
1547	*/
1548	Assert(VALID_PTR(pszString));
1549	Assert(VALID_PTR(ppsz));
1550	Assert(!pcch \|\| VALID_PTR(pcch));
1551
1552	/*
1553	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1554	*/
1555	size_t cchResult;
1556	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1557	if (RT_SUCCESS(rc))
1558	{
1559	if (pcch)
1560	*pcch = cchResult;
1561
1562	/*
1563	* Check buffer size / Allocate buffer.
1564	*/
1565	bool fShouldFree;
1566	char *pszResult;
1567	if (cch > 0 && *ppsz)
1568	{
1569	fShouldFree = false;
1570	if (cch <= cchResult)
1571	return VERR_BUFFER_OVERFLOW;
1572	pszResult = *ppsz;
1573	}
1574	else
1575	{
1576	*ppsz = NULL;
1577	fShouldFree = true;
1578	cch = RT_MAX(cchResult + 1, cch);
1579	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1580	}
1581	if (pszResult)
1582	{
1583	/*
1584	* Encode the Latin-1 string.
1585	*/
1586	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1587	if (RT_SUCCESS(rc))
1588	{
1589	*ppsz = pszResult;
1590	return rc;
1591	}
1592	if (fShouldFree)
1593	RTMemFree(pszResult);
1594	}
1595	else
1596	rc = VERR_NO_STR_MEMORY;
1597	}
1598	return rc;
1599	}
1600	RT_EXPORT_SYMBOL(RTStrToLatin1ExTag);
1601
1602
1603	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1604	{
1605	size_t cch;
1606	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1607	return RT_SUCCESS(rc) ? cch : 0;
1608	}
1609	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1610
1611
1612	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1613	{
1614	size_t cch;
1615	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1616	if (pcch)
1617	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1618	return rc;
1619	}
1620	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1621
1622
1623	/**
1624	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1625	* @returns rc
1626	* @param ppsz The pointer to the string position point.
1627	* @param pCp Where to store RTUNICP_INVALID.
1628	* @param rc The iprt error code.
1629	*/
1630	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1631	{
1632	/*
1633	* Try find a valid encoding.
1634	*/
1635	(ppsz)++; /* @todo code this! */
1636	*pCp = RTUNICP_INVALID;
1637	return rc;
1638	}
1639
1640
1641	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1642	{
1643	RTUNICP Cp;
1644	RTStrGetCpExInternal(&psz, &Cp);
1645	return Cp;
1646	}
1647	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1648
1649
1650	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1651	{
1652	const unsigned char puch = (const unsigned char )*ppsz;
1653	const unsigned char uch = *puch;
1654	RTUNICP uc;
1655
1656	/* ASCII ? */
1657	if (!(uch & RT_BIT(7)))
1658	{
1659	uc = uch;
1660	puch++;
1661	}
1662	else if (uch & RT_BIT(6))
1663	{
1664	/* figure the length and validate the first octet. */
1665	/** @todo RT_USE_RTC_3629 */
1666	unsigned cb;
1667	if (!(uch & RT_BIT(5)))
1668	cb = 2;
1669	else if (!(uch & RT_BIT(4)))
1670	cb = 3;
1671	else if (!(uch & RT_BIT(3)))
1672	cb = 4;
1673	else if (!(uch & RT_BIT(2)))
1674	cb = 5;
1675	else if (!(uch & RT_BIT(1)))
1676	cb = 6;
1677	else
1678	{
1679	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1680	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1681	}
1682
1683	/* validate the rest */
1684	switch (cb)
1685	{
1686	case 6:
1687	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1688	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1689	RT_FALL_THRU();
1690	case 5:
1691	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1692	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1693	RT_FALL_THRU();
1694	case 4:
1695	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1696	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1697	RT_FALL_THRU();
1698	case 3:
1699	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1700	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1701	RT_FALL_THRU();
1702	case 2:
1703	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1704	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1705	break;
1706	}
1707
1708	/* get and validate the code point. */
1709	switch (cb)
1710	{
1711	case 6:
1712	uc = (puch[5] & 0x3f)
1713	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1714	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1715	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1716	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1717	\| ((RTUNICP)(uch & 0x01) << 30);
1718	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1719	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1720	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1721	break;
1722	case 5:
1723	uc = (puch[4] & 0x3f)
1724	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1725	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1726	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1727	\| ((RTUNICP)(uch & 0x03) << 24);
1728	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1729	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1730	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1731	break;
1732	case 4:
1733	uc = (puch[3] & 0x3f)
1734	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1735	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1736	\| ((RTUNICP)(uch & 0x07) << 18);
1737	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1738	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1739	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1740	break;
1741	case 3:
1742	uc = (puch[2] & 0x3f)
1743	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1744	\| ((RTUNICP)(uch & 0x0f) << 12);
1745	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1746	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1747	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1748	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1749	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1750	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1751	break;
1752	case 2:
1753	uc = (puch[1] & 0x3f)
1754	\| ((RTUNICP)(uch & 0x1f) << 6);
1755	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1756	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1757	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1758	break;
1759	default: /* impossible, but GCC is bitching. */
1760	uc = RTUNICP_INVALID;
1761	break;
1762	}
1763	puch += cb;
1764	}
1765	else
1766	{
1767	/* 6th bit is always set. */
1768	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1769	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1770	}
1771	*pCp = uc;
1772	ppsz = (const char )puch;
1773	return VINF_SUCCESS;
1774	}
1775	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1776
1777
1778	/**
1779	* Handle invalid encodings passed to RTStrGetCpNEx().
1780	* @returns rc
1781	* @param ppsz The pointer to the string position point.
1782	* @param pcch Pointer to the string length.
1783	* @param pCp Where to store RTUNICP_INVALID.
1784	* @param rc The iprt error code.
1785	*/
1786	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1787	{
1788	/*
1789	* Try find a valid encoding.
1790	*/
1791	(ppsz)++; /* @todo code this! */
1792	(*pcch)--;
1793	*pCp = RTUNICP_INVALID;
1794	return rc;
1795	}
1796
1797
1798	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1799	{
1800	const unsigned char puch = (const unsigned char )*ppsz;
1801	const unsigned char uch = *puch;
1802	size_t cch = *pcch;
1803	RTUNICP uc;
1804
1805	if (cch == 0)
1806	{
1807	*pCp = RTUNICP_INVALID;
1808	return VERR_END_OF_STRING;
1809	}
1810
1811	/* ASCII ? */
1812	if (!(uch & RT_BIT(7)))
1813	{
1814	uc = uch;
1815	puch++;
1816	cch--;
1817	}
1818	else if (uch & RT_BIT(6))
1819	{
1820	/* figure the length and validate the first octet. */
1821	/** @todo RT_USE_RTC_3629 */
1822	unsigned cb;
1823	if (!(uch & RT_BIT(5)))
1824	cb = 2;
1825	else if (!(uch & RT_BIT(4)))
1826	cb = 3;
1827	else if (!(uch & RT_BIT(3)))
1828	cb = 4;
1829	else if (!(uch & RT_BIT(2)))
1830	cb = 5;
1831	else if (!(uch & RT_BIT(1)))
1832	cb = 6;
1833	else
1834	{
1835	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1836	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1837	}
1838
1839	if (cb > cch)
1840	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1841
1842	/* validate the rest */
1843	switch (cb)
1844	{
1845	case 6:
1846	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1847	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1848	RT_FALL_THRU();
1849	case 5:
1850	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1851	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1852	RT_FALL_THRU();
1853	case 4:
1854	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1855	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1856	RT_FALL_THRU();
1857	case 3:
1858	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1859	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1860	RT_FALL_THRU();
1861	case 2:
1862	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1863	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1864	break;
1865	}
1866
1867	/* get and validate the code point. */
1868	switch (cb)
1869	{
1870	case 6:
1871	uc = (puch[5] & 0x3f)
1872	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1873	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1874	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1875	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1876	\| ((RTUNICP)(uch & 0x01) << 30);
1877	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1878	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1879	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1880	break;
1881	case 5:
1882	uc = (puch[4] & 0x3f)
1883	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1884	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1885	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1886	\| ((RTUNICP)(uch & 0x03) << 24);
1887	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1888	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1889	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1890	break;
1891	case 4:
1892	uc = (puch[3] & 0x3f)
1893	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1894	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1895	\| ((RTUNICP)(uch & 0x07) << 18);
1896	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1897	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1898	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1899	break;
1900	case 3:
1901	uc = (puch[2] & 0x3f)
1902	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1903	\| ((RTUNICP)(uch & 0x0f) << 12);
1904	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1905	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1906	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1907	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1908	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1909	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1910	break;
1911	case 2:
1912	uc = (puch[1] & 0x3f)
1913	\| ((RTUNICP)(uch & 0x1f) << 6);
1914	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1915	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1916	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1917	break;
1918	default: /* impossible, but GCC is bitching. */
1919	uc = RTUNICP_INVALID;
1920	break;
1921	}
1922	puch += cb;
1923	cch -= cb;
1924	}
1925	else
1926	{
1927	/* 6th bit is always set. */
1928	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1929	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1930	}
1931	*pCp = uc;
1932	ppsz = (const char )puch;
1933	(*pcch) = cch;
1934	return VINF_SUCCESS;
1935	}
1936	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1937
1938
1939	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1940	{
1941	unsigned char puch = (unsigned char )psz;
1942	if (uc < 0x80)
1943	*puch++ = (unsigned char )uc;
1944	else if (uc < 0x00000800)
1945	{
1946	*puch++ = 0xc0 \| (uc >> 6);
1947	*puch++ = 0x80 \| (uc & 0x3f);
1948	}
1949	else if (uc < 0x00010000)
1950	{
1951	/** @todo RT_USE_RTC_3629 */
1952	if ( uc < 0x0000d8000
1953	\|\| ( uc > 0x0000dfff
1954	&& uc < 0x0000fffe))
1955	{
1956	*puch++ = 0xe0 \| (uc >> 12);
1957	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1958	*puch++ = 0x80 \| (uc & 0x3f);
1959	}
1960	else
1961	{
1962	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1963	*puch++ = 0x7f;
1964	}
1965	}
1966	/** @todo RT_USE_RTC_3629 */
1967	else if (uc < 0x00200000)
1968	{
1969	*puch++ = 0xf0 \| (uc >> 18);
1970	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1971	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1972	*puch++ = 0x80 \| (uc & 0x3f);
1973	}
1974	else if (uc < 0x04000000)
1975	{
1976	*puch++ = 0xf8 \| (uc >> 24);
1977	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1978	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1979	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1980	*puch++ = 0x80 \| (uc & 0x3f);
1981	}
1982	else if (uc <= 0x7fffffff)
1983	{
1984	*puch++ = 0xfc \| (uc >> 30);
1985	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1986	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1987	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1988	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1989	*puch++ = 0x80 \| (uc & 0x3f);
1990	}
1991	else
1992	{
1993	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1994	*puch++ = 0x7f;
1995	}
1996
1997	return (char *)puch;
1998	}
1999	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
2000
2001
2002	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
2003	{
2004	if (pszStart < psz)
2005	{
2006	/* simple char? */
2007	const unsigned char puch = (const unsigned char )psz;
2008	unsigned uch = *--puch;
2009	if (!(uch & RT_BIT(7)))
2010	return (char *)puch;
2011	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
2012
2013	/* two or more. */
2014	uint32_t uMask = 0xffffffc0;
2015	while ( (const unsigned char *)pszStart < puch
2016	&& !(uMask & 1))
2017	{
2018	uch = *--puch;
2019	if ((uch & 0xc0) != 0x80)
2020	{
2021	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
2022	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
2023	(char *)pszStart);
2024	return (char *)puch;
2025	}
2026	uMask >>= 1;
2027	}
2028	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
2029	}
2030	return (char *)pszStart;
2031	}
2032	RT_EXPORT_SYMBOL(RTStrPrevCp);
2033

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 81106

Download in other formats: