utf-8.cpp@ 98103

Last change on this file since 98103 was 98103, checked in by vboxsync, 23 months ago
Copyright year updates by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 70.8 KB

Line
1	/* $Id: utf-8.cpp 98103 2023-01-17 14:15:46Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2023 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#include <iprt/string.h>
42	#include <iprt/latin1.h>
43	#include "internal/iprt.h"
44
45	#include <iprt/uni.h>
46	#include <iprt/asm.h>
47	#include <iprt/alloc.h>
48	#include <iprt/assert.h>
49	#include <iprt/err.h>
50	#include "internal/string.h"
51
52
53
54	/**
55	* Get get length in code points of a UTF-8 encoded string.
56	* The string is validated while doing this.
57	*
58	* @returns IPRT status code.
59	* @param psz Pointer to the UTF-8 string.
60	* @param cch The max length of the string. (btw cch = cb)
61	* Use RTSTR_MAX if all of the string is to be examined.
62	* @param pcuc Where to store the length in unicode code points.
63	* @param pcchActual Where to store the actual size of the UTF-8 string
64	* on success (cch = cb again). Optional.
65	*/
66	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
67	{
68	const unsigned char puch = (const unsigned char )psz;
69	size_t cCodePoints = 0;
70	while (cch > 0)
71	{
72	const unsigned char uch = *puch;
73	if (!uch)
74	break;
75	if (uch & RT_BIT(7))
76	{
77	/* figure sequence length and validate the first byte */
78	/** @todo RT_USE_RTC_3629 */
79	unsigned cb;
80	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
81	cb = 2;
82	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
83	cb = 3;
84	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
85	cb = 4;
86	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
87	cb = 5;
88	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
89	cb = 6;
90	else
91	{
92	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
93	return VERR_INVALID_UTF8_ENCODING;
94	}
95
96	/* check length */
97	if (cb > cch)
98	{
99	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
100	return VERR_INVALID_UTF8_ENCODING;
101	}
102
103	/* validate the rest */
104	switch (cb)
105	{
106	case 6:
107	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
108	RT_FALL_THRU();
109	case 5:
110	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
111	RT_FALL_THRU();
112	case 4:
113	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
114	RT_FALL_THRU();
115	case 3:
116	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
117	RT_FALL_THRU();
118	case 2:
119	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	}
122
123	/* validate the code point. */
124	RTUNICP uc;
125	switch (cb)
126	{
127	case 6:
128	uc = (puch[5] & 0x3f)
129	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
130	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
131	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
132	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
133	\| ((RTUNICP)(uch & 0x01) << 30);
134	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
135	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136	break;
137	case 5:
138	uc = (puch[4] & 0x3f)
139	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
140	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
141	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
142	\| ((RTUNICP)(uch & 0x03) << 24);
143	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
144	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
145	break;
146	case 4:
147	uc = (puch[3] & 0x3f)
148	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
149	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
150	\| ((RTUNICP)(uch & 0x07) << 18);
151	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	case 3:
155	uc = (puch[2] & 0x3f)
156	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
157	\| ((RTUNICP)(uch & 0x0f) << 12);
158	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
159	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
160	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
161	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
162	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
163	break;
164	case 2:
165	uc = (puch[1] & 0x3f)
166	\| ((RTUNICP)(uch & 0x1f) << 6);
167	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
168	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
169	break;
170	}
171
172	/* advance */
173	cch -= cb;
174	puch += cb;
175	}
176	else
177	{
178	/* one ASCII byte */
179	puch++;
180	cch--;
181	}
182	cCodePoints++;
183	}
184
185	/* done */
186	*pcuc = cCodePoints;
187	if (pcchActual)
188	pcchActual = puch - (unsigned char const )psz;
189	return VINF_SUCCESS;
190	}
191
192
193	/**
194	* Decodes and UTF-8 string into an array of unicode code point.
195	*
196	* Since we know the input is valid, we do not perform encoding or length checks.
197	*
198	* @returns iprt status code.
199	* @param psz The UTF-8 string to recode. This is a valid encoding.
200	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
201	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
202	* @param paCps Where to store the code points array.
203	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
204	*/
205	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
206	{
207	int rc = VINF_SUCCESS;
208	const unsigned char puch = (const unsigned char )psz;
209	PRTUNICP pCp = paCps;
210	while (cch > 0)
211	{
212	/* read the next char and check for terminator. */
213	const unsigned char uch = *puch;
214	if (uch)
215	{ /* we only break once, so consider this the likely branch. */ }
216	else
217	break;
218
219	/* check for output overflow */
220	if (RT_LIKELY(cCps >= 1))
221	{ /* likely */ }
222	else
223	{
224	rc = VERR_BUFFER_OVERFLOW;
225	break;
226	}
227	cCps--;
228
229	/* decode and recode the code point */
230	if (!(uch & RT_BIT(7)))
231	{
232	*pCp++ = uch;
233	puch++;
234	cch--;
235	}
236	#ifdef RT_STRICT
237	else if (!(uch & RT_BIT(6)))
238	AssertMsgFailed(("Internal error!\n"));
239	#endif
240	else if (!(uch & RT_BIT(5)))
241	{
242	*pCp++ = (puch[1] & 0x3f)
243	\| ((uint16_t)(uch & 0x1f) << 6);
244	puch += 2;
245	cch -= 2;
246	}
247	else if (!(uch & RT_BIT(4)))
248	{
249	*pCp++ = (puch[2] & 0x3f)
250	\| ((uint16_t)(puch[1] & 0x3f) << 6)
251	\| ((uint16_t)(uch & 0x0f) << 12);
252	puch += 3;
253	cch -= 3;
254	}
255	else if (!(uch & RT_BIT(3)))
256	{
257	*pCp++ = (puch[3] & 0x3f)
258	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
259	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
260	\| ((RTUNICP)(uch & 0x07) << 18);
261	puch += 4;
262	cch -= 4;
263	}
264	else if (!(uch & RT_BIT(2)))
265	{
266	*pCp++ = (puch[4] & 0x3f)
267	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
268	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
269	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
270	\| ((RTUNICP)(uch & 0x03) << 24);
271	puch += 5;
272	cch -= 6;
273	}
274	else
275	{
276	Assert(!(uch & RT_BIT(1)));
277	*pCp++ = (puch[5] & 0x3f)
278	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
279	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
280	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
281	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
282	\| ((RTUNICP)(uch & 0x01) << 30);
283	puch += 6;
284	cch -= 6;
285	}
286	}
287
288	/* done */
289	*pCp = 0;
290	return rc;
291	}
292
293
294	RTDECL(size_t) RTStrUniLen(const char *psz)
295	{
296	size_t cCodePoints;
297	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
298	return RT_SUCCESS(rc) ? cCodePoints : 0;
299	}
300	RT_EXPORT_SYMBOL(RTStrUniLen);
301
302
303	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
304	{
305	size_t cCodePoints;
306	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
307	if (pcCps)
308	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
309	return rc;
310	}
311	RT_EXPORT_SYMBOL(RTStrUniLenEx);
312
313
314	RTDECL(int) RTStrValidateEncoding(const char *psz)
315	{
316	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
317	}
318	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
319
320
321	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
322	{
323	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED \| RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
324	VERR_INVALID_PARAMETER);
325	AssertPtr(psz);
326
327	/*
328	* Use rtUtf8Length for the job.
329	*/
330	size_t cchActual;
331	size_t cCpsIgnored;
332	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
333	if (RT_SUCCESS(rc))
334	{
335	if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
336	{
337	if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
338	cchActual++;
339	if (cchActual == cch)
340	rc = VINF_SUCCESS;
341	else if (cchActual < cch)
342	rc = VERR_BUFFER_UNDERFLOW;
343	else
344	rc = VERR_BUFFER_OVERFLOW;
345	}
346	else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
347	&& cchActual >= cch)
348	rc = VERR_BUFFER_OVERFLOW;
349	}
350	return rc;
351	}
352	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
353
354
355	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
356	{
357	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
358	return RT_SUCCESS(rc);
359	}
360	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
361
362
363	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
364	{
365	size_t cErrors = 0;
366	for (;;)
367	{
368	RTUNICP Cp;
369	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
370	if (RT_SUCCESS(rc))
371	{
372	if (!Cp)
373	break;
374	}
375	else
376	{
377	psz[-1] = '?';
378	cErrors++;
379	}
380	}
381	return cErrors;
382	}
383	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
384
385
386	/**
387	* Helper for RTStrPurgeComplementSet.
388	*
389	* @returns true if @a Cp is valid, false if not.
390	* @param Cp The code point to validate.
391	* @param puszValidPairs Pair of valid code point sets.
392	* @param cValidPairs Number of pairs.
393	*/
394	DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
395	{
396	while (cValidPairs-- > 0)
397	{
398	if ( Cp >= puszValidPairs[0]
399	&& Cp <= puszValidPairs[1])
400	return true;
401	puszValidPairs += 2;
402	}
403	return false;
404	}
405
406
407	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
408	{
409	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
410
411	/*
412	* Calc valid pairs and check that we've got an even number.
413	*/
414	uint32_t cValidPairs = 0;
415	while (puszValidPairs[cValidPairs * 2])
416	{
417	AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
418	AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
419	("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
420	cValidPairs++;
421	}
422
423	/*
424	* Do the replacing.
425	*/
426	ssize_t cReplacements = 0;
427	for (;;)
428	{
429	char *pszCur = psz;
430	RTUNICP Cp;
431	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
432	if (RT_SUCCESS(rc))
433	{
434	if (Cp)
435	{
436	if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
437	{
438	for (; pszCur != psz; ++pszCur)
439	*pszCur = chReplacement;
440	++cReplacements;
441	}
442	}
443	else
444	break;
445	}
446	else
447	return -1;
448	}
449	return cReplacements;
450	}
451	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
452
453
454	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
455	{
456	/*
457	* Validate input.
458	*/
459	AssertPtr(pszString);
460	AssertPtr(ppaCps);
461	*ppaCps = NULL;
462
463	/*
464	* Validate the UTF-8 input and count its code points.
465	*/
466	size_t cCps;
467	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
468	if (RT_SUCCESS(rc))
469	{
470	/*
471	* Allocate buffer.
472	*/
473	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
474	if (paCps)
475	{
476	/*
477	* Decode the string.
478	*/
479	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
480	if (RT_SUCCESS(rc))
481	{
482	*ppaCps = paCps;
483	return rc;
484	}
485	RTMemFree(paCps);
486	}
487	else
488	rc = VERR_NO_CODE_POINT_MEMORY;
489	}
490	return rc;
491	}
492	RT_EXPORT_SYMBOL(RTStrToUni);
493
494
495	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
496	{
497	/*
498	* Validate input.
499	*/
500	AssertPtr(pszString);
501	AssertPtr(ppaCps);
502	AssertPtrNull(pcCps);
503
504	/*
505	* Validate the UTF-8 input and count the code points.
506	*/
507	size_t cCpsResult;
508	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
509	if (RT_SUCCESS(rc))
510	{
511	if (pcCps)
512	*pcCps = cCpsResult;
513
514	/*
515	* Check buffer size / Allocate buffer.
516	*/
517	bool fShouldFree;
518	PRTUNICP paCpsResult;
519	if (cCps > 0 && *ppaCps)
520	{
521	fShouldFree = false;
522	if (cCps <= cCpsResult)
523	return VERR_BUFFER_OVERFLOW;
524	paCpsResult = *ppaCps;
525	}
526	else
527	{
528	*ppaCps = NULL;
529	fShouldFree = true;
530	cCps = RT_MAX(cCpsResult + 1, cCps);
531	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
532	}
533	if (paCpsResult)
534	{
535	/*
536	* Encode the UTF-16 string.
537	*/
538	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
539	if (RT_SUCCESS(rc))
540	{
541	*ppaCps = paCpsResult;
542	return rc;
543	}
544	if (fShouldFree)
545	RTMemFree(paCpsResult);
546	}
547	else
548	rc = VERR_NO_CODE_POINT_MEMORY;
549	}
550	return rc;
551	}
552	RT_EXPORT_SYMBOL(RTStrToUniEx);
553
554
555	/**
556	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
557	*
558	* @returns IPRT status code.
559	* @param psz Pointer to the UTF-8 string.
560	* @param cch The max length of the string. (btw cch = cb)
561	* @param pcwc Where to store the length of the UTF-16 string as a number
562	* of RTUTF16 characters.
563	* @sa rtUtf8CalcUtf16Length
564	*/
565	static int rtUtf8CalcUtf16LengthN(const char psz, size_t cch, size_t pcwc)
566	{
567	const unsigned char puch = (const unsigned char )psz;
568	size_t cwc = 0;
569	while (cch > 0)
570	{
571	const unsigned char uch = *puch;
572	if (!(uch & RT_BIT(7)))
573	{
574	/* one ASCII byte */
575	if (uch)
576	{
577	cwc++;
578	puch++;
579	cch--;
580	}
581	else
582	break;
583	}
584	else
585	{
586	/*
587	* Multibyte sequence is more complicated when we have length
588	* restrictions on the input.
589	*/
590	/* figure sequence length and validate the first byte */
591	unsigned cb;
592	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
593	cb = 2;
594	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
595	cb = 3;
596	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
597	cb = 4;
598	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
599	cb = 5;
600	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
601	cb = 6;
602	else
603	{
604	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
605	return VERR_INVALID_UTF8_ENCODING;
606	}
607
608	/* check length */
609	if (cb > cch)
610	{
611	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
612	return VERR_INVALID_UTF8_ENCODING;
613	}
614
615	/* validate the rest */
616	switch (cb)
617	{
618	case 6:
619	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
620	RT_FALL_THRU();
621	case 5:
622	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
623	RT_FALL_THRU();
624	case 4:
625	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
626	RT_FALL_THRU();
627	case 3:
628	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
629	RT_FALL_THRU();
630	case 2:
631	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
632	break;
633	}
634
635	/* validate the code point. */
636	RTUNICP uc;
637	switch (cb)
638	{
639	case 6:
640	uc = (puch[5] & 0x3f)
641	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
642	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
643	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
644	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
645	\| ((RTUNICP)(uch & 0x01) << 30);
646	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
647	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
648	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
649	return VERR_CANT_RECODE_AS_UTF16;
650	case 5:
651	uc = (puch[4] & 0x3f)
652	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
653	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
654	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
655	\| ((RTUNICP)(uch & 0x03) << 24);
656	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
657	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
658	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
659	return VERR_CANT_RECODE_AS_UTF16;
660	case 4:
661	uc = (puch[3] & 0x3f)
662	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
663	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
664	\| ((RTUNICP)(uch & 0x07) << 18);
665	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
666	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
667	RTStrAssertMsgReturn(uc <= 0x0010ffff,
668	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
669	cwc++;
670	break;
671	case 3:
672	uc = (puch[2] & 0x3f)
673	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
674	\| ((RTUNICP)(uch & 0x0f) << 12);
675	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
676	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
677	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
678	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
679	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
680	break;
681	case 2:
682	uc = (puch[1] & 0x3f)
683	\| ((RTUNICP)(uch & 0x1f) << 6);
684	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
685	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
686	break;
687	}
688
689	/* advance */
690	cch -= cb;
691	puch += cb;
692	cwc++;
693	}
694	}
695
696	/* done */
697	*pcwc = cwc;
698	return VINF_SUCCESS;
699	}
700
701
702	/**
703	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
704	*
705	* @returns IPRT status code.
706	* @param psz Pointer to the UTF-8 string.
707	* @param pcwc Where to store the length of the UTF-16 string as a number
708	* of RTUTF16 characters.
709	* @sa rtUtf8CalcUtf16LengthN
710	*/
711	static int rtUtf8CalcUtf16Length(const char psz, size_t pcwc)
712	{
713	const unsigned char puch = (const unsigned char )psz;
714	size_t cwc = 0;
715	for (;;)
716	{
717	const unsigned char uch = *puch;
718	if (!(uch & RT_BIT(7)))
719	{
720	/* one ASCII byte */
721	if (uch)
722	{
723	cwc++;
724	puch++;
725	}
726	else
727	break;
728	}
729	else
730	{
731	/*
732	* Figure sequence length, implicitly validate the first byte.
733	* Then validate the additional bytes.
734	* Finally validate the code point.
735	*/
736	unsigned cb;
737	RTUNICP uc;
738	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
739	{
740	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
741	uc = (puch[1] & 0x3f)
742	\| ((RTUNICP)(uch & 0x1f) << 6);
743	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
744	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
745	cb = 2;
746	}
747	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
748	{
749	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
750	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
751	uc = (puch[2] & 0x3f)
752	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
753	\| ((RTUNICP)(uch & 0x0f) << 12);
754	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
755	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
756	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
757	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
758	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
759	cb = 3;
760	}
761	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
762	{
763	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
764	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
765	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
766	uc = (puch[3] & 0x3f)
767	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
768	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
769	\| ((RTUNICP)(uch & 0x07) << 18);
770	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
771	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
772	RTStrAssertMsgReturn(uc <= 0x0010ffff,
773	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
774	cwc++;
775	cb = 4;
776	}
777	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
778	{
779	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
780	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
781	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
782	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
783	uc = (puch[4] & 0x3f)
784	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
785	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
786	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
787	\| ((RTUNICP)(uch & 0x03) << 24);
788	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
789	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
790	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
791	return VERR_CANT_RECODE_AS_UTF16;
792	//cb = 5;
793	}
794	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
795	{
796	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
797	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
798	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
799	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
800	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
801	uc = (puch[5] & 0x3f)
802	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
803	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
804	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
805	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
806	\| ((RTUNICP)(uch & 0x01) << 30);
807	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
808	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
809	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
810	return VERR_CANT_RECODE_AS_UTF16;
811	//cb = 6;
812	}
813	else
814	{
815	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
816	return VERR_INVALID_UTF8_ENCODING;
817	}
818
819	/* advance */
820	puch += cb;
821	cwc++;
822	}
823	}
824
825	/* done */
826	*pcwc = cwc;
827	return VINF_SUCCESS;
828	}
829
830
831
832	/**
833	* Recodes a valid UTF-8 string as UTF-16.
834	*
835	* Since we know the input is valid, we do not perform encoding or length checks.
836	*
837	* @returns iprt status code.
838	* @param psz The UTF-8 string to recode. This is a valid encoding.
839	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
840	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
841	* @param pwsz Where to store the UTF-16 string.
842	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
843	*
844	* @note rtUtf8RecodeAsUtf16Big is a duplicate with RT_H2BE_U16 applied.
845	*/
846	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
847	{
848	int rc = VINF_SUCCESS;
849	const unsigned char puch = (const unsigned char )psz;
850	PRTUTF16 pwc = pwsz;
851	while (cch > 0)
852	{
853	/* read the next char and check for terminator. */
854	const unsigned char uch = *puch;
855	if (uch)
856	{ /* we only break once, so consider this the likely branch. */ }
857	else
858	break;
859
860	/* check for output overflow */
861	if (RT_LIKELY(cwc >= 1))
862	{ /* likely */ }
863	else
864	{
865	rc = VERR_BUFFER_OVERFLOW;
866	break;
867	}
868	cwc--;
869
870	/* decode and recode the code point */
871	if (!(uch & RT_BIT(7)))
872	{
873	*pwc++ = uch;
874	puch++;
875	cch--;
876	}
877	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
878	{
879	uint16_t uc = (puch[1] & 0x3f)
880	\| ((uint16_t)(uch & 0x1f) << 6);
881	*pwc++ = uc;
882	puch += 2;
883	cch -= 2;
884	}
885	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
886	{
887	uint16_t uc = (puch[2] & 0x3f)
888	\| ((uint16_t)(puch[1] & 0x3f) << 6)
889	\| ((uint16_t)(uch & 0x0f) << 12);
890	*pwc++ = uc;
891	puch += 3;
892	cch -= 3;
893	}
894	else
895	{
896	/* generate surrogate pair */
897	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
898	RTUNICP uc = (puch[3] & 0x3f)
899	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
900	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
901	\| ((RTUNICP)(uch & 0x07) << 18);
902	if (RT_UNLIKELY(cwc < 1))
903	{
904	rc = VERR_BUFFER_OVERFLOW;
905	break;
906	}
907	cwc--;
908
909	uc -= 0x10000;
910	*pwc++ = 0xd800 \| (uc >> 10);
911	*pwc++ = 0xdc00 \| (uc & 0x3ff);
912	puch += 4;
913	cch -= 4;
914	}
915	}
916
917	/* done */
918	*pwc = '\0';
919	return rc;
920	}
921
922
923	/**
924	* Recodes a valid UTF-8 string as UTF-16BE.
925	*
926	* Since we know the input is valid, we do not perform encoding or length checks.
927	*
928	* @returns iprt status code.
929	* @param psz The UTF-8 string to recode. This is a valid encoding.
930	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
931	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
932	* @param pwsz Where to store the UTF-16BE string.
933	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
934	*
935	* @note This is a copy of rtUtf8RecodeAsUtf16 with RT_H2BE_U16 applied.
936	*/
937	static int rtUtf8RecodeAsUtf16Big(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
938	{
939	int rc = VINF_SUCCESS;
940	const unsigned char puch = (const unsigned char )psz;
941	PRTUTF16 pwc = pwsz;
942	while (cch > 0)
943	{
944	/* read the next char and check for terminator. */
945	const unsigned char uch = *puch;
946	if (uch)
947	{ /* we only break once, so consider this the likely branch. */ }
948	else
949	break;
950
951	/* check for output overflow */
952	if (RT_LIKELY(cwc >= 1))
953	{ /* likely */ }
954	else
955	{
956	rc = VERR_BUFFER_OVERFLOW;
957	break;
958	}
959	cwc--;
960
961	/* decode and recode the code point */
962	if (!(uch & RT_BIT(7)))
963	{
964	*pwc++ = RT_H2BE_U16((RTUTF16)uch);
965	puch++;
966	cch--;
967	}
968	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
969	{
970	uint16_t uc = (puch[1] & 0x3f)
971	\| ((uint16_t)(uch & 0x1f) << 6);
972	*pwc++ = RT_H2BE_U16(uc);
973	puch += 2;
974	cch -= 2;
975	}
976	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
977	{
978	uint16_t uc = (puch[2] & 0x3f)
979	\| ((uint16_t)(puch[1] & 0x3f) << 6)
980	\| ((uint16_t)(uch & 0x0f) << 12);
981	*pwc++ = RT_H2BE_U16(uc);
982	puch += 3;
983	cch -= 3;
984	}
985	else
986	{
987	/* generate surrogate pair */
988	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
989	RTUNICP uc = (puch[3] & 0x3f)
990	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
991	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
992	\| ((RTUNICP)(uch & 0x07) << 18);
993	if (RT_UNLIKELY(cwc < 1))
994	{
995	rc = VERR_BUFFER_OVERFLOW;
996	break;
997	}
998	cwc--;
999
1000	uc -= 0x10000;
1001	*pwc++ = RT_H2BE_U16(0xd800 \| (uc >> 10));
1002	*pwc++ = RT_H2BE_U16(0xdc00 \| (uc & 0x3ff));
1003	puch += 4;
1004	cch -= 4;
1005	}
1006	}
1007
1008	/* done */
1009	*pwc = '\0';
1010	return rc;
1011	}
1012
1013
1014	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
1015	{
1016	/*
1017	* Validate input.
1018	*/
1019	AssertPtr(ppwszString);
1020	AssertPtr(pszString);
1021	*ppwszString = NULL;
1022
1023	/*
1024	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1025	*/
1026	size_t cwc;
1027	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1028	if (RT_SUCCESS(rc))
1029	{
1030	/*
1031	* Allocate buffer.
1032	*/
1033	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1034	if (pwsz)
1035	{
1036	/*
1037	* Encode the UTF-16 string.
1038	*/
1039	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1040	if (RT_SUCCESS(rc))
1041	{
1042	*ppwszString = pwsz;
1043	return rc;
1044	}
1045	RTMemFree(pwsz);
1046	}
1047	else
1048	rc = VERR_NO_UTF16_MEMORY;
1049	}
1050	return rc;
1051	}
1052	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
1053
1054
1055	RTDECL(int) RTStrToUtf16BigTag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
1056	{
1057	/*
1058	* Validate input.
1059	*/
1060	AssertPtr(ppwszString);
1061	AssertPtr(pszString);
1062	*ppwszString = NULL;
1063
1064	/*
1065	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1066	*/
1067	size_t cwc;
1068	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1069	if (RT_SUCCESS(rc))
1070	{
1071	/*
1072	* Allocate buffer.
1073	*/
1074	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1075	if (pwsz)
1076	{
1077	/*
1078	* Encode the UTF-16 string.
1079	*/
1080	rc = rtUtf8RecodeAsUtf16Big(pszString, RTSTR_MAX, pwsz, cwc);
1081	if (RT_SUCCESS(rc))
1082	{
1083	*ppwszString = pwsz;
1084	return rc;
1085	}
1086	RTMemFree(pwsz);
1087	}
1088	else
1089	rc = VERR_NO_UTF16_MEMORY;
1090	}
1091	return rc;
1092	}
1093	RT_EXPORT_SYMBOL(RTStrToUtf16BigTag);
1094
1095
1096	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
1097	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
1098	{
1099	/*
1100	* Validate input.
1101	*/
1102	AssertPtr(pszString);
1103	AssertPtr(ppwsz);
1104	AssertPtrNull(pcwc);
1105
1106	/*
1107	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1108	*/
1109	size_t cwcResult;
1110	int rc;
1111	if (cchString != RTSTR_MAX)
1112	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1113	else
1114	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1115	if (RT_SUCCESS(rc))
1116	{
1117	if (pcwc)
1118	*pcwc = cwcResult;
1119
1120	/*
1121	* Check buffer size / Allocate buffer.
1122	*/
1123	bool fShouldFree;
1124	PRTUTF16 pwszResult;
1125	if (cwc > 0 && *ppwsz)
1126	{
1127	fShouldFree = false;
1128	if (cwc <= cwcResult)
1129	return VERR_BUFFER_OVERFLOW;
1130	pwszResult = *ppwsz;
1131	}
1132	else
1133	{
1134	*ppwsz = NULL;
1135	fShouldFree = true;
1136	cwc = RT_MAX(cwcResult + 1, cwc);
1137	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1138	}
1139	if (pwszResult)
1140	{
1141	/*
1142	* Encode the UTF-16 string.
1143	*/
1144	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1145	if (RT_SUCCESS(rc))
1146	{
1147	*ppwsz = pwszResult;
1148	return rc;
1149	}
1150	if (fShouldFree)
1151	RTMemFree(pwszResult);
1152	}
1153	else
1154	rc = VERR_NO_UTF16_MEMORY;
1155	}
1156	return rc;
1157	}
1158	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1159
1160
1161	RTDECL(int) RTStrToUtf16BigExTag(const char *pszString, size_t cchString,
1162	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
1163	{
1164	/*
1165	* Validate input.
1166	*/
1167	AssertPtr(pszString);
1168	AssertPtr(ppwsz);
1169	AssertPtrNull(pcwc);
1170
1171	/*
1172	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1173	*/
1174	size_t cwcResult;
1175	int rc;
1176	if (cchString != RTSTR_MAX)
1177	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1178	else
1179	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1180	if (RT_SUCCESS(rc))
1181	{
1182	if (pcwc)
1183	*pcwc = cwcResult;
1184
1185	/*
1186	* Check buffer size / Allocate buffer.
1187	*/
1188	bool fShouldFree;
1189	PRTUTF16 pwszResult;
1190	if (cwc > 0 && *ppwsz)
1191	{
1192	fShouldFree = false;
1193	if (cwc <= cwcResult)
1194	return VERR_BUFFER_OVERFLOW;
1195	pwszResult = *ppwsz;
1196	}
1197	else
1198	{
1199	*ppwsz = NULL;
1200	fShouldFree = true;
1201	cwc = RT_MAX(cwcResult + 1, cwc);
1202	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1203	}
1204	if (pwszResult)
1205	{
1206	/*
1207	* Encode the UTF-16BE string.
1208	*/
1209	rc = rtUtf8RecodeAsUtf16Big(pszString, cchString, pwszResult, cwc - 1);
1210	if (RT_SUCCESS(rc))
1211	{
1212	*ppwsz = pwszResult;
1213	return rc;
1214	}
1215	if (fShouldFree)
1216	RTMemFree(pwszResult);
1217	}
1218	else
1219	rc = VERR_NO_UTF16_MEMORY;
1220	}
1221	return rc;
1222	}
1223	RT_EXPORT_SYMBOL(RTStrToUtf16BigExTag);
1224
1225
1226	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1227	{
1228	size_t cwc;
1229	int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1230	return RT_SUCCESS(rc) ? cwc : 0;
1231	}
1232	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1233
1234
1235	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
1236	{
1237	size_t cwc;
1238	int rc;
1239	if (cch != RTSTR_MAX)
1240	rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1241	else
1242	rc = rtUtf8CalcUtf16Length(psz, &cwc);
1243	if (pcwc)
1244	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1245	return rc;
1246	}
1247	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1248
1249
1250	/**
1251	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
1252	*
1253	* @returns iprt status code.
1254	* @param psz The Latin-1 string.
1255	* @param cchIn The max length of the Latin-1 string to consider.
1256	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1257	*/
1258	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
1259	{
1260	size_t cch = 0;
1261	for (;;)
1262	{
1263	RTUNICP Cp;
1264	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1265	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1266	break;
1267	if (RT_FAILURE(rc))
1268	return rc;
1269	cch += RTStrCpSize(Cp); /* cannot fail */
1270	}
1271
1272	/* done */
1273	*pcch = cch;
1274	return VINF_SUCCESS;
1275	}
1276
1277
1278	/**
1279	* Recodes a Latin-1 string as UTF-8.
1280	*
1281	* @returns iprt status code.
1282	* @param pszIn The Latin-1 string.
1283	* @param cchIn The number of characters to process from psz. The recoding
1284	* will stop when cch or '\\0' is reached.
1285	* @param psz Where to store the UTF-8 string.
1286	* @param cch The size of the UTF-8 buffer, excluding the terminator.
1287	*/
1288	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
1289	{
1290	int rc;
1291	for (;;)
1292	{
1293	RTUNICP Cp;
1294	size_t cchCp;
1295	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1296	if (Cp == 0 \|\| RT_FAILURE(rc))
1297	break;
1298	cchCp = RTStrCpSize(Cp);
1299	if (RT_UNLIKELY(cch < cchCp))
1300	{
1301	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1302	rc = VERR_BUFFER_OVERFLOW;
1303	break;
1304	}
1305	cch -= cchCp;
1306	psz = RTStrPutCp(psz, Cp);
1307	}
1308
1309	/* done */
1310	if (rc == VERR_END_OF_STRING)
1311	rc = VINF_SUCCESS;
1312	*psz = '\0';
1313	return rc;
1314	}
1315
1316
1317
1318	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
1319	{
1320	/*
1321	* Validate input.
1322	*/
1323	AssertPtr(ppszString);
1324	AssertPtr(pszString);
1325	*ppszString = NULL;
1326
1327	/*
1328	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1329	*/
1330	size_t cch;
1331	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1332	if (RT_SUCCESS(rc))
1333	{
1334	/*
1335	* Allocate buffer and recode it.
1336	*/
1337	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
1338	if (pszResult)
1339	{
1340	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1341	if (RT_SUCCESS(rc))
1342	{
1343	*ppszString = pszResult;
1344	return rc;
1345	}
1346
1347	RTMemFree(pszResult);
1348	}
1349	else
1350	rc = VERR_NO_STR_MEMORY;
1351	}
1352	return rc;
1353	}
1354	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1355
1356
1357	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
1358	{
1359	/*
1360	* Validate input.
1361	*/
1362	AssertPtr(pszString);
1363	AssertPtr(ppsz);
1364	AssertPtrNull(pcch);
1365
1366	/*
1367	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1368	*/
1369	size_t cchResult;
1370	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1371	if (RT_SUCCESS(rc))
1372	{
1373	if (pcch)
1374	*pcch = cchResult;
1375
1376	/*
1377	* Check buffer size / Allocate buffer and recode it.
1378	*/
1379	bool fShouldFree;
1380	char *pszResult;
1381	if (cch > 0 && *ppsz)
1382	{
1383	fShouldFree = false;
1384	if (RT_UNLIKELY(cch <= cchResult))
1385	return VERR_BUFFER_OVERFLOW;
1386	pszResult = *ppsz;
1387	}
1388	else
1389	{
1390	*ppsz = NULL;
1391	fShouldFree = true;
1392	cch = RT_MAX(cch, cchResult + 1);
1393	pszResult = (char *)RTStrAllocTag(cch, pszTag);
1394	}
1395	if (pszResult)
1396	{
1397	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1398	if (RT_SUCCESS(rc))
1399	{
1400	*ppsz = pszResult;
1401	return rc;
1402	}
1403
1404	if (fShouldFree)
1405	RTStrFree(pszResult);
1406	}
1407	else
1408	rc = VERR_NO_STR_MEMORY;
1409	}
1410	return rc;
1411	}
1412	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1413
1414
1415	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1416	{
1417	size_t cch;
1418	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1419	return RT_SUCCESS(rc) ? cch : 0;
1420	}
1421	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1422
1423
1424	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1425	{
1426	size_t cch;
1427	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1428	if (pcch)
1429	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1430	return rc;
1431	}
1432	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1433
1434
1435	/**
1436	* Calculates the Latin-1 length of a string, validating the encoding while
1437	* doing so.
1438	*
1439	* @returns IPRT status code.
1440	* @param psz Pointer to the UTF-8 string.
1441	* @param cchIn The max length of the string. (btw cch = cb)
1442	* Use RTSTR_MAX if all of the string is to be examined.
1443	* @param pcch Where to store the length of the Latin-1 string in bytes.
1444	*/
1445	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1446	{
1447	size_t cch = 0;
1448	for (;;)
1449	{
1450	RTUNICP Cp;
1451	size_t cchCp;
1452	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1453	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1454	break;
1455	if (RT_FAILURE(rc))
1456	return rc;
1457	cchCp = RTLatin1CpSize(Cp);
1458	if (cchCp == 0)
1459	return VERR_NO_TRANSLATION;
1460	cch += cchCp;
1461	}
1462
1463	/* done */
1464	*pcch = cch;
1465	return VINF_SUCCESS;
1466	}
1467
1468
1469	/**
1470	* Recodes a valid UTF-8 string as Latin-1.
1471	*
1472	* Since we know the input is valid, we do not perform encoding or length checks.
1473	*
1474	* @returns iprt status code.
1475	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1476	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1477	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1478	* @param psz Where to store the Latin-1 string.
1479	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1480	*/
1481	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1482	{
1483	int rc;
1484	for (;;)
1485	{
1486	RTUNICP Cp;
1487	size_t cchCp;
1488	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1489	if (Cp == 0 \|\| RT_FAILURE(rc))
1490	break;
1491	cchCp = RTLatin1CpSize(Cp);
1492	if (RT_UNLIKELY(cch < cchCp))
1493	{
1494	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1495	rc = VERR_BUFFER_OVERFLOW;
1496	break;
1497	}
1498	cch -= cchCp;
1499	psz = RTLatin1PutCp(psz, Cp);
1500	}
1501
1502	/* done */
1503	if (rc == VERR_END_OF_STRING)
1504	rc = VINF_SUCCESS;
1505	*psz = '\0';
1506	return rc;
1507	}
1508
1509
1510
1511	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1512	{
1513	/*
1514	* Validate input.
1515	*/
1516	AssertPtr(ppszString);
1517	AssertPtr(pszString);
1518	*ppszString = NULL;
1519
1520	/*
1521	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1522	*/
1523	size_t cch;
1524	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1525	if (RT_SUCCESS(rc))
1526	{
1527	/*
1528	* Allocate buffer.
1529	*/
1530	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1531	if (psz)
1532	{
1533	/*
1534	* Encode the UTF-16 string.
1535	*/
1536	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1537	if (RT_SUCCESS(rc))
1538	{
1539	*ppszString = psz;
1540	return rc;
1541	}
1542	RTMemFree(psz);
1543	}
1544	else
1545	rc = VERR_NO_STR_MEMORY;
1546	}
1547	return rc;
1548	}
1549	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1550
1551
1552	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1553	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1554	{
1555	/*
1556	* Validate input.
1557	*/
1558	AssertPtr(pszString);
1559	AssertPtr(ppsz);
1560	AssertPtrNull(pcch);
1561
1562	/*
1563	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1564	*/
1565	size_t cchResult;
1566	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1567	if (RT_SUCCESS(rc))
1568	{
1569	if (pcch)
1570	*pcch = cchResult;
1571
1572	/*
1573	* Check buffer size / Allocate buffer.
1574	*/
1575	bool fShouldFree;
1576	char *pszResult;
1577	if (cch > 0 && *ppsz)
1578	{
1579	fShouldFree = false;
1580	if (cch <= cchResult)
1581	return VERR_BUFFER_OVERFLOW;
1582	pszResult = *ppsz;
1583	}
1584	else
1585	{
1586	*ppsz = NULL;
1587	fShouldFree = true;
1588	cch = RT_MAX(cchResult + 1, cch);
1589	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1590	}
1591	if (pszResult)
1592	{
1593	/*
1594	* Encode the Latin-1 string.
1595	*/
1596	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1597	if (RT_SUCCESS(rc))
1598	{
1599	*ppsz = pszResult;
1600	return rc;
1601	}
1602	if (fShouldFree)
1603	RTMemFree(pszResult);
1604	}
1605	else
1606	rc = VERR_NO_STR_MEMORY;
1607	}
1608	return rc;
1609	}
1610	RT_EXPORT_SYMBOL(RTStrToLatin1ExTag);
1611
1612
1613	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1614	{
1615	size_t cch;
1616	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1617	return RT_SUCCESS(rc) ? cch : 0;
1618	}
1619	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1620
1621
1622	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1623	{
1624	size_t cch;
1625	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1626	if (pcch)
1627	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1628	return rc;
1629	}
1630	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1631
1632
1633	/**
1634	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1635	* @returns rc
1636	* @param ppsz The pointer to the string position point.
1637	* @param pCp Where to store RTUNICP_INVALID.
1638	* @param rc The iprt error code.
1639	*/
1640	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1641	{
1642	/*
1643	* Try find a valid encoding.
1644	*/
1645	(ppsz)++; /* @todo code this! */
1646	*pCp = RTUNICP_INVALID;
1647	return rc;
1648	}
1649
1650
1651	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1652	{
1653	RTUNICP Cp;
1654	RTStrGetCpExInternal(&psz, &Cp);
1655	return Cp;
1656	}
1657	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1658
1659
1660	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1661	{
1662	const unsigned char puch = (const unsigned char )*ppsz;
1663	const unsigned char uch = *puch;
1664	RTUNICP uc;
1665
1666	/* ASCII ? */
1667	if (!(uch & RT_BIT(7)))
1668	{
1669	uc = uch;
1670	puch++;
1671	}
1672	else if (uch & RT_BIT(6))
1673	{
1674	/* figure the length and validate the first octet. */
1675	/** @todo RT_USE_RTC_3629 */
1676	unsigned cb;
1677	if (!(uch & RT_BIT(5)))
1678	cb = 2;
1679	else if (!(uch & RT_BIT(4)))
1680	cb = 3;
1681	else if (!(uch & RT_BIT(3)))
1682	cb = 4;
1683	else if (!(uch & RT_BIT(2)))
1684	cb = 5;
1685	else if (!(uch & RT_BIT(1)))
1686	cb = 6;
1687	else
1688	{
1689	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1690	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1691	}
1692
1693	/* validate the rest */
1694	switch (cb)
1695	{
1696	case 6:
1697	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1698	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1699	RT_FALL_THRU();
1700	case 5:
1701	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1702	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1703	RT_FALL_THRU();
1704	case 4:
1705	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1706	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1707	RT_FALL_THRU();
1708	case 3:
1709	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1710	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1711	RT_FALL_THRU();
1712	case 2:
1713	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1714	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1715	break;
1716	}
1717
1718	/* get and validate the code point. */
1719	switch (cb)
1720	{
1721	case 6:
1722	uc = (puch[5] & 0x3f)
1723	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1724	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1725	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1726	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1727	\| ((RTUNICP)(uch & 0x01) << 30);
1728	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1729	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1730	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1731	break;
1732	case 5:
1733	uc = (puch[4] & 0x3f)
1734	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1735	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1736	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1737	\| ((RTUNICP)(uch & 0x03) << 24);
1738	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1739	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1740	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1741	break;
1742	case 4:
1743	uc = (puch[3] & 0x3f)
1744	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1745	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1746	\| ((RTUNICP)(uch & 0x07) << 18);
1747	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1748	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1749	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1750	break;
1751	case 3:
1752	uc = (puch[2] & 0x3f)
1753	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1754	\| ((RTUNICP)(uch & 0x0f) << 12);
1755	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1756	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1757	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1758	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1759	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1760	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1761	break;
1762	case 2:
1763	uc = (puch[1] & 0x3f)
1764	\| ((RTUNICP)(uch & 0x1f) << 6);
1765	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1766	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1767	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1768	break;
1769	default: /* impossible, but GCC is bitching. */
1770	uc = RTUNICP_INVALID;
1771	break;
1772	}
1773	puch += cb;
1774	}
1775	else
1776	{
1777	/* 6th bit is always set. */
1778	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1779	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1780	}
1781	*pCp = uc;
1782	ppsz = (const char )puch;
1783	return VINF_SUCCESS;
1784	}
1785	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1786
1787
1788	/**
1789	* Handle invalid encodings passed to RTStrGetCpNEx().
1790	* @returns rc
1791	* @param ppsz The pointer to the string position point.
1792	* @param pcch Pointer to the string length.
1793	* @param pCp Where to store RTUNICP_INVALID.
1794	* @param rc The iprt error code.
1795	*/
1796	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1797	{
1798	/*
1799	* Try find a valid encoding.
1800	*/
1801	(ppsz)++; /* @todo code this! */
1802	(*pcch)--;
1803	*pCp = RTUNICP_INVALID;
1804	return rc;
1805	}
1806
1807
1808	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1809	{
1810	const unsigned char puch = (const unsigned char )*ppsz;
1811	const unsigned char uch = *puch;
1812	size_t cch = *pcch;
1813	RTUNICP uc;
1814
1815	if (cch == 0)
1816	{
1817	*pCp = RTUNICP_INVALID;
1818	return VERR_END_OF_STRING;
1819	}
1820
1821	/* ASCII ? */
1822	if (!(uch & RT_BIT(7)))
1823	{
1824	uc = uch;
1825	puch++;
1826	cch--;
1827	}
1828	else if (uch & RT_BIT(6))
1829	{
1830	/* figure the length and validate the first octet. */
1831	/** @todo RT_USE_RTC_3629 */
1832	unsigned cb;
1833	if (!(uch & RT_BIT(5)))
1834	cb = 2;
1835	else if (!(uch & RT_BIT(4)))
1836	cb = 3;
1837	else if (!(uch & RT_BIT(3)))
1838	cb = 4;
1839	else if (!(uch & RT_BIT(2)))
1840	cb = 5;
1841	else if (!(uch & RT_BIT(1)))
1842	cb = 6;
1843	else
1844	{
1845	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1846	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1847	}
1848
1849	if (cb > cch)
1850	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1851
1852	/* validate the rest */
1853	switch (cb)
1854	{
1855	case 6:
1856	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1857	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1858	RT_FALL_THRU();
1859	case 5:
1860	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1861	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1862	RT_FALL_THRU();
1863	case 4:
1864	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1865	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1866	RT_FALL_THRU();
1867	case 3:
1868	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1869	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1870	RT_FALL_THRU();
1871	case 2:
1872	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1873	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1874	break;
1875	}
1876
1877	/* get and validate the code point. */
1878	switch (cb)
1879	{
1880	case 6:
1881	uc = (puch[5] & 0x3f)
1882	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1883	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1884	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1885	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1886	\| ((RTUNICP)(uch & 0x01) << 30);
1887	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1888	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1889	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1890	break;
1891	case 5:
1892	uc = (puch[4] & 0x3f)
1893	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1894	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1895	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1896	\| ((RTUNICP)(uch & 0x03) << 24);
1897	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1898	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1899	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1900	break;
1901	case 4:
1902	uc = (puch[3] & 0x3f)
1903	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1904	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1905	\| ((RTUNICP)(uch & 0x07) << 18);
1906	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1907	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1908	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1909	break;
1910	case 3:
1911	uc = (puch[2] & 0x3f)
1912	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1913	\| ((RTUNICP)(uch & 0x0f) << 12);
1914	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1915	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1916	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1917	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1918	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1919	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1920	break;
1921	case 2:
1922	uc = (puch[1] & 0x3f)
1923	\| ((RTUNICP)(uch & 0x1f) << 6);
1924	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1925	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1926	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1927	break;
1928	default: /* impossible, but GCC is bitching. */
1929	uc = RTUNICP_INVALID;
1930	break;
1931	}
1932	puch += cb;
1933	cch -= cb;
1934	}
1935	else
1936	{
1937	/* 6th bit is always set. */
1938	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1939	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1940	}
1941	*pCp = uc;
1942	ppsz = (const char )puch;
1943	(*pcch) = cch;
1944	return VINF_SUCCESS;
1945	}
1946	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1947
1948
1949	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1950	{
1951	unsigned char puch = (unsigned char )psz;
1952	if (uc < 0x80)
1953	*puch++ = (unsigned char )uc;
1954	else if (uc < 0x00000800)
1955	{
1956	*puch++ = 0xc0 \| (uc >> 6);
1957	*puch++ = 0x80 \| (uc & 0x3f);
1958	}
1959	else if (uc < 0x00010000)
1960	{
1961	/** @todo RT_USE_RTC_3629 */
1962	if ( uc < 0x0000d8000
1963	\|\| ( uc > 0x0000dfff
1964	&& uc < 0x0000fffe))
1965	{
1966	*puch++ = 0xe0 \| (uc >> 12);
1967	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1968	*puch++ = 0x80 \| (uc & 0x3f);
1969	}
1970	else
1971	{
1972	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1973	*puch++ = 0x7f;
1974	}
1975	}
1976	/** @todo RT_USE_RTC_3629 */
1977	else if (uc < 0x00200000)
1978	{
1979	*puch++ = 0xf0 \| (uc >> 18);
1980	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1981	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1982	*puch++ = 0x80 \| (uc & 0x3f);
1983	}
1984	else if (uc < 0x04000000)
1985	{
1986	*puch++ = 0xf8 \| (uc >> 24);
1987	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1988	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1989	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1990	*puch++ = 0x80 \| (uc & 0x3f);
1991	}
1992	else if (uc <= 0x7fffffff)
1993	{
1994	*puch++ = 0xfc \| (uc >> 30);
1995	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1996	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1997	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1998	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1999	*puch++ = 0x80 \| (uc & 0x3f);
2000	}
2001	else
2002	{
2003	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
2004	*puch++ = 0x7f;
2005	}
2006
2007	return (char *)puch;
2008	}
2009	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
2010
2011
2012	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
2013	{
2014	if (pszStart < psz)
2015	{
2016	/* simple char? */
2017	const unsigned char puch = (const unsigned char )psz;
2018	unsigned uch = *--puch;
2019	if (!(uch & RT_BIT(7)))
2020	return (char *)puch;
2021	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
2022
2023	/* two or more. */
2024	uint32_t uMask = 0xffffffc0;
2025	while ( (const unsigned char *)pszStart < puch
2026	&& !(uMask & 1))
2027	{
2028	uch = *--puch;
2029	if ((uch & 0xc0) != 0x80)
2030	{
2031	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
2032	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
2033	(char *)pszStart);
2034	return (char *)puch;
2035	}
2036	uMask >>= 1;
2037	}
2038	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
2039	}
2040	return (char *)pszStart;
2041	}
2042	RT_EXPORT_SYMBOL(RTStrPrevCp);
2043

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 98103

Download in other formats: