VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16-latin-1.cpp@ 53528

Last change on this file since 53528 was 51770, checked in by vboxsync, 11 years ago

Merged in iprt++ dev branch.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 13.4 KB
Line 
1/* $Id: utf-16-latin-1.cpp 51770 2014-07-01 18:14:02Z vboxsync $ */
2/** @file
3 * IPRT - Latin-1 and UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2014 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41/**
42 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
43 *
44 * @returns iprt status code.
45 * @param pwsz The UTF-16 string.
46 * @param cwc The max length of the UTF-16 string to consider.
47 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
48 */
49static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
50{
51 int rc = VINF_SUCCESS;
52 size_t cch = 0;
53 while (cwc > 0)
54 {
55 RTUTF16 wc = *pwsz++; cwc--;
56 if (!wc)
57 break;
58 else if (RT_LIKELY(wc < 0x100))
59 ++cch;
60 else
61 {
62 if (wc < 0xd800 || wc > 0xdfff)
63 {
64 if (wc >= 0xfffe)
65 {
66 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
67 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
68 break;
69 }
70 }
71 else
72 {
73 if (wc >= 0xdc00)
74 {
75 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
76 rc = VERR_INVALID_UTF16_ENCODING;
77 break;
78 }
79 if (cwc <= 0)
80 {
81 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
82 rc = VERR_INVALID_UTF16_ENCODING;
83 break;
84 }
85 wc = *pwsz++; cwc--;
86 if (wc < 0xdc00 || wc > 0xdfff)
87 {
88 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
89 rc = VERR_INVALID_UTF16_ENCODING;
90 break;
91 }
92 }
93
94 rc = VERR_NO_TRANSLATION;
95 break;
96 }
97 }
98
99 /* done */
100 *pcch = cch;
101 return rc;
102}
103
104
105/**
106 * Recodes an valid UTF-16 string as Latin1.
107 *
108 * @returns iprt status code.
109 * @param pwsz The UTF-16 string.
110 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
111 * will stop when cwc or '\\0' is reached.
112 * @param psz Where to store the Latin1 string.
113 * @param cch The size of the Latin1 buffer, excluding the terminator.
114 */
115static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
116{
117 unsigned char *pch = (unsigned char *)psz;
118 int rc = VINF_SUCCESS;
119 while (cwc > 0)
120 {
121 RTUTF16 wc = *pwsz++; cwc--;
122 if (!wc)
123 break;
124 if (RT_LIKELY(wc < 0x100))
125 {
126 if (RT_UNLIKELY(cch < 1))
127 {
128 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
129 rc = VERR_BUFFER_OVERFLOW;
130 break;
131 }
132 cch--;
133 *pch++ = (unsigned char)wc;
134 }
135 else
136 {
137 if (wc < 0xd800 || wc > 0xdfff)
138 {
139 if (wc >= 0xfffe)
140 {
141 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
142 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
143 break;
144 }
145 }
146 else
147 {
148 if (wc >= 0xdc00)
149 {
150 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
151 rc = VERR_INVALID_UTF16_ENCODING;
152 break;
153 }
154 if (cwc <= 0)
155 {
156 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
157 rc = VERR_INVALID_UTF16_ENCODING;
158 break;
159 }
160 RTUTF16 wc2 = *pwsz++; cwc--;
161 if (wc2 < 0xdc00 || wc2 > 0xdfff)
162 {
163 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
164 rc = VERR_INVALID_UTF16_ENCODING;
165 break;
166 }
167 }
168
169 rc = VERR_NO_TRANSLATION;
170 break;
171 }
172 }
173
174 /* done */
175 *pch = '\0';
176 return rc;
177}
178
179
180RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
181{
182 /*
183 * Validate input.
184 */
185 Assert(VALID_PTR(ppszString));
186 Assert(VALID_PTR(pwszString));
187 *ppszString = NULL;
188
189 /*
190 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
191 */
192 size_t cch;
193 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
194 if (RT_SUCCESS(rc))
195 {
196 /*
197 * Allocate buffer and recode it.
198 */
199 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
200 if (pszResult)
201 {
202 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
203 if (RT_SUCCESS(rc))
204 {
205 *ppszString = pszResult;
206 return rc;
207 }
208
209 RTMemFree(pszResult);
210 }
211 else
212 rc = VERR_NO_STR_MEMORY;
213 }
214 return rc;
215}
216RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag);
217
218
219RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
220{
221 /*
222 * Validate input.
223 */
224 AssertPtr(pwszString);
225 AssertPtr(ppsz);
226 AssertPtrNull(pcch);
227
228 /*
229 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
230 */
231 size_t cchResult;
232 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
233 if (RT_SUCCESS(rc))
234 {
235 if (pcch)
236 *pcch = cchResult;
237
238 /*
239 * Check buffer size / Allocate buffer and recode it.
240 */
241 bool fShouldFree;
242 char *pszResult;
243 if (cch > 0 && *ppsz)
244 {
245 fShouldFree = false;
246 if (cch <= cchResult)
247 return VERR_BUFFER_OVERFLOW;
248 pszResult = *ppsz;
249 }
250 else
251 {
252 *ppsz = NULL;
253 fShouldFree = true;
254 cch = RT_MAX(cch, cchResult + 1);
255 pszResult = (char *)RTMemAllocTag(cch, pszTag);
256 }
257 if (pszResult)
258 {
259 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
260 if (RT_SUCCESS(rc))
261 {
262 *ppsz = pszResult;
263 return rc;
264 }
265
266 if (fShouldFree)
267 RTMemFree(pszResult);
268 }
269 else
270 rc = VERR_NO_STR_MEMORY;
271 }
272 return rc;
273}
274RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag);
275
276
277RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
278{
279 size_t cch;
280 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
281 return RT_SUCCESS(rc) ? cch : 0;
282}
283RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
284
285
286RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
287{
288 size_t cch;
289 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
290 if (pcch)
291 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
292 return rc;
293}
294RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
295
296
297/**
298 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
299 * original length, but the function saves us nasty comments to that effect
300 * all over the place.
301 *
302 * @returns IPRT status code.
303 * @param psz Pointer to the Latin1 string.
304 * @param cch The max length of the string. (btw cch = cb)
305 * Use RTSTR_MAX if all of the string is to be examined.s
306 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
307 */
308static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
309{
310 *pcwc = RTStrNLen(psz, cch);
311 return VINF_SUCCESS;
312}
313
314
315/**
316 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
317 * sixteen bits, as Unicode is a superset of Latin1.
318 *
319 * Since we know the input is valid, we do *not* perform length checks.
320 *
321 * @returns iprt status code.
322 * @param psz The Latin1 string to recode.
323 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
324 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
325 * @param pwsz Where to store the UTF-16 string.
326 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
327 */
328static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
329{
330 int rc = VINF_SUCCESS;
331 const unsigned char *puch = (const unsigned char *)psz;
332 PRTUTF16 pwc = pwsz;
333 while (cch-- > 0)
334 {
335 /* read the next char and check for terminator. */
336 const unsigned char uch = *puch;
337 if (!uch)
338 break;
339
340 /* check for output overflow */
341 if (RT_UNLIKELY(cwc < 1))
342 {
343 rc = VERR_BUFFER_OVERFLOW;
344 break;
345 }
346
347 /* expand the code point */
348 *pwc++ = uch;
349 cwc--;
350 puch++;
351 }
352
353 /* done */
354 *pwc = '\0';
355 return rc;
356}
357
358
359RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
360{
361 /*
362 * Validate input.
363 */
364 Assert(VALID_PTR(ppwszString));
365 Assert(VALID_PTR(pszString));
366 *ppwszString = NULL;
367
368 /*
369 * Validate the input and calculate the length of the UTF-16 string.
370 */
371 size_t cwc;
372 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
373 if (RT_SUCCESS(rc))
374 {
375 /*
376 * Allocate buffer.
377 */
378 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
379 if (pwsz)
380 {
381 /*
382 * Encode the UTF-16 string.
383 */
384 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
385 if (RT_SUCCESS(rc))
386 {
387 *ppwszString = pwsz;
388 return rc;
389 }
390 RTMemFree(pwsz);
391 }
392 else
393 rc = VERR_NO_UTF16_MEMORY;
394 }
395 return rc;
396}
397RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag);
398
399
400RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString,
401 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
402{
403 /*
404 * Validate input.
405 */
406 Assert(VALID_PTR(pszString));
407 Assert(VALID_PTR(ppwsz));
408 Assert(!pcwc || VALID_PTR(pcwc));
409
410 /*
411 * Validate the input and calculate the length of the UTF-16 string.
412 */
413 size_t cwcResult;
414 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
415 if (RT_SUCCESS(rc))
416 {
417 if (pcwc)
418 *pcwc = cwcResult;
419
420 /*
421 * Check buffer size / Allocate buffer.
422 */
423 bool fShouldFree;
424 PRTUTF16 pwszResult;
425 if (cwc > 0 && *ppwsz)
426 {
427 fShouldFree = false;
428 if (cwc <= cwcResult)
429 return VERR_BUFFER_OVERFLOW;
430 pwszResult = *ppwsz;
431 }
432 else
433 {
434 *ppwsz = NULL;
435 fShouldFree = true;
436 cwc = RT_MAX(cwcResult + 1, cwc);
437 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
438 }
439 if (pwszResult)
440 {
441 /*
442 * Encode the UTF-16 string.
443 */
444 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
445 if (RT_SUCCESS(rc))
446 {
447 *ppwsz = pwszResult;
448 return rc;
449 }
450 if (fShouldFree)
451 RTMemFree(pwszResult);
452 }
453 else
454 rc = VERR_NO_UTF16_MEMORY;
455 }
456 return rc;
457}
458RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag);
459
460
461RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
462{
463 size_t cwc;
464 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
465 return RT_SUCCESS(rc) ? cwc : 0;
466}
467RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
468
469
470RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
471{
472 size_t cwc;
473 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
474 if (pcwc)
475 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
476 return rc;
477}
478RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
479
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette