VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/posix/utf8-posix.cpp@ 76881

Last change on this file since 76881 was 76553, checked in by vboxsync, 6 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 17.1 KB
Line 
1/* $Id: utf8-posix.cpp 76553 2019-01-01 01:45:53Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 helpers, POSIX.
4 */
5
6/*
7 * Copyright (C) 2006-2019 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/alloc.h>
35#include <iprt/assert.h>
36#include <iprt/err.h>
37#include <iprt/string.h>
38
39#include <errno.h>
40#include <locale.h>
41
42/* iconv prototype changed with 165+ (thanks to PSARC/2010/160 Bugster 7037400) */
43#if defined(RT_OS_SOLARIS)
44# if !defined(_XPG6)
45# define IPRT_XPG6_TMP_DEF
46# define _XPG6
47# endif
48# if defined(__USE_LEGACY_PROTOTYPES__)
49# define IPRT_LEGACY_PROTO_TMP_DEF
50# undef __USE_LEGACY_PROTOTYPES__
51# endif
52#endif /* RT_OS_SOLARIS */
53
54# include <iconv.h>
55
56#if defined(RT_OS_SOLARIS)
57# if defined(IPRT_XPG6_TMP_DEF)
58# undef _XPG6
59# undef IPRT_XPG6_TMP_DEF
60# endif
61# if defined(IPRT_LEGACY_PROTO_TMP_DEF)
62# define __USE_LEGACY_PROTOTYPES__
63# undef IPRT_LEGACY_PROTO_TMP_DEF
64# endif
65#endif /* RT_OS_SOLARIS */
66
67#include <wctype.h>
68
69#include <langinfo.h>
70
71#include "internal/alignmentchecks.h"
72#include "internal/string.h"
73#ifdef RT_WITH_ICONV_CACHE
74# include "internal/thread.h"
75AssertCompile(sizeof(iconv_t) <= sizeof(void *));
76#endif
77
78
/*
 * Platforms disagree on whether the iconv() input buffer argument is
 * 'char **' or 'const char **'.  NON_CONST_ICONV_INPUT is defined for the
 * platforms where the conversion code below casts to the non-const form.
 */
#if    defined(RT_OS_LINUX) \
    || defined(RT_OS_HAIKU) \
    || defined(RT_OS_SOLARIS) \
    || (defined(RT_OS_DARWIN) && defined(_DARWIN_FEATURE_UNIX_CONFORMANCE))
# define NON_CONST_ICONV_INPUT
#endif
#ifdef RT_OS_FREEBSD
# include <sys/param.h>
/* Changed around 10.2.2 (https://svnweb.freebsd.org/base?view=revision&revision=281550) */
# if __FreeBSD_version < 1002000
#  error __FreeBSD_version__
# else
#  define NON_CONST_ICONV_INPUT
# endif
#endif
92
93
94/**
95 * Gets the codeset of the current locale (LC_CTYPE).
96 *
97 * @returns Pointer to read-only string with the codeset name.
98 */
99DECLHIDDEN(const char *) rtStrGetLocaleCodeset(void)
100{
101 return nl_langinfo(CODESET);
102}
103
104
105#ifdef RT_WITH_ICONV_CACHE
106
107/**
108 * Initializes the iconv handle cache associated with a thread.
109 *
110 * @param pThread The thread in question.
111 */
112DECLHIDDEN(void) rtStrIconvCacheInit(PRTTHREADINT pThread)
113{
114 for (size_t i = 0; i < RT_ELEMENTS(pThread->ahIconvs); i++)
115 pThread->ahIconvs[i] = (iconv_t)-1;
116}
117
118/**
119 * Destroys the iconv handle cache associated with a thread.
120 *
121 * @param pThread The thread in question.
122 */
123DECLHIDDEN(void) rtStrIconvCacheDestroy(PRTTHREADINT pThread)
124{
125 for (size_t i = 0; i < RT_ELEMENTS(pThread->ahIconvs); i++)
126 {
127 iconv_t hIconv = (iconv_t)pThread->ahIconvs[i];
128 pThread->ahIconvs[i] = (iconv_t)-1;
129 if (hIconv != (iconv_t)-1)
130 iconv_close(hIconv);
131 }
132}
133
134
135/**
136 * Converts a string from one charset to another.
137 *
138 * @returns iprt status code.
139 * @param pvInput Pointer to intput string.
140 * @param cbInput Size (in bytes) of input string. Excludes any terminators.
141 * @param pszInputCS Codeset of the input string.
142 * @param ppvOutput Pointer to pointer to output buffer if cbOutput > 0.
143 * If cbOutput is 0 this is where the pointer to the allocated
144 * buffer is stored.
145 * @param cbOutput Size of the passed in buffer.
146 * @param pszOutputCS Codeset of the input string.
147 * @param cFactor Input vs. output size factor.
148 * @param phIconv Pointer to the cache entry.
149 */
150static int rtstrConvertCached(const void *pvInput, size_t cbInput, const char *pszInputCS,
151 void **ppvOutput, size_t cbOutput, const char *pszOutputCS,
152 unsigned cFactor, iconv_t *phIconv)
153{
154 /*
155 * Allocate buffer
156 */
157 bool fUcs2Term;
158 void *pvOutput;
159 size_t cbOutput2;
160 if (!cbOutput)
161 {
162 cbOutput2 = cbInput * cFactor;
163 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
164 if (!pvOutput)
165 return VERR_NO_TMP_MEMORY;
166 fUcs2Term = true;
167 }
168 else
169 {
170 pvOutput = *ppvOutput;
171 fUcs2Term = !strcmp(pszOutputCS, "UCS-2")
172 || !strcmp(pszOutputCS, "UTF-16")
173 || !strcmp(pszOutputCS, "ucs-2")
174 || !strcmp(pszOutputCS, "utf-16");
175 cbOutput2 = cbOutput - (fUcs2Term ? sizeof(RTUTF16) : 1);
176 if (cbOutput2 > cbOutput)
177 return VERR_BUFFER_OVERFLOW;
178 }
179
180 /*
181 * Use a loop here to retry with bigger buffers.
182 */
183 for (unsigned cTries = 10; cTries > 0; cTries--)
184 {
185 /*
186 * Create conversion object if necessary.
187 */
188 iconv_t hIconv = (iconv_t)*phIconv;
189 if (hIconv == (iconv_t)-1)
190 {
191#if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD)
192 /* Some systems don't grok empty codeset strings, so help them find the current codeset. */
193 if (!*pszInputCS)
194 pszInputCS = rtStrGetLocaleCodeset();
195 if (!*pszOutputCS)
196 pszOutputCS = rtStrGetLocaleCodeset();
197#endif
198 IPRT_ALIGNMENT_CHECKS_DISABLE(); /* glibc causes trouble */
199 *phIconv = hIconv = iconv_open(pszOutputCS, pszInputCS);
200 IPRT_ALIGNMENT_CHECKS_ENABLE();
201 }
202 if (hIconv != (iconv_t)-1)
203 {
204 /*
205 * Do the conversion.
206 */
207 size_t cbInLeft = cbInput;
208 size_t cbOutLeft = cbOutput2;
209 const void *pvInputLeft = pvInput;
210 void *pvOutputLeft = pvOutput;
211 size_t cchNonRev;
212#ifdef NON_CONST_ICONV_INPUT
213 cchNonRev = iconv(hIconv, (char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
214#else
215 cchNonRev = iconv(hIconv, (const char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
216#endif
217 if (cchNonRev != (size_t)-1)
218 {
219 if (!cbInLeft)
220 {
221 /*
222 * We're done, just add the terminator and return.
223 * (Two terminators to support UCS-2 output, too.)
224 */
225 ((char *)pvOutputLeft)[0] = '\0';
226 if (fUcs2Term)
227 ((char *)pvOutputLeft)[1] = '\0';
228 *ppvOutput = pvOutput;
229 if (cchNonRev == 0)
230 return VINF_SUCCESS;
231 return VWRN_NO_TRANSLATION;
232 }
233 errno = E2BIG;
234 }
235
236 /*
237 * If we failed because of output buffer space we'll
238 * increase the output buffer size and retry.
239 */
240 if (errno == E2BIG)
241 {
242 if (!cbOutput)
243 {
244 RTMemTmpFree(pvOutput);
245 cbOutput2 *= 2;
246 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
247 if (!pvOutput)
248 return VERR_NO_TMP_MEMORY;
249 continue;
250 }
251 return VERR_BUFFER_OVERFLOW;
252 }
253
254 /*
255 * Close the handle on all other errors to make sure we won't carry
256 * any bad state with us.
257 */
258 *phIconv = (iconv_t)-1;
259 iconv_close(hIconv);
260 }
261 break;
262 }
263
264 /* failure */
265 if (!cbOutput)
266 RTMemTmpFree(pvOutput);
267 return VERR_NO_TRANSLATION;
268}
269
270#endif /* RT_WITH_ICONV_CACHE */
271
272/**
273 * Converts a string from one charset to another without using the handle cache.
274 *
275 * @returns IPRT status code.
276 *
277 * @param pvInput Pointer to intput string.
278 * @param cbInput Size (in bytes) of input string. Excludes any terminators.
279 * @param pszInputCS Codeset of the input string.
280 * @param ppvOutput Pointer to pointer to output buffer if cbOutput > 0.
281 * If cbOutput is 0 this is where the pointer to the allocated
282 * buffer is stored.
283 * @param cbOutput Size of the passed in buffer.
284 * @param pszOutputCS Codeset of the input string.
285 * @param cFactor Input vs. output size factor.
286 */
287static int rtStrConvertUncached(const void *pvInput, size_t cbInput, const char *pszInputCS,
288 void **ppvOutput, size_t cbOutput, const char *pszOutputCS,
289 unsigned cFactor)
290{
291 /*
292 * Allocate buffer
293 */
294 bool fUcs2Term;
295 void *pvOutput;
296 size_t cbOutput2;
297 if (!cbOutput)
298 {
299 cbOutput2 = cbInput * cFactor;
300 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
301 if (!pvOutput)
302 return VERR_NO_TMP_MEMORY;
303 fUcs2Term = true;
304 }
305 else
306 {
307 pvOutput = *ppvOutput;
308 fUcs2Term = !strcmp(pszOutputCS, "UCS-2");
309 cbOutput2 = cbOutput - (fUcs2Term ? sizeof(RTUTF16) : 1);
310 if (cbOutput2 > cbOutput)
311 return VERR_BUFFER_OVERFLOW;
312 }
313
314 /*
315 * Use a loop here to retry with bigger buffers.
316 */
317 for (unsigned cTries = 10; cTries > 0; cTries--)
318 {
319 /*
320 * Create conversion object.
321 */
322#if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD)
323 /* Some systems don't grok empty codeset strings, so help them find the current codeset. */
324 if (!*pszInputCS)
325 pszInputCS = rtStrGetLocaleCodeset();
326 if (!*pszOutputCS)
327 pszOutputCS = rtStrGetLocaleCodeset();
328#endif
329 IPRT_ALIGNMENT_CHECKS_DISABLE(); /* glibc causes trouble */
330 iconv_t icHandle = iconv_open(pszOutputCS, pszInputCS);
331 IPRT_ALIGNMENT_CHECKS_ENABLE();
332 if (icHandle != (iconv_t)-1)
333 {
334 /*
335 * Do the conversion.
336 */
337 size_t cbInLeft = cbInput;
338 size_t cbOutLeft = cbOutput2;
339 const void *pvInputLeft = pvInput;
340 void *pvOutputLeft = pvOutput;
341 size_t cchNonRev;
342#ifdef NON_CONST_ICONV_INPUT
343 cchNonRev = iconv(icHandle, (char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
344#else
345 cchNonRev = iconv(icHandle, (const char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
346#endif
347 if (cchNonRev != (size_t)-1)
348 {
349 if (!cbInLeft)
350 {
351 /*
352 * We're done, just add the terminator and return.
353 * (Two terminators to support UCS-2 output, too.)
354 */
355 iconv_close(icHandle);
356 ((char *)pvOutputLeft)[0] = '\0';
357 if (fUcs2Term)
358 ((char *)pvOutputLeft)[1] = '\0';
359 *ppvOutput = pvOutput;
360 if (cchNonRev == 0)
361 return VINF_SUCCESS;
362 return VWRN_NO_TRANSLATION;
363 }
364 errno = E2BIG;
365 }
366 iconv_close(icHandle);
367
368 /*
369 * If we failed because of output buffer space we'll
370 * increase the output buffer size and retry.
371 */
372 if (errno == E2BIG)
373 {
374 if (!cbOutput)
375 {
376 RTMemTmpFree(pvOutput);
377 cbOutput2 *= 2;
378 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
379 if (!pvOutput)
380 return VERR_NO_TMP_MEMORY;
381 continue;
382 }
383 return VERR_BUFFER_OVERFLOW;
384 }
385 }
386 break;
387 }
388
389 /* failure */
390 if (!cbOutput)
391 RTMemTmpFree(pvOutput);
392 return VERR_NO_TRANSLATION;
393}
394
395
396/**
397 * Wrapper that selects rtStrConvertCached or rtStrConvertUncached.
398 *
399 * @returns IPRT status code.
400 *
401 * @param pszInput Pointer to intput string.
402 * @param cchInput Size (in bytes) of input string. Excludes any
403 * terminators.
404 * @param pszInputCS Codeset of the input string.
405 * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
406 * If cbOutput is 0 this is where the pointer to the
407 * allocated buffer is stored.
408 * @param cbOutput Size of the passed in buffer.
409 * @param pszOutputCS Codeset of the input string.
410 * @param cFactor Input vs. output size factor.
411 * @param enmCacheIdx The iconv cache index.
412 */
413DECLINLINE(int) rtStrConvertWrapper(const char *pchInput, size_t cchInput, const char *pszInputCS,
414 char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
415 unsigned cFactor, RTSTRICONV enmCacheIdx)
416{
417#ifdef RT_WITH_ICONV_CACHE
418 RTTHREAD hSelf = RTThreadSelf();
419 if (hSelf != NIL_RTTHREAD)
420 {
421 PRTTHREADINT pThread = rtThreadGet(hSelf);
422 if (pThread)
423 {
424 if ((pThread->fIntFlags & (RTTHREADINT_FLAGS_ALIEN | RTTHREADINT_FLAGS_MAIN)) != RTTHREADINT_FLAGS_ALIEN)
425 {
426 int rc = rtstrConvertCached(pchInput, cchInput, pszInputCS,
427 (void **)ppszOutput, cbOutput, pszOutputCS,
428 cFactor, (iconv_t *)&pThread->ahIconvs[enmCacheIdx]);
429 rtThreadRelease(pThread);
430 return rc;
431 }
432 rtThreadRelease(pThread);
433 }
434 }
435#endif
436 return rtStrConvertUncached(pchInput, cchInput, pszInputCS,
437 (void **)ppszOutput, cbOutput, pszOutputCS,
438 cFactor);
439}
440
441
442/**
443 * Internal API for use by the path conversion code.
444 *
445 * @returns IPRT status code.
446 *
447 * @param pszInput Pointer to intput string.
448 * @param cchInput Size (in bytes) of input string. Excludes any
449 * terminators.
450 * @param pszInputCS Codeset of the input string.
451 * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
452 * If cbOutput is 0 this is where the pointer to the
453 * allocated buffer is stored.
454 * @param cbOutput Size of the passed in buffer.
455 * @param pszOutputCS Codeset of the input string.
456 * @param cFactor Input vs. output size factor.
457 * @param enmCacheIdx The iconv cache index.
458 */
459DECLHIDDEN(int) rtStrConvert(const char *pchInput, size_t cchInput, const char *pszInputCS,
460 char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
461 unsigned cFactor, RTSTRICONV enmCacheIdx)
462{
463 Assert(enmCacheIdx >= 0 && enmCacheIdx < RTSTRICONV_END);
464 return rtStrConvertWrapper(pchInput, cchInput, pszInputCS,
465 ppszOutput, cbOutput, pszOutputCS,
466 cFactor, enmCacheIdx);
467}
468
469
470RTR3DECL(int) RTStrUtf8ToCurrentCPTag(char **ppszString, const char *pszString, const char *pszTag)
471{
472 Assert(ppszString);
473 Assert(pszString);
474 *ppszString = NULL;
475
476 /*
477 * Assume result string length is not longer than UTF-8 string.
478 */
479 size_t cch = strlen(pszString);
480 if (cch <= 0)
481 {
482 /* zero length string passed. */
483 *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
484 if (*ppszString)
485 return VINF_SUCCESS;
486 return VERR_NO_TMP_MEMORY;
487 }
488 return rtStrConvertWrapper(pszString, cch, "UTF-8", ppszString, 0, "", 1, RTSTRICONV_UTF8_TO_LOCALE);
489}
490
491
492RTR3DECL(int) RTStrCurrentCPToUtf8Tag(char **ppszString, const char *pszString, const char *pszTag)
493{
494 Assert(ppszString);
495 Assert(pszString);
496 *ppszString = NULL;
497
498 /*
499 * Attempt with UTF-8 length of 2x the native length.
500 */
501 size_t cch = strlen(pszString);
502 if (cch <= 0)
503 {
504 /* zero length string passed. */
505 *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
506 if (*ppszString)
507 return VINF_SUCCESS;
508 return VERR_NO_TMP_MEMORY;
509 }
510 return rtStrConvertWrapper(pszString, cch, "", ppszString, 0, "UTF-8", 2, RTSTRICONV_LOCALE_TO_UTF8);
511}
512
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette