nsNativeCharsetUtils.cpp

Last change on this file was 104490, checked in by vboxsync, 6 months ago
src/libs/xpcom/io: Some smaller cleanups, fix a possible double expansion of a statement with side effects passed to a macro, bugref:3409
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 26.4 KB

Line
1	/* *** BEGIN LICENSE BLOCK ***
2	* Version: MPL 1.1/GPL 2.0/LGPL 2.1
3	*
4	* The contents of this file are subject to the Mozilla Public License Version
5	* 1.1 (the "License"); you may not use this file except in compliance with
6	* the License. You may obtain a copy of the License at
7	* http://www.mozilla.org/MPL/
8	*
9	* Software distributed under the License is distributed on an "AS IS" basis,
10	* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11	* for the specific language governing rights and limitations under the
12	* License.
13	*
14	* The Original Code is Mozilla.
15	*
16	* The Initial Developer of the Original Code is
17	* Netscape Communications Corporation.
18	* Portions created by the Initial Developer are Copyright (C) 2002
19	* the Initial Developer. All Rights Reserved.
20	*
21	* Contributor(s):
22	* Darin Fisher <darin@netscape.com>
23	* Brian Stell <bstell@ix.netcom.com>
24	* Frank Tang <ftang@netscape.com>
25	* Brendan Eich <brendan@mozilla.org>
26	* Sergei Dolgov <sergei_d@fi.fi.tartu.ee>
27	*
28	* Alternatively, the contents of this file may be used under the terms of
29	* either the GNU General Public License Version 2 or later (the "GPL"), or
30	* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31	* in which case the provisions of the GPL or the LGPL are applicable instead
32	* of those above. If you wish to allow use of your version of this file only
33	* under the terms of either the GPL or the LGPL, and not to allow others to
34	* use your version of this file under the terms of the MPL, indicate your
35	* decision by deleting the provisions above and replace them with the notice
36	* and other provisions required by the GPL or the LGPL. If you do not delete
37	* the provisions above, a recipient may use your version of this file under
38	* the terms of any one of the MPL, the GPL or the LGPL.
39	*
40	* *** END LICENSE BLOCK *** */
41
42	#include "xpcom-private.h"
43
44	//-----------------------------------------------------------------------------
45	// XP_UNIX
46	//-----------------------------------------------------------------------------
47	#if defined(XP_UNIX)
48
49	#include <stdlib.h> // mbtowc, wctomb
50	#include <locale.h> // setlocale
51	#include "nscore.h"
52	#include "nsAString.h"
53	#include "nsReadableUtils.h"
54
55	#include <iprt/assert.h>
56	#include <iprt/errcore.h>
57	#include <iprt/semaphore.h>
58
59	//
60	// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
61	// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
62	// or not (see bug 206811 and
63	// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
64	// iconv for all platforms where nl_types.h and langinfo.h (nl_langinfo) are present
65	// along with iconv.
66	//
67	#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
68	#define USE_ICONV 1
69	#else
70	#define USE_STDCONV 1
71	#endif
72
73	static void
74	isolatin1_to_utf16(const char *input, PRUint32 inputLeft, PRUnichar *output, PRUint32 outputLeft)
75	{
76	while (inputLeft && outputLeft) {
77	output = (unsigned char) input;
78	(*input)++;
79	(*inputLeft)--;
80	(*output)++;
81	(*outputLeft)--;
82	}
83	}
84
85	static void
86	utf16_to_isolatin1(const PRUnichar *input, PRUint32 inputLeft, char *output, PRUint32 outputLeft)
87	{
88	while (inputLeft && outputLeft) {
89	output = (unsigned char) input;
90	(*input)++;
91	(*inputLeft)--;
92	(*output)++;
93	(*outputLeft)--;
94	}
95	}
96
97	//-----------------------------------------------------------------------------
98	// conversion using iconv
99	//-----------------------------------------------------------------------------
100	#if defined(USE_ICONV)
101	#include <nl_types.h> // CODESET
102	#include <langinfo.h> // nl_langinfo
103	#include <iconv.h> // iconv_open, iconv, iconv_close
104	#include <errno.h>
105
// Adapts the |const char **| input argument to whatever the platform's
// iconv() prototype expects: passed through unchanged when iconv takes a
// const input pointer, otherwise cast to |char **|.  The macro argument is
// parenthesized so that an expression argument is cast as a whole.
#if defined(HAVE_ICONV_WITH_CONST_INPUT)
#define ICONV_INPUT(x) (x)
#else
#define ICONV_INPUT(x) ((char **)(x))
#endif
111
112	// solaris definitely needs this, but we'll enable it by default
113	// just in case... but we know for sure that iconv(3) in glibc
114	// doesn't need this.
115	#if !defined(__GLIBC__)
116	#define ENABLE_UTF8_FALLBACK_SUPPORT
117	#endif
118
119	#define INVALID_ICONV_T ((iconv_t) -1)
120
121	static inline size_t
122	xp_iconv(iconv_t converter,
123	const char **input,
124	size_t *inputLeft,
125	char **output,
126	size_t *outputLeft)
127	{
128	size_t res, outputAvail = outputLeft ? *outputLeft : 0;
129	res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
130	if (res == (size_t) -1) {
131	// on some platforms (e.g., linux) iconv will fail with
132	// E2BIG if it cannot convert _all_ of its input. it'll
133	// still adjust all of the in/out params correctly, so we
134	// can ignore this error. the assumption is that we will
135	// be called again to complete the conversion.
136	if ((errno == E2BIG) && (*outputLeft < outputAvail))
137	res = 0;
138	}
139	return res;
140	}
141
142	static inline void
143	xp_iconv_reset(iconv_t converter)
144	{
145	// NOTE: the man pages on Solaris claim that you can pass NULL
146	// for all parameter to reset the converter, but beware the
147	// evil Solaris crash if you go down this route >:-)
148
149	const char *zero_char_in_ptr = NULL;
150	char *zero_char_out_ptr = NULL;
151	size_t zero_size_in = 0,
152	zero_size_out = 0;
153
154	xp_iconv(converter, &zero_char_in_ptr,
155	&zero_size_in,
156	&zero_char_out_ptr,
157	&zero_size_out);
158	}
159
160	static inline iconv_t
161	xp_iconv_open(const char to_list, const char from_list)
162	{
163	iconv_t res;
164	const char **from_name;
165	const char **to_name;
166
167	// try all possible combinations to locate a converter.
168	to_name = to_list;
169	while (*to_name) {
170	if (**to_name) {
171	from_name = from_list;
172	while (*from_name) {
173	if (**from_name) {
174	res = iconv_open(to_name, from_name);
175	if (res != INVALID_ICONV_T)
176	return res;
177	}
178	from_name++;
179	}
180	}
181	to_name++;
182	}
183
184	return INVALID_ICONV_T;
185	}
186
/*
 * Candidate iconv(3) names for the UTF-16 side of a conversion, in order of
 * preference.
 *
 * PRUnichar[] holds a UTF-16 string, NOT a UCS-2 array, so UTF-16 has to be
 * used with iconv(3) wherever it is supported.  How UTF-16 and UCS-2 are
 * interpreted differs between platforms and iconv(3) implementations (on
 * Tru64 it additionally depends on the environment variable).  To stay clear
 * of byte-swapping trouble (bug 208809) the explicit UTF-16LE/BE and
 * UCS-2LE/BE names are tried before plain UTF-16, UCS-2 and their spelling
 * variants.  Systems lacking the LE/BE names are assumed to use native
 * endianness for UTF-16 and UCS-2; glibc 2.1.x does not, which is why
 * 'UNICODELITTLE' / 'UNICODEBIG' are listed for it.  The assumption also
 * fails on Tru64 V4 when ICONV_BYTEORDER is set to 'big-endian'; nothing can
 * be done about that beyond a release note (bug 206811).
 */
static const char *UTF_16_NAMES[] = {
#if !defined(IS_LITTLE_ENDIAN)
    "UTF-16BE",
# if defined(__GLIBC__)
    "UNICODEBIG",
# endif
    "UCS-2BE",
#else
    "UTF-16LE",
# if defined(__GLIBC__)
    "UNICODELITTLE",
# endif
    "UCS-2LE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    NULL
};
224
#ifdef ENABLE_UTF8_FALLBACK_SUPPORT
// Spellings of "UTF-8" to try with iconv_open(); NULL-terminated.  Only
// built when the UTF-8 fallback path is compiled in.
static const char *UTF_8_NAMES[] = {
    "UTF-8", "UTF8", "UTF_8",
    "utf-8", "utf8", "utf_8",
    NULL
};
#endif
236
// Spellings of "ISO-8859-1" to try with iconv_open(); NULL-terminated.
// Only the canonical spelling is listed when building against glibc.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#ifndef __GLIBC__
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    NULL
};
252
253	class nsNativeCharsetConverter
254	{
255	public:
256	nsNativeCharsetConverter();
257	~nsNativeCharsetConverter();
258
259	nsresult NativeToUnicode(const char *input , PRUint32 inputLeft,
260	PRUnichar *output, PRUint32 outputLeft);
261	nsresult UnicodeToNative(const PRUnichar *input , PRUint32 inputLeft,
262	char *output, PRUint32 outputLeft);
263
264	static void GlobalInit();
265	static void GlobalShutdown();
266
267	private:
268	static iconv_t gNativeToUnicode;
269	static iconv_t gUnicodeToNative;
270	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
271	static iconv_t gNativeToUTF8;
272	static iconv_t gUTF8ToNative;
273	static iconv_t gUnicodeToUTF8;
274	static iconv_t gUTF8ToUnicode;
275	#endif
276	static RTSEMFASTMUTEX gLock;
277	static PRBool gInitialized;
278
279	static void LazyInit();
280
281	static void Lock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRequest(gLock); }
282	static void Unlock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRelease(gLock); }
283	};
284
285	iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
286	iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
287	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
288	iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
289	iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
290	iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
291	iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
292	#endif
293	RTSEMFASTMUTEX nsNativeCharsetConverter::gLock = NIL_RTSEMFASTMUTEX;
294	PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
295
296	void
297	nsNativeCharsetConverter::LazyInit()
298	{
299	const char *blank_list[] = { "", NULL };
300	const char **native_charset_list = blank_list;
301	const char *native_charset = nl_langinfo(CODESET);
302	if (native_charset == nsnull) {
303	NS_ERROR("native charset is unknown");
304	// fallback to ISO-8859-1
305	native_charset_list = ISO_8859_1_NAMES;
306	}
307	else
308	native_charset_list[0] = native_charset;
309
310	gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
311	gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
312
313	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
314	if (gNativeToUnicode == INVALID_ICONV_T) {
315	gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
316	gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
317	NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
318	NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
319	}
320	if (gUnicodeToNative == INVALID_ICONV_T) {
321	gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
322	gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
323	NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
324	NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
325	}
326	#else
327	NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
328	NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
329	#endif
330
331	/*
332	* On Solaris 8 (and newer?), the iconv modules converting to UCS-2
333	* prepend a byte order mark unicode character (BOM, u+FEFF) during
334	* the first use of the iconv converter. The same is the case of
335	* glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
336	* However, we use 'UTF-16LE/BE' in both cases, instead so that we
337	* should be safe. But just in case...
338	*
339	* This dummy conversion gets rid of the BOMs and fixes bug 153562.
340	*/
341	char dummy_input[1] = { ' ' };
342	char dummy_output[4];
343
344	if (gNativeToUnicode != INVALID_ICONV_T) {
345	const char *input = dummy_input;
346	size_t input_left = sizeof(dummy_input);
347	char *output = dummy_output;
348	size_t output_left = sizeof(dummy_output);
349
350	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
351	}
352	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
353	if (gUTF8ToUnicode != INVALID_ICONV_T) {
354	const char *input = dummy_input;
355	size_t input_left = sizeof(dummy_input);
356	char *output = dummy_output;
357	size_t output_left = sizeof(dummy_output);
358
359	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
360	}
361	#endif
362
363	gInitialized = PR_TRUE;
364	}
365
366	void
367	nsNativeCharsetConverter::GlobalInit()
368	{
369	int vrc = RTSemFastMutexCreate(&gLock);
370	NS_ASSERTION(RT_SUCCESS(vrc), "lock creation failed");
371	}
372
373	void
374	nsNativeCharsetConverter::GlobalShutdown()
375	{
376	if (gLock != NIL_RTSEMFASTMUTEX) {
377	RTSemFastMutexDestroy(gLock);
378	gLock = NIL_RTSEMFASTMUTEX;
379	}
380
381	if (gNativeToUnicode != INVALID_ICONV_T) {
382	iconv_close(gNativeToUnicode);
383	gNativeToUnicode = INVALID_ICONV_T;
384	}
385
386	if (gUnicodeToNative != INVALID_ICONV_T) {
387	iconv_close(gUnicodeToNative);
388	gUnicodeToNative = INVALID_ICONV_T;
389	}
390
391	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
392	if (gNativeToUTF8 != INVALID_ICONV_T) {
393	iconv_close(gNativeToUTF8);
394	gNativeToUTF8 = INVALID_ICONV_T;
395	}
396	if (gUTF8ToNative != INVALID_ICONV_T) {
397	iconv_close(gUTF8ToNative);
398	gUTF8ToNative = INVALID_ICONV_T;
399	}
400	if (gUnicodeToUTF8 != INVALID_ICONV_T) {
401	iconv_close(gUnicodeToUTF8);
402	gUnicodeToUTF8 = INVALID_ICONV_T;
403	}
404	if (gUTF8ToUnicode != INVALID_ICONV_T) {
405	iconv_close(gUTF8ToUnicode);
406	gUTF8ToUnicode = INVALID_ICONV_T;
407	}
408	#endif
409
410	gInitialized = PR_FALSE;
411	}
412
413	nsNativeCharsetConverter::nsNativeCharsetConverter()
414	{
415	Lock();
416	if (!gInitialized)
417	LazyInit();
418	}
419
420	nsNativeCharsetConverter::~nsNativeCharsetConverter()
421	{
422	// reset converters for next time
423	if (gNativeToUnicode != INVALID_ICONV_T)
424	xp_iconv_reset(gNativeToUnicode);
425	if (gUnicodeToNative != INVALID_ICONV_T)
426	xp_iconv_reset(gUnicodeToNative);
427	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
428	if (gNativeToUTF8 != INVALID_ICONV_T)
429	xp_iconv_reset(gNativeToUTF8);
430	if (gUTF8ToNative != INVALID_ICONV_T)
431	xp_iconv_reset(gUTF8ToNative);
432	if (gUnicodeToUTF8 != INVALID_ICONV_T)
433	xp_iconv_reset(gUnicodeToUTF8);
434	if (gUTF8ToUnicode != INVALID_ICONV_T)
435	xp_iconv_reset(gUTF8ToUnicode);
436	#endif
437	Unlock();
438	}
439
440	nsresult
441	nsNativeCharsetConverter::NativeToUnicode(const char **input,
442	PRUint32 *inputLeft,
443	PRUnichar **output,
444	PRUint32 *outputLeft)
445	{
446	size_t res = 0;
447	size_t inLeft = (size_t) *inputLeft;
448	size_t outLeft = (size_t) outputLeft 2;
449
450	if (gNativeToUnicode != INVALID_ICONV_T) {
451
452	res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
453
454	*inputLeft = inLeft;
455	*outputLeft = outLeft / 2;
456	if (res != (size_t) -1)
457	return NS_OK;
458
459	NS_WARNING("conversion from native to utf-16 failed");
460
461	// reset converter
462	xp_iconv_reset(gNativeToUnicode);
463	}
464	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
465	else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
466	(gUTF8ToUnicode != INVALID_ICONV_T)) {
467	// convert first to UTF8, then from UTF8 to UCS2
468	const char in = input;
469
470	char ubuf[1024];
471
472	// we assume we're always called with enough space in \|output\|,
473	// so convert many chars at a time...
474	while (inLeft) {
475	char *p = ubuf;
476	size_t n = sizeof(ubuf);
477	res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
478	if (res == (size_t) -1) {
479	NS_ERROR("conversion from native to utf-8 failed");
480	break;
481	}
482	NS_ASSERTION(outLeft > 0, "bad assumption");
483	p = ubuf;
484	n = sizeof(ubuf) - n;
485	res = xp_iconv(gUTF8ToUnicode, (const char ) &p, &n, (char ) output, &outLeft);
486	if (res == (size_t) -1) {
487	NS_ERROR("conversion from utf-8 to utf-16 failed");
488	break;
489	}
490	}
491
492	(input) += (inputLeft - inLeft);
493	*inputLeft = inLeft;
494	*outputLeft = outLeft / 2;
495
496	if (res != (size_t) -1)
497	return NS_OK;
498
499	// reset converters
500	xp_iconv_reset(gNativeToUTF8);
501	xp_iconv_reset(gUTF8ToUnicode);
502	}
503	#endif
504
505	// fallback: zero-pad and hope for the best
506	// XXX This is lame and we have to do better.
507	isolatin1_to_utf16(input, inputLeft, output, outputLeft);
508
509	return NS_OK;
510	}
511
512	nsresult
513	nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
514	PRUint32 *inputLeft,
515	char **output,
516	PRUint32 *outputLeft)
517	{
518	size_t res = 0;
519	size_t inLeft = (size_t) inputLeft 2;
520	size_t outLeft = (size_t) *outputLeft;
521
522	if (gUnicodeToNative != INVALID_ICONV_T) {
523	res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
524
525	if (res != (size_t) -1) {
526	*inputLeft = inLeft / 2;
527	*outputLeft = outLeft;
528	return NS_OK;
529	}
530
531	NS_ERROR("iconv failed");
532
533	// reset converter
534	xp_iconv_reset(gUnicodeToNative);
535	}
536	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
537	else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
538	(gUTF8ToNative != INVALID_ICONV_T)) {
539	const char in = (const char ) *input;
540
541	char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
542
543	// convert one uchar at a time...
544	while (inLeft && outLeft) {
545	char *p = ubuf;
546	size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
547	res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
548	if (res == (size_t) -1) {
549	NS_ERROR("conversion from utf-16 to utf-8 failed");
550	break;
551	}
552	p = ubuf;
553	n = sizeof(ubuf) - n;
554	res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
555	if (res == (size_t) -1) {
556	if (errno == E2BIG) {
557	// not enough room for last uchar... back up and return.
558	in -= sizeof(PRUnichar);
559	res = 0;
560	}
561	else
562	NS_ERROR("conversion from utf-8 to native failed");
563	break;
564	}
565	inLeft -= sizeof(PRUnichar);
566	}
567
568	if (res != (size_t) -1) {
569	(input) += (inputLeft - inLeft/2);
570	*inputLeft = inLeft/2;
571	*outputLeft = outLeft;
572	return NS_OK;
573	}
574
575	// reset converters
576	xp_iconv_reset(gUnicodeToUTF8);
577	xp_iconv_reset(gUTF8ToNative);
578	}
579	#endif
580
581	// fallback: truncate and hope for the best
582	utf16_to_isolatin1(input, inputLeft, output, outputLeft);
583
584	return NS_OK;
585	}
586
587	#endif // USE_ICONV
588
589	//-----------------------------------------------------------------------------
590	// conversion using mb[r]towc/wc[r]tomb
591	//-----------------------------------------------------------------------------
592	#if defined(USE_STDCONV)
// <wchar.h> is only needed when a restartable conversion function is used.
#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
#include <wchar.h>        // mbrtowc, wcrtomb
#endif
596
597	class nsNativeCharsetConverter
598	{
599	public:
600	nsNativeCharsetConverter();
601
602	nsresult NativeToUnicode(const char *input , PRUint32 inputLeft,
603	PRUnichar *output, PRUint32 outputLeft);
604	nsresult UnicodeToNative(const PRUnichar *input , PRUint32 inputLeft,
605	char *output, PRUint32 outputLeft);
606
607	static void GlobalInit();
608	static void GlobalShutdown() { }
609
610	private:
611	static PRBool gWCharIsUnicode;
612
613	#if defined(HAVE_WCRTOMB) \|\| defined(HAVE_MBRTOWC)
614	mbstate_t ps;
615	#endif
616	};
617
618	PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
619
620	nsNativeCharsetConverter::nsNativeCharsetConverter()
621	{
622	#if defined(HAVE_WCRTOMB) \|\| defined(HAVE_MBRTOWC)
623	memset(&ps, 0, sizeof(ps));
624	#endif
625	}
626
627	void
628	nsNativeCharsetConverter::GlobalInit()
629	{
630	// verify that wchar_t for the current locale is actually unicode.
631	// if it is not, then we should avoid calling mbtowc/wctomb and
632	// just fallback on zero-pad/truncation conversion.
633	//
634	// this test cannot be done at build time because the encoding of
635	// wchar_t may depend on the runtime locale. sad, but true!!
636	//
637	// so, if wchar_t is unicode then converting an ASCII character
638	// to wchar_t should not change its numeric value. we'll just
639	// check what happens with the ASCII 'a' character.
640	//
641	// this test is not perfect... obviously, it could yield false
642	// positives, but then at least ASCII text would be converted
643	// properly (or maybe just the 'a' character) -- oh well :(
644
645	char a = 'a';
646	unsigned int w = 0;
647
648	int res = mbtowc((wchar_t *) &w, &a, 1);
649
650	gWCharIsUnicode = (res != -1 && w == 'a');
651
652	#ifdef DEBUG
653	if (!gWCharIsUnicode)
654	NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
655	#endif
656	}
657
658	nsresult
659	nsNativeCharsetConverter::NativeToUnicode(const char **input,
660	PRUint32 *inputLeft,
661	PRUnichar **output,
662	PRUint32 *outputLeft)
663	{
664	if (gWCharIsUnicode) {
665	int incr;
666
667	// cannot use wchar_t here since it may have been redefined (e.g.,
668	// via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
669	unsigned int tmp = 0;
670	while (inputLeft && outputLeft) {
671	#ifdef HAVE_MBRTOWC
672	incr = (int) mbrtowc((wchar_t ) &tmp, input, *inputLeft, &ps);
673	#else
674	// XXX is this thread-safe?
675	incr = (int) mbtowc((wchar_t ) &tmp, input, *inputLeft);
676	#endif
677	if (incr < 0) {
678	NS_WARNING("mbtowc failed: possible charset mismatch");
679	// zero-pad and hope for the best
680	tmp = (unsigned char) **input;
681	incr = 1;
682	}
683	**output = (PRUnichar) tmp;
684	(*input) += incr;
685	(*inputLeft) -= incr;
686	(*output)++;
687	(*outputLeft)--;
688	}
689	}
690	else {
691	// wchar_t isn't unicode, so the best we can do is treat the
692	// input as if it is isolatin1 :(
693	isolatin1_to_utf16(input, inputLeft, output, outputLeft);
694	}
695
696	return NS_OK;
697	}
698
699	nsresult
700	nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
701	PRUint32 *inputLeft,
702	char **output,
703	PRUint32 *outputLeft)
704	{
705	if (gWCharIsUnicode) {
706	int incr;
707
708	/* MB_CUR_MAX better be positive. */
709	while (inputLeft && outputLeft >= (PRUint32)MB_CUR_MAX) {
710	#ifdef HAVE_WCRTOMB
711	incr = (int) wcrtomb(output, (wchar_t) *input, &ps);
712	#else
713	// XXX is this thread-safe?
714	incr = (int) wctomb(output, (wchar_t) *input);
715	#endif
716	if (incr < 0) {
717	NS_WARNING("mbtowc failed: possible charset mismatch");
718	output = (unsigned char) input; // truncate
719	incr = 1;
720	}
721	// most likely we're dead anyways if this assertion should fire
722	NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
723	(*output) += incr;
724	(*outputLeft) -= incr;
725	(*input)++;
726	(*inputLeft)--;
727	}
728	}
729	else {
730	// wchar_t isn't unicode, so the best we can do is treat the
731	// input as if it is isolatin1 :(
732	utf16_to_isolatin1(input, inputLeft, output, outputLeft);
733	}
734
735	return NS_OK;
736	}
737
738	#endif // USE_STDCONV
739
740	//-----------------------------------------------------------------------------
741	// API implementation
742	//-----------------------------------------------------------------------------
743
744	NS_COM nsresult
745	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
746	{
747	output.Truncate();
748
749	PRUint32 inputLen = input.Length();
750
751	nsACString::const_iterator iter;
752	input.BeginReading(iter);
753
754	//
755	// OPTIMIZATION: preallocate space for largest possible result; convert
756	// directly into the result buffer to avoid intermediate buffer copy.
757	//
758	// this will generally result in a larger allocation, but that seems
759	// better than an extra buffer copy.
760	//
761	output.SetLength(inputLen);
762	nsAString::iterator out_iter;
763	output.BeginWriting(out_iter);
764
765	PRUnichar *result = out_iter.get();
766	PRUint32 resultLeft = inputLen;
767
768	const char *buf = iter.get();
769	PRUint32 bufLeft = inputLen;
770
771	nsNativeCharsetConverter conv;
772	nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
773	if (NS_SUCCEEDED(rv)) {
774	NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
775	output.SetLength(inputLen - resultLeft);
776	}
777	return rv;
778	}
779
780	NS_COM nsresult
781	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
782	{
783	output.Truncate();
784
785	nsAString::const_iterator iter, end;
786	input.BeginReading(iter);
787	input.EndReading(end);
788
789	// cannot easily avoid intermediate buffer copy.
790	char temp[4096];
791
792	nsNativeCharsetConverter conv;
793
794	const PRUnichar *buf = iter.get();
795	PRUint32 bufLeft = Distance(iter, end);
796	while (bufLeft) {
797	char *p = temp;
798	PRUint32 tempLeft = sizeof(temp);
799
800	nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
801	if (NS_FAILED(rv)) return rv;
802
803	if (tempLeft < sizeof(temp))
804	output.Append(temp, sizeof(temp) - tempLeft);
805	}
806	return NS_OK;
807	}
808
809	void
810	NS_StartupNativeCharsetUtils()
811	{
812	//
813	// need to initialize the locale or else charset conversion will fail.
814	// better not delay this in case some other component alters the locale
815	// settings.
816	//
817	// XXX we assume that we are called early enough that we should
818	// always be the first to care about the locale's charset.
819	//
820	setlocale(LC_CTYPE, "");
821
822	nsNativeCharsetConverter::GlobalInit();
823	}
824
825	void
826	NS_ShutdownNativeCharsetUtils()
827	{
828	nsNativeCharsetConverter::GlobalShutdown();
829	}
830
831	//-----------------------------------------------------------------------------
832	// default : truncate/zeropad
833	//-----------------------------------------------------------------------------
834	#else
835
836	#include "nsReadableUtils.h"
837
838	NS_COM nsresult
839	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
840	{
841	CopyASCIItoUCS2(input, output);
842	return NS_OK;
843	}
844
845	NS_COM nsresult
846	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
847	{
848	CopyUCS2toASCII(input, output);
849	return NS_OK;
850	}
851
/** Default (non-XP_UNIX) implementation: does nothing. */
void
NS_StartupNativeCharsetUtils()
{
    // nothing to set up
}
856
/** Default (non-XP_UNIX) implementation: does nothing. */
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to tear down
}
861
862	#endif
863

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/xpcom18a4/xpcom/io/nsNativeCharsetUtils.cpp

Download in other formats: