VirtualBox

source: vbox/trunk/include/iprt/uni.h@ 94480

Last change on this file since 94480 was 93115, checked in by vboxsync, 3 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1/** @file
2 * IPRT - Unicode Code Points.
3 */
4
5/*
6 * Copyright (C) 2006-2022 Oracle Corporation
7 *
8 * This file is part of VirtualBox Open Source Edition (OSE), as
9 * available from http://www.virtualbox.org. This file is free software;
10 * you can redistribute it and/or modify it under the terms of the GNU
11 * General Public License (GPL) as published by the Free Software
12 * Foundation, in version 2 as it comes in the "COPYING" file of the
13 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15 *
16 * The contents of this file may alternatively be used under the terms
17 * of the Common Development and Distribution License Version 1.0
18 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19 * VirtualBox OSE distribution, in which case the provisions of the
20 * CDDL are applicable instead of those of the GPL.
21 *
22 * You may elect to license modified versions of this file under the
23 * terms and conditions of either the GPL or the CDDL or both.
24 */
25
26#ifndef IPRT_INCLUDED_uni_h
27#define IPRT_INCLUDED_uni_h
28#ifndef RT_WITHOUT_PRAGMA_ONCE
29# pragma once
30#endif
31
32/** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
33 * @ingroup grp_rt
34 * @{
35 */
36
37/** @def RTUNI_USE_WCTYPE
38 * Define RTUNI_USE_WCTYPE to not use the IPRT unicode data but the
39 * data which the C runtime library provides. */
40#ifdef DOXYGEN_RUNNING
41# define RTUNI_USE_WCTYPE
42#endif
43
44#include <iprt/types.h>
45#ifdef RTUNI_USE_WCTYPE
46# include <wctype.h>
47#endif
48
49RT_C_DECLS_BEGIN
50
51
52#ifndef RTUNI_USE_WCTYPE
53
54/**
55 * A unicode flags range.
56 * @internal
57 */
58typedef struct RTUNIFLAGSRANGE
59{
60 /** The first code point of the range. */
61 RTUNICP BeginCP;
62 /** The last + 1 code point of the range. */
63 RTUNICP EndCP;
64 /** Pointer to the array of case folded code points. */
65 const uint8_t *pafFlags;
66} RTUNIFLAGSRANGE;
67/** Pointer to a flags range.
68 * @internal */
69typedef RTUNIFLAGSRANGE *PRTUNIFLAGSRANGE;
70/** Pointer to a const flags range.
71 * @internal */
72typedef const RTUNIFLAGSRANGE *PCRTUNIFLAGSRANGE;
73
74/**
75 * A unicode case folded range.
76 * @internal
77 */
78typedef struct RTUNICASERANGE
79{
80 /** The first code point of the range. */
81 RTUNICP BeginCP;
82 /** The last + 1 code point of the range. */
83 RTUNICP EndCP;
84 /** Pointer to the array of case folded code points. */
85 PCRTUNICP paFoldedCPs;
86} RTUNICASERANGE;
87/** Pointer to a case folded range.
88 * @internal */
89typedef RTUNICASERANGE *PRTUNICASERANGE;
90/** Pointer to a const case folded range.
91 * @internal */
92typedef const RTUNICASERANGE *PCRTUNICASERANGE;
93
/** @name Unicode Code Point Flags.
 * @internal
 * @{ */
/** Upper case code point (see RTUniCpIsUpper). */
#define RTUNI_UPPER     RT_BIT(0)
/** Lower case code point (see RTUniCpIsLower). */
#define RTUNI_LOWER     RT_BIT(1)
/** Alphabetic code point (see RTUniCpIsAlphabetic). */
#define RTUNI_ALPHA     RT_BIT(2)
/** Hexadecimal digit (see RTUniCpIsHexDigit). */
#define RTUNI_XDIGIT    RT_BIT(3)
/** Decimal digit (see RTUniCpIsDecDigit). */
#define RTUNI_DDIGIT    RT_BIT(4)
/** White space (see RTUniCpIsSpace). */
#define RTUNI_WSPACE    RT_BIT(5)
/*#define RTUNI_BSPACE  RT_BIT(6) - later */
/** When set, the codepoint requires further checking wrt NFC and NFD
 * normalization.  I.e. set when either of QC_NFD and QC_NFC are not Y. */
#define RTUNI_QC_NFX    RT_BIT(7)
/** @} */
108
109
110/**
111 * Array of flags ranges.
112 * @internal
113 */
114extern RTDATADECL(const RTUNIFLAGSRANGE) g_aRTUniFlagsRanges[];
115
116/**
117 * Gets the flags for a unicode code point.
118 *
119 * @returns The flag mask. (RTUNI_*)
120 * @param CodePoint The unicode code point.
121 * @internal
122 */
123DECLINLINE(RTUNICP) rtUniCpFlags(RTUNICP CodePoint)
124{
125 PCRTUNIFLAGSRANGE pCur = &g_aRTUniFlagsRanges[0];
126 do
127 {
128 if (pCur->EndCP > CodePoint)
129 {
130 if (pCur->BeginCP <= CodePoint)
131 return pCur->pafFlags[CodePoint - pCur->BeginCP];
132 break;
133 }
134 pCur++;
135 } while (pCur->EndCP != RTUNICP_MAX);
136 return 0;
137}
138
139
140/**
141 * Checks if a unicode code point is upper case.
142 *
143 * @returns true if it is.
144 * @returns false if it isn't.
145 * @param CodePoint The code point.
146 */
147DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
148{
149 return (rtUniCpFlags(CodePoint) & RTUNI_UPPER) != 0;
150}
151
152
153/**
154 * Checks if a unicode code point is lower case.
155 *
156 * @returns true if it is.
157 * @returns false if it isn't.
158 * @param CodePoint The code point.
159 */
160DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
161{
162 return (rtUniCpFlags(CodePoint) & RTUNI_LOWER) != 0;
163}
164
165
166/**
167 * Checks if a unicode code point is case foldable.
168 *
169 * @returns true if it is.
170 * @returns false if it isn't.
171 * @param CodePoint The code point.
172 */
173DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
174{
175 /* Right enough. */
176 return (rtUniCpFlags(CodePoint) & (RTUNI_LOWER | RTUNI_UPPER)) != 0;
177}
178
179
180/**
181 * Checks if a unicode code point is alphabetic.
182 *
183 * @returns true if it is.
184 * @returns false if it isn't.
185 * @param CodePoint The code point.
186 */
187DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
188{
189 return (rtUniCpFlags(CodePoint) & RTUNI_ALPHA) != 0;
190}
191
192
193/**
194 * Checks if a unicode code point is a decimal digit.
195 *
196 * @returns true if it is.
197 * @returns false if it isn't.
198 * @param CodePoint The code point.
199 */
200DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
201{
202 return (rtUniCpFlags(CodePoint) & RTUNI_DDIGIT) != 0;
203}
204
205
206/**
207 * Checks if a unicode code point is a hexadecimal digit.
208 *
209 * @returns true if it is.
210 * @returns false if it isn't.
211 * @param CodePoint The code point.
212 */
213DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
214{
215 return (rtUniCpFlags(CodePoint) & RTUNI_XDIGIT) != 0;
216}
217
218
219/**
220 * Checks if a unicode code point is white space.
221 *
222 * @returns true if it is.
223 * @returns false if it isn't.
224 * @param CodePoint The code point.
225 */
226DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
227{
228 return (rtUniCpFlags(CodePoint) & RTUNI_WSPACE) != 0;
229}
230
231
232
233/**
234 * Array of uppercase ranges.
235 * @internal
236 */
237extern RTDATADECL(const RTUNICASERANGE) g_aRTUniUpperRanges[];
238
239/**
240 * Array of lowercase ranges.
241 * @internal
242 */
243extern RTDATADECL(const RTUNICASERANGE) g_aRTUniLowerRanges[];
244
245
246/**
247 * Folds a unicode code point using the specified range array.
248 *
249 * @returns FOlded code point.
250 * @param CodePoint The unicode code point to fold.
251 * @param pCur The case folding range to use.
252 */
253DECLINLINE(RTUNICP) rtUniCpFold(RTUNICP CodePoint, PCRTUNICASERANGE pCur)
254{
255 do
256 {
257 if (pCur->EndCP > CodePoint)
258 {
259 if (pCur->BeginCP <= CodePoint)
260 CodePoint = pCur->paFoldedCPs[CodePoint - pCur->BeginCP];
261 break;
262 }
263 pCur++;
264 } while (pCur->EndCP != RTUNICP_MAX);
265 return CodePoint;
266}
267
268
269/**
270 * Folds a unicode code point to upper case.
271 *
272 * @returns Folded code point.
273 * @param CodePoint The unicode code point to fold.
274 */
275DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
276{
277 return rtUniCpFold(CodePoint, &g_aRTUniUpperRanges[0]);
278}
279
280
281/**
282 * Folds a unicode code point to lower case.
283 *
284 * @returns Folded code point.
285 * @param CodePoint The unicode code point to fold.
286 */
287DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
288{
289 return rtUniCpFold(CodePoint, &g_aRTUniLowerRanges[0]);
290}
291
292
293#else /* RTUNI_USE_WCTYPE */
294
295
296/**
297 * Checks if a unicode code point is upper case.
298 *
299 * @returns true if it is.
300 * @returns false if it isn't.
301 * @param CodePoint The code point.
302 */
303DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
304{
305 return !!iswupper(CodePoint);
306}
307
308
309/**
310 * Checks if a unicode code point is lower case.
311 *
312 * @returns true if it is.
313 * @returns false if it isn't.
314 * @param CodePoint The code point.
315 */
316DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
317{
318 return !!iswlower(CodePoint);
319}
320
321
322/**
323 * Checks if a unicode code point is case foldable.
324 *
325 * @returns true if it is.
326 * @returns false if it isn't.
327 * @param CodePoint The code point.
328 */
329DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
330{
331 /* Right enough. */
332 return iswupper(CodePoint) || iswlower(CodePoint);
333}
334
335
336/**
337 * Checks if a unicode code point is alphabetic.
338 *
339 * @returns true if it is.
340 * @returns false if it isn't.
341 * @param CodePoint The code point.
342 */
343DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
344{
345 return !!iswalpha(CodePoint);
346}
347
348
349/**
350 * Checks if a unicode code point is a decimal digit.
351 *
352 * @returns true if it is.
353 * @returns false if it isn't.
354 * @param CodePoint The code point.
355 */
356DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
357{
358 return !!iswdigit(CodePoint);
359}
360
361
362/**
363 * Checks if a unicode code point is a hexadecimal digit.
364 *
365 * @returns true if it is.
366 * @returns false if it isn't.
367 * @param CodePoint The code point.
368 */
369DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
370{
371 return !!iswxdigit(CodePoint);
372}
373
374
375/**
376 * Checks if a unicode code point is white space.
377 *
378 * @returns true if it is.
379 * @returns false if it isn't.
380 * @param CodePoint The code point.
381 */
382DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
383{
384 return !!iswspace(CodePoint);
385}
386
387
388/**
389 * Folds a unicode code point to upper case.
390 *
391 * @returns Folded code point.
392 * @param CodePoint The unicode code point to fold.
393 */
394DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
395{
396 return towupper(CodePoint);
397}
398
399
400/**
401 * Folds a unicode code point to lower case.
402 *
403 * @returns Folded code point.
404 * @param CodePoint The unicode code point to fold.
405 */
406DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
407{
408 return towlower(CodePoint);
409}
410
411
412#endif /* RTUNI_USE_WCTYPE */
413
414
415/**
416 * Frees a unicode string.
417 *
418 * @param pusz The string to free.
419 */
420RTDECL(void) RTUniFree(PRTUNICP pusz);
421
422
423/**
424 * Checks if a code point valid.
425 *
426 * Any code point (defined or not) within the 17 unicode planes (0 thru 16),
427 * except surrogates will be considered valid code points by this function.
428 *
429 * @returns true if in range, false if not.
430 * @param CodePoint The unicode code point to validate.
431 */
432DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint)
433{
434 return CodePoint <= 0x00d7ff
435 || ( CodePoint <= 0x10ffff
436 && CodePoint >= 0x00e000);
437}
438
439
440/**
441 * Checks if the given code point is in the BMP range.
442 *
443 * Surrogates are not considered in the BMP range by this function.
444 *
445 * @returns true if in BMP, false if not.
446 * @param CodePoint The unicode code point to consider.
447 */
448DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint)
449{
450 return CodePoint <= 0xd7ff
451 || ( CodePoint <= 0xffff
452 && CodePoint >= 0xe000);
453}
454
455
456/**
457 * Folds a unicode code point to lower case.
458 *
459 * @returns Folded code point.
460 * @param CodePoint The unicode code point to fold.
461 */
462DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint)
463{
464 if (CodePoint < 0x80)
465 return 1;
466 return 2
467 + (CodePoint >= 0x00000800)
468 + (CodePoint >= 0x00010000)
469 + (CodePoint >= 0x00200000)
470 + (CodePoint >= 0x04000000)
471 + (CodePoint >= 0x80000000) /* illegal */;
472}
473
474
475
476RT_C_DECLS_END
477/** @} */
478
479
480#endif /* !IPRT_INCLUDED_uni_h */
481
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette