VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/uniread.cpp@ 25721

Last change on this file since 25721 was 13836, checked in by vboxsync, 16 years ago

s/ELEMENTS/RT_ELEMENTS/g - retiring ELEMENTS (finally).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 26.3 KB
Line 
1/* $Id: uniread.cpp 13836 2008-11-05 02:42:54Z vboxsync $ */
2/** @file
3 * IPRT - Unicode Specification Reader.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31/*******************************************************************************
32* Header Files *
33*******************************************************************************/
34#include <iprt/types.h>
35#include <iprt/stdarg.h>
36
37#include <stdio.h>
38#include <string.h>
39#include <stdlib.h>
40
41
/**
 * Trims a line in place.
 *
 * Leading spaces and tabs are skipped, everything from the first '#' on is cut
 * off, and trailing blanks, tabs and line-break characters are removed.
 *
 * @returns Pointer to the first non-blank character of the (modified) line.
 * @param   pszLine     The line to strip.  The buffer is written to.
 */
static char *StripLine(char *pszLine)
{
    /* Leading spaces and tabs. */
    char *pszStart = pszLine;
    while (*pszStart == ' ' || *pszStart == '\t')
        pszStart++;

    /* The payload ends at the first '#', or at the terminator when there is none. */
    char *pszEnd = strchr(pszStart, '#');
    if (!pszEnd)
        pszEnd = pszStart + strlen(pszStart);
    else
        *pszEnd = '\0';

    /* Trailing whitespace, terminating the string one character earlier each round. */
    while (   pszEnd > pszStart
           && (   pszEnd[-1] == ' '
               || pszEnd[-1] == '\t'
               || pszEnd[-1] == '\n'
               || pszEnd[-1] == '\r'))
        *--pszEnd = '\0';

    return pszStart;
}
73
74
/**
 * Tells whether a line carries no data, i.e. it consists of whitespace only or
 * its first non-whitespace character starts a '#' comment.
 *
 * @returns true if the line should be skipped, false if it holds data.
 * @param   pszLine     The line to consider.  Not modified.
 */
static bool IsCommentOrBlankLine(const char *pszLine)
{
    const char *pszFirst = pszLine + strspn(pszLine, " \t\n\r");
    return *pszFirst == '\0' || *pszFirst == '#';
}
86
87
/**
 * Splits the first ';'-separated field off a line and trims it.
 *
 * The separator (if any) is overwritten with a terminator and surrounding
 * whitespace is removed from the field, so the line buffer is modified.
 *
 * @returns Pointer to the trimmed first field (may be an empty string).
 * @param   ppsz        Where to store the position following the field: just
 *                      after the ';', or the string terminator when there was
 *                      no separator.
 * @param   pszLine     The line string (could also be *ppsz from a previous
 *                      FirstField/NextField call).
 */
static char *FirstField(char **ppsz, char *pszLine)
{
    /* Cut the field off at the separator, or take the rest of the string. */
    char *pszEnd = strchr(pszLine, ';');
    if (pszEnd)
    {
        *pszEnd = '\0';
        *ppsz = pszEnd + 1;
    }
    else
    {
        pszEnd = pszLine + strlen(pszLine);
        *ppsz = pszEnd;
    }

    /* Trim leading whitespace ... */
    char *pszField = pszLine + strspn(pszLine, " \t\r\n");

    /* ... and trailing whitespace. */
    while (   pszEnd > pszField
           && (   pszEnd[-1] == ' '
               || pszEnd[-1] == '\t'
               || pszEnd[-1] == '\n'
               || pszEnd[-1] == '\r'))
        *--pszEnd = '\0';

    return pszField;
}
124
125
126/**
127 * Get the next field in a field enumeration.
128 *
129 * @returns Pointer to the next field.
130 * @param ppsz Where to get and store the string postition.
131 */
132static char *NextField(char **ppsz)
133{
134 return FirstField(ppsz, *ppsz);
135}
136
137
138/**
139 * Converts a code point field to a number.
140 * @returns Code point.
141 * @param psz The field string.
142 */
143static RTUNICP ToNum(const char *psz)
144{
145 char *pszEnd = NULL;
146 unsigned long ul = strtoul(psz, &pszEnd, 16);
147 if (pszEnd && *pszEnd)
148 fprintf(stderr, "warning: failed converting '%s' to a number!\n", psz);
149 return (RTUNICP)ul;
150}
151
152
153/**
154 * Same as ToNum except that if the field is empty the Default is returned.
155 */
156static RTUNICP ToNumDefault(const char *psz, RTUNICP Default)
157{
158 if (*psz)
159 return ToNum(psz);
160 return Default;
161}
162
163
164/**
165 * Converts a code point range to numbers.
166 * @returns The start code point.\
167 * @returns ~(RTUNICP)0 on failure.
168 * @param psz The field string.
169 * @param pLast Where to store the last code point in the range.
170 */
171static RTUNICP ToRange(const char *psz, PRTUNICP pLast)
172{
173 char *pszEnd = NULL;
174 unsigned long ulStart = strtoul(psz, &pszEnd, 16);
175 unsigned long ulLast = ulStart;
176 if (pszEnd && *pszEnd)
177 {
178 if (*pszEnd == '.')
179 {
180 while (*pszEnd == '.')
181 pszEnd++;
182 ulLast = strtoul(pszEnd, &pszEnd, 16);
183 if (pszEnd && *pszEnd)
184 {
185 fprintf(stderr, "warning: failed converting '%s' to a number!\n", psz);
186 return ~(RTUNICP)0;
187 }
188 }
189 else
190 {
191 fprintf(stderr, "warning: failed converting '%s' to a number!\n", psz);
192 return ~(RTUNICP)0;
193 }
194 }
195 *pLast = (RTUNICP)ulLast;
196 return (RTUNICP)ulStart;
197
198}
199
200
/**
 * Duplicates a string, sharing one static instance for all empty strings to
 * save memory.
 *
 * Does not return on allocation failure: prints a message to stderr and
 * terminates the process with exit code 1.
 *
 * @returns Pointer to the string copy.  The empty-string result is a string
 *          literal and must not be modified or freed.
 * @param   pszStr      The string to duplicate.
 */
static char *DupStr(const char *pszStr)
{
    if (*pszStr == '\0')
        return (char *)"";

    char *pszCopy = strdup(pszStr);
    if (!pszCopy)
    {
        fprintf(stderr, "out of memory!\n");
        exit(1);
    }
    return pszCopy;
}
218
219
220/**
221 * Array of all possible and impossible unicode code points as of 4.1
222 */
223struct CPINFO
224{
225 RTUNICP CodePoint;
226 RTUNICP SimpleUpperCaseMapping;
227 RTUNICP SimpleLowerCaseMapping;
228 RTUNICP SimpleTitleCaseMapping;
229 const char *pszName;
230 /** Set if this is an unused entry */
231 unsigned fNullEntry : 1;
232
233 unsigned fAlphabetic : 1;
234 unsigned fASCIIHexDigit : 1;
235 unsigned fBidiControl : 1;
236 unsigned fDash : 1;
237 unsigned fDefaultIgnorableCodePoint : 1;
238 unsigned fDeprecated : 1;
239 unsigned fDiacritic : 1;
240 unsigned fExtender : 1;
241 unsigned fGraphemeBase : 1;
242 unsigned fGraphemeExtend : 1;
243 unsigned fGraphemeLink : 1;
244 unsigned fHexDigit : 1;
245 unsigned fHyphen : 1;
246 unsigned fIDContinue : 1;
247 unsigned fIdeographic : 1;
248 unsigned fIDSBinaryOperator : 1;
249 unsigned fIDStart : 1;
250 unsigned fIDSTrinaryOperator : 1;
251 unsigned fJoinControl : 1;
252 unsigned fLogicalOrderException : 1;
253 unsigned fLowercase : 1;
254 unsigned fMath : 1;
255 unsigned fNoncharacterCodePoint : 1;
256 unsigned fOtherAlphabetic : 1;
257 unsigned fOtherDefaultIgnorableCodePoint : 1;
258 unsigned fOtherGraphemeExtend : 1;
259 unsigned fOtherIDContinue : 1;
260 unsigned fOtherIDStart : 1;
261 unsigned fOtherLowercase : 1;
262 unsigned fOtherMath : 1;
263 unsigned fOtherUppercase : 1;
264 unsigned fPatternSyntax : 1;
265 unsigned fPatternWhiteSpace : 1;
266 unsigned fQuotationMark : 1;
267 unsigned fRadical : 1;
268 unsigned fSoftDotted : 1;
269 unsigned fSTerm : 1;
270 unsigned fTerminalPunctuation : 1;
271 unsigned fUnifiedIdeograph : 1;
272 unsigned fUppercase : 1;
273 unsigned fVariationSelector : 1;
274 unsigned fWhiteSpace : 1;
275 unsigned fXIDContinue : 1;
276 unsigned fXIDStart : 1;
277
278 /* unprocess stuff, so far. */
279 const char *pszGeneralCategory;
280 const char *pszCanonicalCombiningClass;
281 const char *pszBidiClass;
282 const char *pszDecompositionType;
283 const char *pszDecompositionMapping;
284 const char *pszNumericType;
285 const char *pszNumericValue;
286 const char *pszBidiMirrored;
287 const char *pszUnicode1Name;
288 const char *pszISOComment;
289} g_aCPInfo[0xf0000];
290
291
292/**
293 * Creates a 'null' entry at i.
294 * @param i The entry in question.
295 */
296static void NullEntry(unsigned i)
297{
298 g_aCPInfo[i].CodePoint = i;
299 g_aCPInfo[i].fNullEntry = 1;
300 g_aCPInfo[i].pszName = "";
301 g_aCPInfo[i].SimpleUpperCaseMapping = i;
302 g_aCPInfo[i].SimpleLowerCaseMapping = i;
303 g_aCPInfo[i].SimpleTitleCaseMapping = i;
304 g_aCPInfo[i].pszGeneralCategory = "";
305 g_aCPInfo[i].pszCanonicalCombiningClass = "";
306 g_aCPInfo[i].pszBidiClass = "";
307 g_aCPInfo[i].pszDecompositionType = "";
308 g_aCPInfo[i].pszDecompositionMapping = "";
309 g_aCPInfo[i].pszNumericType = "";
310 g_aCPInfo[i].pszNumericValue = "";
311 g_aCPInfo[i].pszBidiMirrored = "";
312 g_aCPInfo[i].pszUnicode1Name = "";
313 g_aCPInfo[i].pszISOComment = "";
314}
315
316
317/**
318 * Read the UnicodeData.txt file.
319 * @returns 0 on success.
320 * @returns !0 on failure.
321 * @param pszFilename The name of the file.
322 */
323static int ReadUnicodeData(const char *pszFilename)
324{
325 /*
326 * Open input.
327 */
328 FILE *pFile = fopen(pszFilename, "r");
329 if (!pFile)
330 {
331 printf("uniread: failed to open '%s' for reading\n", pszFilename);
332 return 1;
333 }
334
335 /*
336 * Parse the input and spit out the output.
337 */
338 char szLine[4096];
339 RTUNICP i = 0;
340 while (fgets(szLine, sizeof(szLine), pFile) != NULL)
341 {
342 if (IsCommentOrBlankLine(szLine))
343 continue;
344
345 char *pszCurField;
346 char *pszCodePoint = FirstField(&pszCurField, StripLine(szLine)); /* 0 */
347 char *pszName = NextField(&pszCurField); /* 1 */
348 char *pszGeneralCategory = NextField(&pszCurField); /* 2 */
349 char *pszCanonicalCombiningClass = NextField(&pszCurField); /* 3 */
350 char *pszBidiClass = NextField(&pszCurField); /* 4 */
351 char *pszDecompositionType = NextField(&pszCurField); /* 5 */
352 char *pszDecompositionMapping = NextField(&pszCurField); /* 6 */
353 char *pszNumericType = NextField(&pszCurField); /* 7 */
354 char *pszNumericValue = NextField(&pszCurField); /* 8 */
355 char *pszBidiMirrored = NextField(&pszCurField); /* 9 */
356 char *pszUnicode1Name = NextField(&pszCurField); /* 10 */
357 char *pszISOComment = NextField(&pszCurField); /* 11 */
358 char *pszSimpleUpperCaseMapping = NextField(&pszCurField); /* 12 */
359 char *pszSimpleLowerCaseMapping = NextField(&pszCurField); /* 13 */
360 char *pszSimpleTitleCaseMapping = NextField(&pszCurField); /* 14 */
361
362 RTUNICP CodePoint = ToNum(pszCodePoint);
363 if (CodePoint >= RT_ELEMENTS(g_aCPInfo))
364 continue;
365
366 /* catchup? */
367 while (i < CodePoint)
368 NullEntry(i++);
369 if (i != CodePoint)
370 {
371 fprintf(stderr, "unitest: error: i=%d CodePoint=%u\n", i, CodePoint);
372 fclose(pFile);
373 return 1;
374 }
375
376 /* this one */
377 g_aCPInfo[i].CodePoint = i;
378 g_aCPInfo[i].fNullEntry = 0;
379 g_aCPInfo[i].pszName = DupStr(pszName);
380 g_aCPInfo[i].SimpleUpperCaseMapping = ToNumDefault(pszSimpleUpperCaseMapping, CodePoint);
381 g_aCPInfo[i].SimpleLowerCaseMapping = ToNumDefault(pszSimpleLowerCaseMapping, CodePoint);
382 g_aCPInfo[i].SimpleTitleCaseMapping = ToNumDefault(pszSimpleTitleCaseMapping, CodePoint);
383 g_aCPInfo[i].pszGeneralCategory = DupStr(pszGeneralCategory);
384 g_aCPInfo[i].pszCanonicalCombiningClass = DupStr(pszCanonicalCombiningClass);
385 g_aCPInfo[i].pszBidiClass = DupStr(pszBidiClass);
386 g_aCPInfo[i].pszDecompositionType = DupStr(pszDecompositionType);
387 g_aCPInfo[i].pszDecompositionMapping = DupStr(pszDecompositionMapping);
388 g_aCPInfo[i].pszNumericType = DupStr(pszNumericType);
389 g_aCPInfo[i].pszNumericValue = DupStr(pszNumericValue);
390 g_aCPInfo[i].pszBidiMirrored = DupStr(pszBidiMirrored);
391 g_aCPInfo[i].pszUnicode1Name = DupStr(pszUnicode1Name);
392 g_aCPInfo[i].pszISOComment = DupStr(pszISOComment);
393 i++;
394 }
395 /* catchup? */
396 while (i < RT_ELEMENTS(g_aCPInfo))
397 NullEntry(i++);
398 fclose(pFile);
399
400 return 0;
401}
402
403
404/**
405 * Applies a property to a code point.
406 *
407 * @param StartCP The code point.
408 * @param pszProperty The property name.
409 */
410static void ApplyProperty(RTUNICP StartCP, const char *pszProperty)
411{
412 if (StartCP >= RT_ELEMENTS(g_aCPInfo))
413 return;
414 struct CPINFO *pCPInfo = &g_aCPInfo[StartCP];
415 /* string switch */
416 if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1;
417 else if (!strcmp(pszProperty, "Bidi_Control")) pCPInfo->fBidiControl = 1;
418 else if (!strcmp(pszProperty, "Dash")) pCPInfo->fDash = 1;
419 else if (!strcmp(pszProperty, "Deprecated")) pCPInfo->fDeprecated = 1;
420 else if (!strcmp(pszProperty, "Diacritic")) pCPInfo->fDiacritic = 1;
421 else if (!strcmp(pszProperty, "Extender")) pCPInfo->fExtender = 1;
422 else if (!strcmp(pszProperty, "Grapheme_Link")) pCPInfo->fGraphemeLink = 1;
423 else if (!strcmp(pszProperty, "Hex_Digit")) pCPInfo->fHexDigit = 1;
424 else if (!strcmp(pszProperty, "Hyphen")) pCPInfo->fHyphen = 1;
425 else if (!strcmp(pszProperty, "Ideographic")) pCPInfo->fIdeographic = 1;
426 else if (!strcmp(pszProperty, "IDS_Binary_Operator")) pCPInfo->fIDSBinaryOperator = 1;
427 else if (!strcmp(pszProperty, "IDS_Trinary_Operator")) pCPInfo->fIDSTrinaryOperator = 1;
428 else if (!strcmp(pszProperty, "Join_Control")) pCPInfo->fJoinControl = 1;
429 else if (!strcmp(pszProperty, "Logical_Order_Exception")) pCPInfo->fLogicalOrderException = 1;
430 else if (!strcmp(pszProperty, "Noncharacter_Code_Point")) pCPInfo->fNoncharacterCodePoint = 1;
431 else if (!strcmp(pszProperty, "Other_Alphabetic")) pCPInfo->fOtherAlphabetic = 1;
432 else if (!strcmp(pszProperty, "Other_Default_Ignorable_Code_Point")) pCPInfo->fOtherDefaultIgnorableCodePoint = 1;
433 else if (!strcmp(pszProperty, "Other_Grapheme_Extend")) pCPInfo->fOtherGraphemeExtend = 1;
434 else if (!strcmp(pszProperty, "Other_ID_Continue")) pCPInfo->fOtherIDContinue = 1;
435 else if (!strcmp(pszProperty, "Other_ID_Start")) pCPInfo->fOtherIDStart = 1;
436 else if (!strcmp(pszProperty, "Other_Lowercase")) pCPInfo->fOtherLowercase = 1;
437 else if (!strcmp(pszProperty, "Other_Math")) pCPInfo->fOtherMath = 1;
438 else if (!strcmp(pszProperty, "Other_Uppercase")) pCPInfo->fOtherUppercase = 1;
439 else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1;
440 else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;
441 else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1;
442 else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1;
443 else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1;
444 else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1;
445 else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1;
446 else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1;
447 else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1;
448 else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1;
449 else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1;
450 else if (!strcmp(pszProperty, "Pattern_Syntax")) pCPInfo->fPatternSyntax = 1;
451 else if (!strcmp(pszProperty, "Pattern_White_Space")) pCPInfo->fPatternWhiteSpace = 1;
452 else if (!strcmp(pszProperty, "Quotation_Mark")) pCPInfo->fQuotationMark = 1;
453 else if (!strcmp(pszProperty, "Radical")) pCPInfo->fRadical = 1;
454 else if (!strcmp(pszProperty, "Soft_Dotted")) pCPInfo->fSoftDotted = 1;
455 else if (!strcmp(pszProperty, "STerm")) pCPInfo->fSTerm = 1;
456 else if (!strcmp(pszProperty, "Terminal_Punctuation")) pCPInfo->fTerminalPunctuation = 1;
457 else if (!strcmp(pszProperty, "Unified_Ideograph")) pCPInfo->fUnifiedIdeograph = 1;
458 else if (!strcmp(pszProperty, "Variation_Selector")) pCPInfo->fVariationSelector = 1;
459 else if (!strcmp(pszProperty, "White_Space")) pCPInfo->fWhiteSpace = 1;
460 else
461 fprintf(stderr, "uniread: Unknown property '%s'\n", pszProperty);
462}
463
464
465/**
466 * Reads a property file.
467 *
468 * There are several property files, this code can read all
469 * of those but will only make use of the properties it recognizes.
470 *
471 * @returns 0 on success.
472 * @returns !0 on failure.
473 * @param pszFilename The name of the file.
474 */
475static int ReadProperties(const char *pszFilename)
476{
477 /*
478 * Open input.
479 */
480 FILE *pFile = fopen(pszFilename, "r");
481 if (!pFile)
482 {
483 printf("uniread: failed to open '%s' for reading\n", pszFilename);
484 return 1;
485 }
486
487 /*
488 * Parse the input and spit out the output.
489 */
490 char szLine[4096];
491 while (fgets(szLine, sizeof(szLine), pFile) != NULL)
492 {
493 if (IsCommentOrBlankLine(szLine))
494 continue;
495 char *pszCurField;
496 char *pszRange = FirstField(&pszCurField, StripLine(szLine));
497 char *pszProperty = NextField(&pszCurField);
498 if (!*pszProperty)
499 continue;
500
501 RTUNICP LastCP;
502 RTUNICP StartCP = ToRange(pszRange, &LastCP);
503 if (StartCP == ~(RTUNICP)0)
504 continue;
505
506 while (StartCP <= LastCP)
507 ApplyProperty(StartCP++, pszProperty);
508 }
509
510 fclose(pFile);
511
512 return 0;
513}
514
515
/**
 * Appends a flag name to a flag expression string, inserting " | " between
 * flags when the string is not empty.
 *
 * The caller must provide a buffer large enough for the result; no bounds
 * checking is done here.
 *
 * @returns @a psz.
 * @param   psz         The flag expression built so far (may be empty).
 * @param   pszFlag     The flag name to append.
 */
static char *AppendFlag(char *psz, const char *pszFlag)
{
    size_t cchCur = strlen(psz);
    if (cchCur > 0)
    {
        memcpy(&psz[cchCur], " | ", 3);
        cchCur += 3;
    }
    strcpy(&psz[cchCur], pszFlag);
    return psz;
}
531
532/**
533 * Calcs the flags for a code point.
534 * @returns true if there is a flag.
535 * @returns false if the isn't.
536 */
537static bool CalcFlags(struct CPINFO *pInfo, char *pszFlags)
538{
539 pszFlags[0] = '\0';
540 /** @todo read the specs on this other vs standard stuff, and check out the finer points */
541 if (pInfo->fAlphabetic || pInfo->fOtherAlphabetic)
542 AppendFlag(pszFlags, "RTUNI_ALPHA");
543 if (pInfo->fHexDigit || pInfo->fASCIIHexDigit)
544 AppendFlag(pszFlags, "RTUNI_XDIGIT");
545 if (!strcmp(pInfo->pszGeneralCategory, "Nd"))
546 AppendFlag(pszFlags, "RTUNI_DDIGIT");
547 if (pInfo->fWhiteSpace)
548 AppendFlag(pszFlags, "RTUNI_WSPACE");
549 if (pInfo->fUppercase || pInfo->fOtherUppercase)
550 AppendFlag(pszFlags, "RTUNI_UPPER");
551 if (pInfo->fLowercase || pInfo->fOtherLowercase)
552 AppendFlag(pszFlags, "RTUNI_LOWER");
553 //if (pInfo->fNumeric)
554 // AppendFlag(pszFlags, "RTUNI_NUMERIC");
555 if (!*pszFlags)
556 {
557 pszFlags[0] = '0';
558 pszFlags[1] = '\0';
559 return false;
560 }
561 return true;
562}
563
/** The data store for stream two. */
static char g_szStream2[10240];
/** Number of characters currently buffered in g_szStream2 (terminator excluded). */
static unsigned g_offStream2 = 0;

/**
 * Initializes the 2nd stream, discarding anything buffered.
 */
static void Stream2Init(void)
{
    g_szStream2[0] = '\0';
    g_offStream2 = 0;
}

/**
 * Flushes the 2nd stream to stdout.
 *
 * The buffer is not reset; call Stream2Init() before reusing the stream.
 *
 * @returns 0 on success.
 * @returns !0 if not everything could be written.
 */
static int Stream2Flush(void)
{
    if (fwrite(g_szStream2, 1, g_offStream2, stdout) != g_offStream2)
    {
        fprintf(stderr, "error: failed writing stream2 to stdout!\n");
        return 1;
    }
    return 0;
}

/**
 * printf to the 2nd stream.
 *
 * The output is appended to the in-memory buffer.  Should it not fit, an error
 * is printed to stderr and the process exits with code 1; the formatting is
 * bounded, so nothing is ever written beyond the buffer.
 *
 * @returns Number of characters appended.
 * @param   pszFormat   The format string.
 * @param   ...         Format arguments.
 */
static int Stream2Printf(const char *pszFormat, ...)
{
    /* g_offStream2 is always below sizeof(g_szStream2), so cbLeft is at least 1. */
    size_t cbLeft = sizeof(g_szStream2) - g_offStream2;

    va_list va;
    va_start(va, pszFormat);
    int cch = vsnprintf(&g_szStream2[g_offStream2], cbLeft, pszFormat, va);
    va_end(va);

    if (cch < 0)
    {
        fprintf(stderr, "error: stream2 formatting failure!\n");
        exit(1);
    }
    /* vsnprintf returns the length it needed; >= cbLeft means the output was truncated. */
    if ((size_t)cch >= cbLeft)
    {
        fprintf(stderr, "error: stream2 overflow!\n");
        exit(1);
    }

    g_offStream2 += (unsigned)cch;
    return cch;
}
603
604
/**
 * Print the unidata.cpp file header and include list to stdout.
 *
 * The header consists of a Doxygen file comment naming the generator and its
 * build time stamp, the license comment, and the iprt/uni.h include.
 *
 * @returns 0.
 * @param   argv0       The program name to mention in the generated comment.
 */
int PrintHeader(const char *argv0)
{
    /*
     * Print file header.
     *
     * Note that the license comment must be closed before the #include line,
     * otherwise the include and the start of the first table end up inside
     * the comment in the generated file.
     */
    printf("/** @file\n"
           " *\n"
           " * IPRT - Unicode Tables\n"
           " *\n"
           " * Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n"
           " */\n\n"
           "/*\n"
           " * Copyright (C) 2006-2008 Sun Microsystems, Inc.\n"
           " *\n"
           " * This file is part of VirtualBox Open Source Edition (OSE), as\n"
           " * available from http://www.virtualbox.org. This file is free software;\n"
           " * you can redistribute it and/or modify it under the terms of the GNU\n"
           " * General Public License as published by the Free Software Foundation,\n"
           " * in version 2 as it comes in the \"COPYING\" file of the VirtualBox OSE\n"
           " * distribution. VirtualBox OSE is distributed in the hope that it will\n"
           " * be useful, but WITHOUT ANY WARRANTY of any kind.\n"
           " */\n"
           "\n"
           "#include <iprt/uni.h>\n"
           "\n",
           argv0);
    return 0;
}
636
637
638/**
639 * Print the flag tables.
640 */
641int PrintFlags(void)
642{
643 /*
644 * Print flags table.
645 */
646 Stream2Init();
647 Stream2Printf("const RTUNIFLAGSRANGE g_aRTUniFlagRanges[] =\n"
648 "{\n");
649 RTUNICP i = 0;
650 int iStart = -1;
651 while (i < RT_ELEMENTS(g_aCPInfo))
652 {
653 /* figure how far off the next chunk is */
654 char szFlags[256];
655 unsigned iNonNull = i;
656 while ( (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags))
657 && iNonNull < RT_ELEMENTS(g_aCPInfo)
658 && iNonNull >= 256)
659 iNonNull++;
660 if (iNonNull - i > 4096 || iNonNull == RT_ELEMENTS(g_aCPInfo))
661 {
662 if (iStart >= 0)
663 {
664 printf("};\n\n");
665 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniFlags0x%06x[0] },\n", iStart, i, iStart);
666 iStart = -1;
667 }
668 i = iNonNull;
669 }
670 else
671 {
672 if (iStart < 0)
673 {
674 printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"
675 "{\n", i);
676 iStart = i;
677 }
678 CalcFlags(&g_aCPInfo[i], szFlags);
679 printf(" %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
680 i++;
681 }
682 }
683 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
684 "};\n\n\n");
685 printf("\n");
686 return Stream2Flush();
687}
688
689
690/**
691 * Prints the upper case tables.
692 */
693static int PrintUpper(void)
694{
695 Stream2Init();
696 Stream2Printf("const RTUNICASERANGE g_aRTUniUpperRanges[] =\n"
697 "{\n");
698 RTUNICP i = 0;
699 int iStart = -1;
700 while (i < RT_ELEMENTS(g_aCPInfo))
701 {
702 /* figure how far off the next chunk is */
703 unsigned iSameCase = i;
704 while ( g_aCPInfo[iSameCase].SimpleUpperCaseMapping == g_aCPInfo[iSameCase].CodePoint
705 && iSameCase < RT_ELEMENTS(g_aCPInfo)
706 && iSameCase >= 256)
707 iSameCase++;
708 if (iSameCase - i > 4096/sizeof(RTUNICP) || iSameCase == RT_ELEMENTS(g_aCPInfo))
709 {
710 if (iStart >= 0)
711 {
712 printf("};\n\n");
713 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniUpper0x%06x[0] },\n", iStart, i, iStart);
714 iStart = -1;
715 }
716 i = iSameCase;
717 }
718 else
719 {
720 if (iStart < 0)
721 {
722 printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"
723 "{\n", i);
724 iStart = i;
725 }
726 printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
727 i++;
728 }
729 }
730 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
731 "};\n\n\n");
732 printf("\n");
733 return Stream2Flush();
734}
735
736
737/**
738 * Prints the lowercase tables.
739 */
740static int PrintLower(void)
741{
742 Stream2Init();
743 Stream2Printf("const RTUNICASERANGE g_aRTUniLowerRanges[] =\n"
744 "{\n");
745 RTUNICP i = 0;
746 int iStart = -1;
747 while (i < RT_ELEMENTS(g_aCPInfo))
748 {
749 /* figure how far off the next chunk is */
750 unsigned iSameCase = i;
751 while ( g_aCPInfo[iSameCase].SimpleLowerCaseMapping == g_aCPInfo[iSameCase].CodePoint
752 && iSameCase < RT_ELEMENTS(g_aCPInfo)
753 && iSameCase >= 256)
754 iSameCase++;
755 if (iSameCase - i > 4096/sizeof(RTUNICP) || iSameCase == RT_ELEMENTS(g_aCPInfo))
756 {
757 if (iStart >= 0)
758 {
759 printf("};\n\n");
760 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniLower0x%06x[0] },\n", iStart, i, iStart);
761 iStart = -1;
762 }
763 i = iSameCase;
764 }
765 else
766 {
767 if (iStart < 0)
768 {
769 printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"
770 "{\n", i);
771 iStart = i;
772 }
773 printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
774 i++;
775 }
776 }
777 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
778 "};\n\n\n");
779 printf("\n");
780 return Stream2Flush();
781}
782
783
784int main(int argc, char **argv)
785{
786 /*
787 * Parse args.
788 */
789 if (argc <= 1)
790 {
791 printf("usage: %s [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt]]]\n", argv[0]);
792 return 1;
793 }
794
795 const char *pszUnicodeData = "UnicodeData.txt";
796 const char *pszDerivedCoreProperties = "DerivedCoreProperties.txt";
797 const char *pszPropList = "PropList.txt";
798 int iFile = 0;
799 for (int argi = 1; argi < argc; argi++)
800 {
801 if (argv[argi][0] != '-')
802 {
803 switch (iFile++)
804 {
805 case 0: pszUnicodeData = argv[argi]; break;
806 case 1: pszDerivedCoreProperties = argv[argi]; break;
807 case 2: pszPropList = argv[argi]; break;
808 default:
809 printf("uniread: syntax error at '%s': too many filenames\n", argv[argi]);
810 return 1;
811 }
812 }
813 else
814 {
815 printf("uniread: syntax error at '%s': Unknown argument\n", argv[argi]);
816 return 1;
817 }
818 }
819
820 /*
821 * Read the data.
822 */
823 int rc = ReadUnicodeData(pszUnicodeData);
824 if (rc)
825 return rc;
826 rc = ReadProperties(pszPropList);
827 if (rc)
828 return rc;
829 rc = ReadProperties(pszDerivedCoreProperties);
830 if (rc)
831 return rc;
832
833 /*
834 * Print stuff.
835 */
836 rc = PrintHeader(argv[0]);
837 if (rc)
838 return rc;
839 rc = PrintFlags();
840 if (rc)
841 return rc;
842 rc = PrintUpper();
843 if (rc)
844 return rc;
845 rc = PrintLower();
846 if (rc)
847 return rc;
848
849 /* done */
850 fflush(stdout);
851
852 return rc;
853}
854
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette