VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 96407

Last change on this file since 96407 was 96407, checked in by vboxsync, 2 years ago

scm copyright and license note update

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 33.9 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 96407 2022-08-22 17:43:14Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2022 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*
39 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
40 */
41#if defined(__cplusplus) && defined(IN_RING3)
42
43
44/*********************************************************************************************************************************
45* Defined Constants And Macros *
46*********************************************************************************************************************************/
47/** @def BLDPROG_STRTAB_MAX_STRLEN
48 * The max length of strings in the table. */
49#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
50# define BLDPROG_STRTAB_MAX_STRLEN 256
51#endif
52
53/** @def BLDPROG_STRTAB_WITH_COMPRESSION
54 * Enables very simple string compression.
55 */
56#if defined(DOXYGEN_RUNNING)
57# define BLDPROG_STRTAB_WITH_COMPRESSION
58#endif
59
60/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
61 * Modifies the string compression to look for camel case words.
62 */
63#if defined(DOXYGEN_RUNNING)
64# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
65#endif
66
67/** @def BLDPROG_STRTAB_PURE_ASCII
68 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
69 * is defined. Otherwise, the compression code will require IPRT to link.
70 */
71#if defined(DOXYGEN_RUNNING)
72# define BLDPROG_STRTAB_PURE_ASCII
73#endif
74
75
76
77/*********************************************************************************************************************************
78* Header Files *
79*********************************************************************************************************************************/
80#include <iprt/stdarg.h>
81#include <iprt/ctype.h>
82#include <stdlib.h>
83#include <stdio.h>
84#include <string.h>
85
86#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
87# include <algorithm>
88# include <map>
89# include <iprt/sanitized/string>
90# include <vector>
91
92typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
93typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
94
95#endif
96
97#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
98
99
100/*********************************************************************************************************************************
101* Structures and Typedefs *
102*********************************************************************************************************************************/
103/**
104 * Build table string.
105 */
106typedef struct BLDPROGSTRING
107{
108 /** The string.
109 * @note This may be modified or replaced (allocated from heap) when
110 * compressing the string table. */
111 char *pszString;
112 /** The string hash value. */
113 uint32_t uHash;
114 /** The strint table offset. */
115 uint32_t offStrTab;
116 /** The string length. */
117 size_t cchString;
118 /** Pointer to the next string reference (same string table entry). */
119 struct BLDPROGSTRING *pNextRef;
120 /** Pointer to the next string with the same hash value (collision). */
121 struct BLDPROGSTRING *pNextCollision;
122
123} BLDPROGSTRING;
124/** Pointer to a string table string. */
125typedef BLDPROGSTRING *PBLDPROGSTRING;
126
127
128/** String table data. */
129typedef struct BLDPROGSTRTAB
130{
131 /** The size of g_papStrHash. */
132 size_t cStrHash;
133 /** String hash table. */
134 PBLDPROGSTRING *papStrHash;
135 /** Duplicate strings found by AddString. */
136 size_t cDuplicateStrings;
137 /** Total length of the unique strings (no terminators). */
138 size_t cchUniqueStrings;
139 /** Number of unique strings after AddString. */
140 size_t cUniqueStrings;
141 /** Number of collisions. */
142 size_t cCollisions;
143
144 /** Number of entries in apSortedStrings. */
145 size_t cSortedStrings;
146 /** The sorted string table. */
147 PBLDPROGSTRING *papSortedStrings;
148
149#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
150 /** The 127 words we've picked to be indexed by reference. */
151 BLDPROGSTRING aCompDict[127];
152 /** The frequency of the 127 dictionary entries. */
153 size_t auCompDictFreq[127];
154 /** Incoming strings pending compression. */
155 PBLDPROGSTRING *papPendingStrings;
156 /** Current number of entries in papStrPending. */
157 size_t cPendingStrings;
158 /** The allocated size of papPendingStrings. */
159 size_t cMaxPendingStrings;
160 /** Work frequency map.
161 * @todo rewrite in plain C. */
162 BLDPROGWORDFREQMAP Frequencies;
163#endif
164
165 /** The string table. */
166 char *pachStrTab;
167 /** The actual string table size. */
168 size_t cchStrTab;
169} BLDPROGSTRTAB;
170typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
171
172#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
173# pragma GCC diagnostic push
174# pragma GCC diagnostic ignored "-Wunused-function"
175#endif
176
177
178/**
179 * Initializes the strint table compiler.
180 *
181 * @returns success indicator (out of memory if false).
182 * @param pThis The strint table compiler instance.
183 * @param cMaxStrings The max number of strings we'll be adding.
184 */
185static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
186{
187 pThis->cStrHash = 0;
188 pThis->papStrHash = NULL;
189 pThis->cDuplicateStrings = 0;
190 pThis->cchUniqueStrings = 0;
191 pThis->cUniqueStrings = 0;
192 pThis->cCollisions = 0;
193 pThis->cSortedStrings = 0;
194 pThis->papSortedStrings = NULL;
195 pThis->pachStrTab = NULL;
196 pThis->cchStrTab = 0;
197#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
198 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
199 pThis->papPendingStrings = NULL;
200 pThis->cPendingStrings = 0;
201 pThis->cMaxPendingStrings = cMaxStrings;
202#endif
203
204 /*
205 * Allocate a hash table double the size of all strings (to avoid too
206 * many collisions). Add all strings to it, finding duplicates in the
207 * process.
208 */
209#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
210 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
211#endif
212 cMaxStrings *= 2;
213 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
214 if (pThis->papStrHash)
215 {
216 pThis->cStrHash = cMaxStrings;
217#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
218 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
219 if (pThis->papPendingStrings)
220#endif
221 return true;
222
223#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
224 free(pThis->papStrHash);
225 pThis->papStrHash = NULL;
226#endif
227 }
228 return false;
229}
230
231
232#if 0 /* unused */
233static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
234{
235 free(pThis->papStrHash);
236 free(pThis->papSortedStrings);
237 free(pThis->pachStrTab);
238# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
239 free(pThis->papPendingStrings);
240# endif
241 memset(pThis, 0, sizeof(*pThis));
242}
243#endif
244
245#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
246
247DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
248{
249 /*
250 * Skip leading word separators.
251 */
252# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
253 while ( ch == ' '
254 || ch == '-'
255 || ch == '+'
256 || ch == '_')
257# else
258 while (ch == ' ')
259# endif
260 ch = *++pszSrc;
261 if (ch)
262 {
263 /*
264 * Find end of word.
265 */
266 size_t cchWord = 1;
267# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
268 char chPrev = ch;
269 while ( (ch = pszSrc[cchWord]) != ' '
270 && ch != '\0'
271 && ch != '-'
272 && ch != '+'
273 && ch != '_'
274 && ( ch == chPrev
275 || !RT_C_IS_UPPER(ch)
276 || RT_C_IS_UPPER(chPrev)) )
277 {
278 chPrev = ch;
279 cchWord++;
280 }
281# else
282 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
283 cchWord++;
284# endif
285 *ppszSrc = pszSrc;
286 return cchWord;
287 }
288
289 *ppszSrc = pszSrc;
290 return 0;
291}
292
293
294/**
295 * Analyzes a string.
296 *
297 * @param pThis The strint table compiler instance.
298 * @param pStr The string to analyze.
299 */
300static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
301{
302 const char *psz = pStr->pszString;
303
304 /*
305 * For now we just consider words.
306 */
307 char ch;
308 while ((ch = *psz) != '\0')
309 {
310 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
311 if (cchWord > 1)
312 {
313 std::string strWord(psz, cchWord);
314 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
315 if (it != pThis->Frequencies.end())
316 it->second += cchWord - 1;
317 else
318 pThis->Frequencies[strWord] = 0;
319
320 /** @todo could gain hits by including the space after the word, but that
321 * has the same accounting problems as the two words scenario below. */
322
323# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
324 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
325 if (ch == ' ')
326 {
327 ch = psz[++cchWord];
328 if (ch != ' ' && ch != '\0')
329 {
330 size_t const cchSaved = cchWord;
331
332 do
333 cchWord++;
334 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
335
336 strWord = std::string(psz, cchWord);
337 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
338 if (it != pThis->Frequencies.end())
339 it->second += cchWord - 1;
340 else
341 pThis->Frequencies[strWord] = 0;
342
343 cchWord = cchSaved;
344 }
345 }
346# endif
347 }
348 else if (!cchWord)
349 break;
350
351 /* Advance. */
352 psz += cchWord;
353 }
354 pStr->cchString = psz - pStr->pszString;
355 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
356 {
357 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
358 abort();
359 }
360}
361
362#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
363
364/**
365 * Adds a string to the hash table.
366 * @param pThis The strint table compiler instance.
367 * @param pStr The string.
368 */
369static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
370{
371 pStr->pNextRef = NULL;
372 pStr->pNextCollision = NULL;
373 pStr->offStrTab = 0;
374 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
375 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
376 {
377 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
378 exit(RTEXITCODE_FAILURE);
379 }
380
381 size_t idxHash = pStr->uHash % pThis->cStrHash;
382 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
383 if (!pCur)
384 pThis->papStrHash[idxHash] = pStr;
385 else
386 {
387 /* Look for matching string. */
388 do
389 {
390 if ( pCur->uHash == pStr->uHash
391 && pCur->cchString == pStr->cchString
392 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
393 {
394 pStr->pNextRef = pCur->pNextRef;
395 pCur->pNextRef = pStr;
396 pThis->cDuplicateStrings++;
397 return;
398 }
399 pCur = pCur->pNextCollision;
400 } while (pCur != NULL);
401
402 /* No matching string, insert. */
403 pThis->cCollisions++;
404 pStr->pNextCollision = pThis->papStrHash[idxHash];
405 pThis->papStrHash[idxHash] = pStr;
406 }
407
408 pThis->cUniqueStrings++;
409 pThis->cchUniqueStrings += pStr->cchString;
410}
411
412
413/**
414 * Adds a string to the string table.
415 *
416 * @param pThis The strint table compiler instance.
417 * @param pStr The string.
418 */
419static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
420{
421#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
422 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
423 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
424 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
425 else
426 abort();
427#else
428 bldProgStrTab_AddStringToHashTab(pThis, pStr);
429#endif
430}
431
432
433/**
434 * Adds a string to the string table.
435 *
436 * @param pThis The strint table compiler instance.
437 * @param pStr The string entry (uninitialized).
438 * @param psz The string, will be duplicated if compression is enabled.
439 */
440DECLINLINE(void) BldProgStrTab_AddStringDup(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr, const char *psz)
441{
442#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
443 pStr->pszString = strdup(psz);
444 if (!pStr->pszString)
445 abort();
446#else
447 pStr->pszString = (char *)psz;
448#endif
449 BldProgStrTab_AddString(pThis, pStr);
450}
451
452#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
453
454/**
455 * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special
456 * sequences.
457 *
458 * @returns New @a pszDst position, NULL if invalid source encoding.
459 * @param pszDst The destination buffer.
460 * @param pszSrc The source buffer.
461 * @param cchSrc How much to copy.
462 */
463static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc)
464{
465 while (cchSrc-- > 0)
466 {
467 char ch = *pszSrc;
468 if (!((unsigned char)ch & 0x80))
469 {
470 *pszDst++ = ch;
471 pszSrc++;
472 }
473 else
474 {
475# ifdef BLDPROG_STRTAB_PURE_ASCII
476 fprintf(stderr, "error: unexpected char value %#x\n", ch);
477 return NULL;
478# else
479 RTUNICP uc;
480 int rc = RTStrGetCpEx(&pszSrc, &uc);
481 if (RT_SUCCESS(rc))
482 {
483 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
484 pszDst = RTStrPutCp(pszDst, uc);
485 }
486 else
487 {
488 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
489 return NULL;
490 }
491# endif
492 }
493 }
494 return pszDst;
495}
496
497
498/**
499 * Replaces the dictionary words and escapes non-ascii chars in a string.
500 *
501 * @param pThis The strint table compiler instance.
502 * @param pString The string to fixup.
503 */
504static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
505{
506 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
507 char *pszDst = szNew;
508 const char *pszSrc = pStr->pszString;
509 const char *pszSrcEnd = pszSrc + pStr->cchString;
510
511 char ch;
512 while ((ch = *pszSrc) != '\0')
513 {
514 const char * const pszSrcUncompressed = pszSrc;
515 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
516 size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed;
517 if (cchSrcUncompressed > 0)
518 {
519 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed);
520 if (!pszDst)
521 return false;
522 }
523 if (!cchWord)
524 break;
525
526 /* Check for g_aWord matches. */
527 size_t cchMax = pszSrcEnd - pszSrc;
528 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
529 {
530 size_t cchLen = pThis->aCompDict[i].cchString;
531 if ( cchLen >= cchWord
532 && cchLen <= cchMax
533 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
534 {
535 *pszDst++ = (unsigned char)(0x80 | i);
536 pszSrc += cchLen;
537 cchWord = 0;
538 break;
539 }
540 }
541
542 if (cchWord)
543 {
544 /* Copy the current word. */
545 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord);
546 if (!pszDst)
547 return false;
548 pszSrc += cchWord;
549 }
550 }
551
552 /* Just terminate it now. */
553 *pszDst = '\0';
554
555 /*
556 * Update the string.
557 */
558 size_t cchNew = pszDst - &szNew[0];
559 if (cchNew > pStr->cchString)
560 {
561 pStr->pszString = (char *)malloc(cchNew + 1);
562 if (!pStr->pszString)
563 {
564 fprintf(stderr, "Out of memory!\n");
565 return false;
566 }
567 }
568 memcpy(pStr->pszString, szNew, cchNew + 1);
569 pStr->cchString = cchNew;
570
571 return true;
572}
573
574
575/**
576 * For sorting the frequency fidning in descending order.
577 *
578 * Comparison operators are put outside to make older gcc versions (like 4.1.1
579 * on lnx64-rel) happy.
580 */
581class WordFreqSortEntry
582{
583public:
584 BLDPROGWORDFREQPAIR const *m_pPair;
585
586public:
587 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
588};
589
590bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
591{
592 return rLeft.m_pPair->second == rRight.m_pPair->second;
593}
594
595bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
596{
597 return rLeft.m_pPair->second > rRight.m_pPair->second;
598}
599
600
601/**
602 * Compresses the vendor and product strings.
603 *
604 * This is very very simple (a lot less work than the string table for
605 * instance).
606 */
607static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
608{
609 /*
610 * Sort the frequency analyzis result and pick the top 127 ones.
611 */
612 std::vector<WordFreqSortEntry> SortMap;
613 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
614 {
615 BLDPROGWORDFREQPAIR const &rPair = *it;
616 SortMap.push_back(WordFreqSortEntry(&rPair));
617 }
618
619 sort(SortMap.begin(), SortMap.end());
620
621 size_t cb = 0;
622 size_t i = 0;
623 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
624 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
625 ++it, i++)
626 {
627 pThis->auCompDictFreq[i] = it->m_pPair->second;
628 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
629 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
630 if (pThis->aCompDict[i].pszString)
631 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
632 else
633 return false;
634 cb += it->m_pPair->second;
635 }
636
637 if (fVerbose)
638 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
639
640 /*
641 * Rework the strings.
642 */
643 size_t cchOld = 0;
644 size_t cchOldMax = 0;
645 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
646 size_t cchNew = 0;
647 size_t cchNewMax = 0;
648 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
649 i = pThis->cPendingStrings;
650 while (i-- > 0)
651 {
652 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
653 cchOld += pCurStr->cchString;
654 if (pCurStr->cchString > cchOldMax)
655 cchOldMax = pCurStr->cchString;
656 if (pCurStr->cchString < cchOldMin)
657 cchOldMin = pCurStr->cchString;
658
659 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
660 return false;
661
662 cchNew += pCurStr->cchString;
663 if (pCurStr->cchString > cchNewMax)
664 cchNewMax = pCurStr->cchString;
665 if (pCurStr->cchString < cchNewMin)
666 cchNewMin = pCurStr->cchString;
667
668 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
669 }
670
671 /*
672 * Do debug stats.
673 */
674 if (fVerbose)
675 {
676 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
677 cchNew += pThis->aCompDict[i].cchString + 1;
678
679 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
680 if (cchNew < cchOld)
681 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
682 else
683 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
684 printf("debug: Original string lengths: average %u; min %u; max %u\n",
685 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
686 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
687 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
688 }
689
690 return true;
691}
692
693#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
694
695/**
696 * Inserts a string into g_apUniqueStrings.
697 * @param pThis The strint table compiler instance.
698 * @param pStr The string.
699 */
700static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
701{
702 size_t iIdx;
703 size_t iEnd = pThis->cSortedStrings;
704 if (iEnd)
705 {
706 size_t iStart = 0;
707 for (;;)
708 {
709 iIdx = iStart + (iEnd - iStart) / 2;
710 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
711 {
712 if (iIdx <= iStart)
713 break;
714 iEnd = iIdx;
715 }
716 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
717 {
718 if (++iIdx >= iEnd)
719 break;
720 iStart = iIdx;
721 }
722 else
723 break;
724 }
725
726 if (iIdx != pThis->cSortedStrings)
727 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
728 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
729 }
730 else
731 iIdx = 0;
732
733 pThis->papSortedStrings[iIdx] = pStr;
734 pThis->cSortedStrings++;
735}
736
737
738/**
739 * Compiles the string table after the string has been added.
740 *
741 * This will save space by dropping string terminators, eliminating duplicates
742 * and try find strings that are sub-strings of others.
743 *
744 * Will initialize the StrRef of all StrTabString instances.
745 *
746 * @returns success indicator (error flagged).
747 * @param pThis The strint table compiler instance.
748 */
749static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
750{
751#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
752 /*
753 * Do the compression and add all the compressed strings to the table.
754 */
755 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
756 return false;
757
758 /*
759 * Add the dictionary strings.
760 */
761 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
762 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
763#endif
764 if (fVerbose)
765 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
766 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
767 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
768
769 /*
770 * Create papSortedStrings from the hash table. The table is sorted by
771 * string length, with the longer strings first, so that we increase our
772 * chances of locating duplicate substrings.
773 */
774 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
775 if (!pThis->papSortedStrings)
776 return false;
777 pThis->cSortedStrings = 0;
778 size_t idxHash = pThis->cStrHash;
779 while (idxHash-- > 0)
780 {
781 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
782 if (pCur)
783 {
784 do
785 {
786 bldProgStrTab_InsertUniqueString(pThis, pCur);
787 pCur = pCur->pNextCollision;
788 } while (pCur);
789 }
790 }
791
792 /*
793 * Create the actual string table.
794 */
795 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
796 if (!pThis->pachStrTab)
797 return false;
798 pThis->cchStrTab = 0;
799 for (size_t i = 0; i < pThis->cSortedStrings; i++)
800 {
801 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
802 const char * const pszCur = pCur->pszString;
803 size_t const cchCur = pCur->cchString;
804 size_t offStrTab = pThis->cchStrTab;
805
806 /*
807 * See if the string is a substring already found in the string table.
808 * Excluding the zero terminator increases the chances for this.
809 */
810 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
811 const char *pchLeft = pThis->pachStrTab;
812 char const chFirst = *pszCur;
813 while (cchLeft > 0)
814 {
815 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
816 if (!pchCandidate)
817 break;
818 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
819 {
820 offStrTab = pchCandidate - pThis->pachStrTab;
821 break;
822 }
823
824 cchLeft -= pchCandidate + 1 - pchLeft;
825 pchLeft = pchCandidate + 1;
826 }
827
828 if (offStrTab == pThis->cchStrTab)
829 {
830 /*
831 * See if the start of the string overlaps the end of the string table.
832 */
833 if (pThis->cchStrTab && cchCur > 1)
834 {
835 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
836 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
837 while (cchLeft > 0)
838 {
839 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
840 if (!pchCandidate)
841 break;
842 cchLeft -= pchCandidate - pchLeft;
843 pchLeft = pchCandidate;
844 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
845 {
846 size_t cchToCopy = cchCur - cchLeft;
847 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
848 pThis->cchStrTab += cchToCopy;
849 offStrTab = pchCandidate - pThis->pachStrTab;
850 break;
851 }
852 cchLeft--;
853 pchLeft++;
854 }
855 }
856
857 /*
858 * If we didn't have any luck above, just append the string.
859 */
860 if (offStrTab == pThis->cchStrTab)
861 {
862 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
863 pThis->cchStrTab += cchCur;
864 }
865 }
866
867 /*
868 * Set the string table offset for all the references to this string.
869 */
870 do
871 {
872 pCur->offStrTab = (uint32_t)offStrTab;
873 pCur = pCur->pNextRef;
874 } while (pCur != NULL);
875 }
876
877 if (fVerbose)
878 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
879 return true;
880}
881
882
883#ifdef RT_STRICT
884/**
885 * Sanity checks a string table string.
886 * @param pThis The strint table compiler instance.
887 * @param pStr The string to check.
888 */
889static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
890{
891 if (pStr->cchString != strlen(pStr->pszString))
892 abort();
893 if (pStr->offStrTab >= pThis->cchStrTab)
894 abort();
895 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
896 abort();
897 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
898 abort();
899}
900#endif
901
902
903/**
904 * Output the string table string in C string litteral fashion.
905 *
906 * @param pThis The strint table instance.
907 * @param pString The string to print.
908 * @param pOut The output stream.
909 */
910static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
911{
912 const unsigned char *psz = (const unsigned char *)pString->pszString;
913 unsigned char uch;
914 while ((uch = *psz++) != '\0')
915 {
916 if (!(uch & 0x80))
917 {
918 if (uch != '\'' && uch != '\\')
919 fputc((char)uch, pOut);
920 else
921 {
922 fputc('\\', pOut);
923 fputc((char)uch, pOut);
924 }
925 }
926#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
927 else if (uch != 0xff)
928 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
929 else
930 {
931# ifdef BLDPROG_STRTAB_PURE_ASCII
932 abort();
933# else
934 RTUNICP uc = RTStrGetCp((const char *)psz);
935 psz += RTStrCpSize(uc);
936 fprintf(pOut, "\\u%04x", uc);
937# endif
938 }
939#else
940 else
941 fprintf(pOut, "\\x%02x", (unsigned)uch);
942 NOREF(pThis);
943#endif
944 }
945}
946
947
948/**
949 * Writes the string table code to the output stream.
950 *
951 * @param pThis The strint table compiler instance.
952 * @param pOut The output stream.
953 * @param pszScope The scoping ("static " or empty string),
954 * including trailing space.
955 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
956 * whereas C++ classes normally use "class::s_g".
957 * @param pszBaseName The base name for the variables. The user
958 * accessible variable will end up as just the
959 * prefix and basename concatenated.
960 */
961static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
962 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
963{
964#ifdef RT_STRICT
965 /*
966 * Do some quick sanity checks while we're here.
967 */
968# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
969 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
970 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
971# endif
972#endif
973
974 /*
975 * Create a table for speeding up the character categorization.
976 */
977 uint8_t abCharCat[256];
978 memset(abCharCat, 0, sizeof(abCharCat));
979 abCharCat[(unsigned char)'\\'] = 1;
980 abCharCat[(unsigned char)'\''] = 1;
981 for (unsigned i = 0; i < 0x20; i++)
982 abCharCat[i] = 2;
983 for (unsigned i = 0x7f; i < 0x100; i++)
984 abCharCat[i] = 2;
985
986 /*
987 * We follow the sorted string table, one string per line.
988 */
989 fprintf(pOut,
990 "#include <iprt/bldprog-strtab.h>\n"
991 "\n"
992 "static const char g_achStrTab%s[] =\n"
993 "{\n",
994 pszBaseName);
995
996 uint32_t off = 0;
997 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
998 {
999 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
1000 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
1001 if (offEnd > off)
1002 {
1003 /* Comment with a uncompressed and more readable version of the string. */
1004 if (off == pCur->offStrTab)
1005 fprintf(pOut, "/* 0x%05x = \"", off);
1006 else
1007 fprintf(pOut, "/* 0X%05x = \"", off);
1008 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
1009 fputs("\" */\n", pOut);
1010
1011 /* Must use char by char here or we may trigger the max string
1012 length limit in the compiler, */
1013 fputs(" ", pOut);
1014 for (; off < offEnd; off++)
1015 {
1016 unsigned char uch = pThis->pachStrTab[off];
1017 fputc('\'', pOut);
1018 if (abCharCat[uch] == 0)
1019 fputc(uch, pOut);
1020 else if (abCharCat[uch] != 1)
1021 fprintf(pOut, "\\x%02x", (unsigned)uch);
1022 else
1023 {
1024 fputc('\\', pOut);
1025 fputc(uch, pOut);
1026 }
1027 fputc('\'', pOut);
1028 fputc(',', pOut);
1029 }
1030 fputc('\n', pOut);
1031 }
1032 }
1033
1034 fprintf(pOut,
1035 "};\n"
1036 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
1037 pszBaseName, (unsigned)pThis->cchStrTab);
1038
1039#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1040 /*
1041 * Write the compression dictionary.
1042 */
1043 fprintf(pOut,
1044 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
1045 "{\n",
1046 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
1047 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
1048 fprintf(pOut, " { %#08x, %#04x }, // %6lu - %s\n",
1049 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString,
1050 (unsigned long)pThis->auCompDictFreq[i], pThis->aCompDict[i].pszString);
1051 fprintf(pOut, "};\n\n");
1052#endif
1053
1054
1055 /*
1056 * Write the string table data structure.
1057 */
1058 fprintf(pOut,
1059 "%sconst RTBLDPROGSTRTAB %s%s = \n"
1060 "{\n"
1061 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
1062 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
1063 ,
1064 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
1065#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1066 fprintf(pOut,
1067 " /*.cCompDict = */ %u,\n"
1068 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1069 "};\n"
1070 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1071#else
1072 fprintf(pOut,
1073 " /*.cCompDict = */ 0,\n"
1074 " /*.paCompDict = */ NULL\n"
1075 "};\n");
1076#endif
1077}
1078
1079#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
1080# pragma GCC diagnostic pop
1081#endif
1082
1083#endif /* __cplusplus && IN_RING3 */
1084
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette