1 | /* enough.c -- determine the maximum size of inflate's Huffman code tables over
|
---|
2 | * all possible valid and complete prefix codes, subject to a length limit.
|
---|
3 | * Copyright (C) 2007, 2008, 2012, 2018 Mark Adler
|
---|
4 | * Version 1.5 5 August 2018 Mark Adler
|
---|
5 | */
|
---|
6 |
|
---|
7 | /* Version history:
|
---|
8 | 1.0 3 Jan 2007 First version (derived from codecount.c version 1.4)
|
---|
9 | 1.1 4 Jan 2007 Use faster incremental table usage computation
|
---|
10 | Prune examine() search on previously visited states
|
---|
11 | 1.2 5 Jan 2007 Comments clean up
|
---|
12 | As inflate does, decrease root for short codes
|
---|
13 | Refuse cases where inflate would increase root
|
---|
14 | 1.3 17 Feb 2008 Add argument for initial root table size
|
---|
15 | Fix bug for initial root table size == max - 1
|
---|
16 | Use a macro to compute the history index
|
---|
17 | 1.4 18 Aug 2012 Avoid shifts more than bits in type (caused endless loop!)
|
---|
18 | Clean up comparisons of different types
|
---|
19 | Clean up code indentation
|
---|
20 | 1.5 5 Aug 2018 Clean up code style, formatting, and comments
|
---|
21 | Show all the codes for the maximum, and only the maximum
|
---|
22 | */
|
---|
23 |
|
---|
24 | /*
|
---|
25 | Examine all possible prefix codes for a given number of symbols and a
|
---|
26 | maximum code length in bits to determine the maximum table size for zlib's
|
---|
27 | inflate. Only complete prefix codes are counted.
|
---|
28 |
|
---|
29 | Two codes are considered distinct if the vectors of the number of codes per
|
---|
30 | length are not identical. So permutations of the symbol assignments result
|
---|
31 | in the same code for the counting, as do permutations of the assignments of
|
---|
32 | the bit values to the codes (i.e. only canonical codes are counted).
|
---|
33 |
|
---|
34 | We build a code from shorter to longer lengths, determining how many symbols
|
---|
35 | are coded at each length. At each step, we have how many symbols remain to
|
---|
36 | be coded, what the last code length used was, and how many bit patterns of
|
---|
37 | that length remain unused. Then we add one to the code length and double the
|
---|
38 | number of unused patterns to graduate to the next code length. We then
|
---|
39 | assign all portions of the remaining symbols to that code length that
|
---|
40 | preserve the properties of a correct and eventually complete code. Those
|
---|
41 | properties are: we cannot use more bit patterns than are available; and when
|
---|
42 | all the symbols are used, there are exactly zero possible bit patterns left
|
---|
43 | unused.
|
---|
44 |
|
---|
45 | The inflate Huffman decoding algorithm uses two-level lookup tables for
|
---|
46 | speed. There is a single first-level table to decode codes up to root bits
|
---|
47 | in length (root == 9 for literal/length codes and root == 6 for distance
|
---|
48 | codes, in the current inflate implementation). The base table has 1 << root
|
---|
49 | entries and is indexed by the next root bits of input. Codes shorter than
|
---|
50 | root bits have replicated table entries, so that the correct entry is
|
---|
51 | pointed to regardless of the bits that follow the short code. If the code is
|
---|
52 | longer than root bits, then the table entry points to a second-level table.
|
---|
53 | The size of that table is determined by the longest code with that root-bit
|
---|
54 | prefix. If that longest code has length len, then the table has size 1 <<
|
---|
55 | (len - root), to index the remaining bits in that set of codes. Each
|
---|
56 | subsequent root-bit prefix then has its own sub-table. The total number of
|
---|
57 | table entries required by the code is calculated incrementally as the number
|
---|
58 | of codes at each bit length is populated. When all of the codes are shorter
|
---|
59 | than root bits, then root is reduced to the longest code length, resulting
|
---|
60 | in a single, smaller, one-level table.
|
---|
61 |
|
---|
62 | The inflate algorithm also provides for small values of root (relative to
|
---|
63 | the log2 of the number of symbols), where the shortest code has more bits
|
---|
64 | than root. In that case, root is increased to the length of the shortest
|
---|
65 | code. This program, by design, does not handle that case, so it is verified
|
---|
66 | that the number of symbols is less than 1 << (root + 1).
|
---|
67 |
|
---|
68 | In order to speed up the examination (by about ten orders of magnitude for
|
---|
69 | the default arguments), the intermediate states in the build-up of a code
|
---|
70 | are remembered and previously visited branches are pruned. The memory
|
---|
71 | required for this will increase rapidly with the total number of symbols and
|
---|
72 | the maximum code length in bits. However this is a very small price to pay
|
---|
73 | for the vast speedup.
|
---|
74 |
|
---|
75 | First, all of the possible prefix codes are counted, and reachable
|
---|
76 | intermediate states are noted by a non-zero count in a saved-results array.
|
---|
77 | Second, the intermediate states that lead to (root + 1) bit or longer codes
|
---|
78 | are used to look at all sub-codes from those junctures for their inflate
|
---|
79 | memory usage. (The amount of memory used is not affected by the number of
|
---|
80 | codes of root bits or less in length.) Third, the visited states in the
|
---|
81 | construction of those sub-codes and the associated calculation of the table
|
---|
82 | size is recalled in order to avoid recalculating from the same juncture.
|
---|
83 | Beginning the code examination at (root + 1) bit codes, which is enabled by
|
---|
84 | identifying the reachable nodes, accounts for about six of the orders of
|
---|
85 | magnitude of improvement for the default arguments. About another four
|
---|
86 | orders of magnitude come from not revisiting previous states. Out of
|
---|
87 | approximately 2x10^16 possible prefix codes, only about 2x10^6 sub-codes
|
---|
88 | need to be examined to cover all of the possible table memory usage cases
|
---|
89 | for the default arguments of 286 symbols limited to 15-bit codes.
|
---|
90 |
|
---|
91 | Note that the uintmax_t type is used for counting. It is quite easy to
|
---|
92 | exceed the capacity of an eight-byte integer with a large number of symbols
|
---|
93 | and a large maximum code length, so multiple-precision arithmetic would need
|
---|
94 | to replace the integer arithmetic in that case. This program will abort if
|
---|
95 | an overflow occurs. The big_t type identifies where the counting takes
|
---|
96 | place.
|
---|
97 |
|
---|
98 | The uintmax_t type is also used for calculating the number of possible codes
|
---|
99 | remaining at the maximum length. This limits the maximum code length to the
|
---|
100 | number of bits in a long long minus the number of bits needed to represent
|
---|
101 | the symbols in a flat code. The code_t type identifies where the bit-pattern
|
---|
102 | counting takes place.
|
---|
103 | */
|
---|
104 |
|
---|
105 | #include <stdio.h>
|
---|
106 | #include <stdlib.h>
|
---|
107 | #include <string.h>
|
---|
108 | #include <stdarg.h>
|
---|
109 | #include <stdint.h>
|
---|
110 | #include <assert.h>
|
---|
111 |
|
---|
112 | #define local static
|
---|
113 |
|
---|
114 | // Special data types.
|
---|
115 | typedef uintmax_t big_t; // type for code counting
|
---|
116 | #define PRIbig "ju" // printf format for big_t
|
---|
117 | typedef uintmax_t code_t; // type for bit pattern counting
|
---|
118 | struct tab { // type for been-here check
|
---|
119 | size_t len; // allocated length of bit vector in octets
|
---|
120 | char *vec; // allocated bit vector
|
---|
121 | };
|
---|
122 |
|
---|
123 | /* The array for saving results, num[], is indexed with this triplet:
|
---|
124 |
|
---|
125 | syms: number of symbols remaining to code
|
---|
126 | left: number of available bit patterns at length len
|
---|
127 | len: number of bits in the codes currently being assigned
|
---|
128 |
|
---|
129 | Those indices are constrained thusly when saving results:
|
---|
130 |
|
---|
131 | syms: 3..totsym (totsym == total symbols to code)
|
---|
132 | left: 2..syms - 1, but only the evens (so syms == 8 -> 2, 4, 6)
|
---|
133 | len: 1..max - 1 (max == maximum code length in bits)
|
---|
134 |
|
---|
135 | syms == 2 is not saved since that immediately leads to a single code. left
|
---|
136 | must be even, since it represents the number of available bit patterns at
|
---|
137 | the current length, which is double the number at the previous length. left
|
---|
138 | ends at syms-1 since left == syms immediately results in a single code.
|
---|
139 | (left > sym is not allowed since that would result in an incomplete code.)
|
---|
140 | len is less than max, since the code completes immediately when len == max.
|
---|
141 |
|
---|
142 | The offset into the array is calculated for the three indices with the first
|
---|
143 | one (syms) being outermost, and the last one (len) being innermost. We build
|
---|
144 | the array with length max-1 lists for the len index, with syms-3 of those
|
---|
145 | for each symbol. There are totsym-2 of those, with each one varying in
|
---|
146 | length as a function of sym. See the calculation of index in map() for the
|
---|
147 | index, and the calculation of size in main() for the size of the array.
|
---|
148 |
|
---|
149 | For the deflate example of 286 symbols limited to 15-bit codes, the array
|
---|
150 | has 284,284 entries, taking up 2.17 MB for an 8-byte big_t. More than half
|
---|
151 | of the space allocated for saved results is actually used -- not all
|
---|
152 | possible triplets are reached in the generation of valid prefix codes.
|
---|
153 | */
|
---|
154 |
|
---|
155 | /* The array for tracking visited states, done[], is itself indexed identically
|
---|
156 | to the num[] array as described above for the (syms, left, len) triplet.
|
---|
157 | Each element in the array is further indexed by the (mem, rem) doublet,
|
---|
158 | where mem is the amount of inflate table space used so far, and rem is the
|
---|
159 | remaining unused entries in the current inflate sub-table. Each indexed
|
---|
160 | element is simply one bit indicating whether the state has been visited or
|
---|
161 | not. Since the ranges for mem and rem are not known a priori, each bit
|
---|
162 | vector is of a variable size, and grows as needed to accommodate the visited
|
---|
163 | states. mem and rem are used to calculate a single index in a triangular
|
---|
164 | array. Since the range of mem is expected in the default case to be about
|
---|
165 | ten times larger than the range of rem, the array is skewed to reduce the
|
---|
166 | memory usage, with eight times the range for mem than for rem. See the
|
---|
167 | calculations for offset and bit in been_here() for the details.
|
---|
168 |
|
---|
169 | For the deflate example of 286 symbols limited to 15-bit codes, the bit
|
---|
170 | vectors grow to total 5.5 MB, in addition to the 4.3 MB done array itself.
|
---|
171 | */
|
---|
172 |
|
---|
173 | // Type for a variable-length, allocated string.
|
---|
174 | typedef struct {
|
---|
175 | char *str; // pointer to allocated string
|
---|
176 | size_t size; // size of allocation
|
---|
177 | size_t len; // length of string, not including terminating zero
|
---|
178 | } string_t;
|
---|
179 |
|
---|
180 | // Clear a string_t.
|
---|
181 | local void string_clear(string_t *s) {
|
---|
182 | s->str[0] = 0;
|
---|
183 | s->len = 0;
|
---|
184 | }
|
---|
185 |
|
---|
186 | // Initialize a string_t.
|
---|
187 | local void string_init(string_t *s) {
|
---|
188 | s->size = 16;
|
---|
189 | s->str = malloc(s->size);
|
---|
190 | assert(s->str != NULL && "out of memory");
|
---|
191 | string_clear(s);
|
---|
192 | }
|
---|
193 |
|
---|
194 | // Release the allocation of a string_t.
|
---|
195 | local void string_free(string_t *s) {
|
---|
196 | free(s->str);
|
---|
197 | s->str = NULL;
|
---|
198 | s->size = 0;
|
---|
199 | s->len = 0;
|
---|
200 | }
|
---|
201 |
|
---|
202 | // Save the results of printf with fmt and the subsequent argument list to s.
|
---|
203 | // Each call appends to s. The allocated space for s is increased as needed.
|
---|
204 | local void string_printf(string_t *s, char *fmt, ...) {
|
---|
205 | va_list ap;
|
---|
206 | va_start(ap, fmt);
|
---|
207 | size_t len = s->len;
|
---|
208 | int ret = vsnprintf(s->str + len, s->size - len, fmt, ap);
|
---|
209 | assert(ret >= 0 && "out of memory");
|
---|
210 | s->len += ret;
|
---|
211 | if (s->size < s->len + 1) {
|
---|
212 | do {
|
---|
213 | s->size <<= 1;
|
---|
214 | assert(s->size != 0 && "overflow");
|
---|
215 | } while (s->size < s->len + 1);
|
---|
216 | s->str = realloc(s->str, s->size);
|
---|
217 | assert(s->str != NULL && "out of memory");
|
---|
218 | vsnprintf(s->str + len, s->size - len, fmt, ap);
|
---|
219 | }
|
---|
220 | va_end(ap);
|
---|
221 | }
|
---|
222 |
|
---|
223 | // Globals to avoid propagating constants or constant pointers recursively.
|
---|
224 | struct {
|
---|
225 | int max; // maximum allowed bit length for the codes
|
---|
226 | int root; // size of base code table in bits
|
---|
227 | int large; // largest code table so far
|
---|
228 | size_t size; // number of elements in num and done
|
---|
229 | big_t tot; // total number of codes with maximum tables size
|
---|
230 | string_t out; // display of subcodes for maximum tables size
|
---|
231 | int *code; // number of symbols assigned to each bit length
|
---|
232 | big_t *num; // saved results array for code counting
|
---|
233 | struct tab *done; // states already evaluated array
|
---|
234 | } g;
|
---|
235 |
|
---|
236 | // Index function for num[] and done[].
|
---|
237 | local inline size_t map(int syms, int left, int len) {
|
---|
238 | return ((size_t)((syms - 1) >> 1) * ((syms - 2) >> 1) +
|
---|
239 | (left >> 1) - 1) * (g.max - 1) +
|
---|
240 | len - 1;
|
---|
241 | }
|
---|
242 |
|
---|
243 | // Free allocated space in globals.
|
---|
244 | local void cleanup(void) {
|
---|
245 | if (g.done != NULL) {
|
---|
246 | for (size_t n = 0; n < g.size; n++)
|
---|
247 | if (g.done[n].len)
|
---|
248 | free(g.done[n].vec);
|
---|
249 | g.size = 0;
|
---|
250 | free(g.done); g.done = NULL;
|
---|
251 | }
|
---|
252 | free(g.num); g.num = NULL;
|
---|
253 | free(g.code); g.code = NULL;
|
---|
254 | string_free(&g.out);
|
---|
255 | }
|
---|
256 |
|
---|
257 | // Return the number of possible prefix codes using bit patterns of lengths len
|
---|
258 | // through max inclusive, coding syms symbols, with left bit patterns of length
|
---|
259 | // len unused -- return -1 if there is an overflow in the counting. Keep a
|
---|
260 | // record of previous results in num to prevent repeating the same calculation.
|
---|
261 | local big_t count(int syms, int left, int len) {
|
---|
262 | // see if only one possible code
|
---|
263 | if (syms == left)
|
---|
264 | return 1;
|
---|
265 |
|
---|
266 | // note and verify the expected state
|
---|
267 | assert(syms > left && left > 0 && len < g.max);
|
---|
268 |
|
---|
269 | // see if we've done this one already
|
---|
270 | size_t index = map(syms, left, len);
|
---|
271 | big_t got = g.num[index];
|
---|
272 | if (got)
|
---|
273 | return got; // we have -- return the saved result
|
---|
274 |
|
---|
275 | // we need to use at least this many bit patterns so that the code won't be
|
---|
276 | // incomplete at the next length (more bit patterns than symbols)
|
---|
277 | int least = (left << 1) - syms;
|
---|
278 | if (least < 0)
|
---|
279 | least = 0;
|
---|
280 |
|
---|
281 | // we can use at most this many bit patterns, lest there not be enough
|
---|
282 | // available for the remaining symbols at the maximum length (if there were
|
---|
283 | // no limit to the code length, this would become: most = left - 1)
|
---|
284 | int most = (((code_t)left << (g.max - len)) - syms) /
|
---|
285 | (((code_t)1 << (g.max - len)) - 1);
|
---|
286 |
|
---|
287 | // count all possible codes from this juncture and add them up
|
---|
288 | big_t sum = 0;
|
---|
289 | for (int use = least; use <= most; use++) {
|
---|
290 | got = count(syms - use, (left - use) << 1, len + 1);
|
---|
291 | sum += got;
|
---|
292 | if (got == (big_t)-1 || sum < got) // overflow
|
---|
293 | return (big_t)-1;
|
---|
294 | }
|
---|
295 |
|
---|
296 | // verify that all recursive calls are productive
|
---|
297 | assert(sum != 0);
|
---|
298 |
|
---|
299 | // save the result and return it
|
---|
300 | g.num[index] = sum;
|
---|
301 | return sum;
|
---|
302 | }
|
---|
303 |
|
---|
304 | // Return true if we've been here before, set to true if not. Set a bit in a
|
---|
305 | // bit vector to indicate visiting this state. Each (syms,len,left) state has a
|
---|
306 | // variable size bit vector indexed by (mem,rem). The bit vector is lengthened
|
---|
307 | // as needed to allow setting the (mem,rem) bit.
|
---|
308 | local int been_here(int syms, int left, int len, int mem, int rem) {
|
---|
309 | // point to vector for (syms,left,len), bit in vector for (mem,rem)
|
---|
310 | size_t index = map(syms, left, len);
|
---|
311 | mem -= 1 << g.root; // mem always includes the root table
|
---|
312 | mem >>= 1; // mem and rem are always even
|
---|
313 | rem >>= 1;
|
---|
314 | size_t offset = (mem >> 3) + rem;
|
---|
315 | offset = ((offset * (offset + 1)) >> 1) + rem;
|
---|
316 | int bit = 1 << (mem & 7);
|
---|
317 |
|
---|
318 | // see if we've been here
|
---|
319 | size_t length = g.done[index].len;
|
---|
320 | if (offset < length && (g.done[index].vec[offset] & bit) != 0)
|
---|
321 | return 1; // done this!
|
---|
322 |
|
---|
323 | // we haven't been here before -- set the bit to show we have now
|
---|
324 |
|
---|
325 | // see if we need to lengthen the vector in order to set the bit
|
---|
326 | if (length <= offset) {
|
---|
327 | // if we have one already, enlarge it, zero out the appended space
|
---|
328 | char *vector;
|
---|
329 | if (length) {
|
---|
330 | do {
|
---|
331 | length <<= 1;
|
---|
332 | } while (length <= offset);
|
---|
333 | vector = realloc(g.done[index].vec, length);
|
---|
334 | assert(vector != NULL && "out of memory");
|
---|
335 | memset(vector + g.done[index].len, 0, length - g.done[index].len);
|
---|
336 | }
|
---|
337 |
|
---|
338 | // otherwise we need to make a new vector and zero it out
|
---|
339 | else {
|
---|
340 | length = 16;
|
---|
341 | while (length <= offset)
|
---|
342 | length <<= 1;
|
---|
343 | vector = calloc(length, 1);
|
---|
344 | assert(vector != NULL && "out of memory");
|
---|
345 | }
|
---|
346 |
|
---|
347 | // install the new vector
|
---|
348 | g.done[index].len = length;
|
---|
349 | g.done[index].vec = vector;
|
---|
350 | }
|
---|
351 |
|
---|
352 | // set the bit
|
---|
353 | g.done[index].vec[offset] |= bit;
|
---|
354 | return 0;
|
---|
355 | }
|
---|
356 |
|
---|
357 | // Examine all possible codes from the given node (syms, len, left). Compute
|
---|
358 | // the amount of memory required to build inflate's decoding tables, where the
|
---|
359 | // number of code structures used so far is mem, and the number remaining in
|
---|
360 | // the current sub-table is rem.
|
---|
361 | local void examine(int syms, int left, int len, int mem, int rem) {
|
---|
362 | // see if we have a complete code
|
---|
363 | if (syms == left) {
|
---|
364 | // set the last code entry
|
---|
365 | g.code[len] = left;
|
---|
366 |
|
---|
367 | // complete computation of memory used by this code
|
---|
368 | while (rem < left) {
|
---|
369 | left -= rem;
|
---|
370 | rem = 1 << (len - g.root);
|
---|
371 | mem += rem;
|
---|
372 | }
|
---|
373 | assert(rem == left);
|
---|
374 |
|
---|
375 | // if this is at the maximum, show the sub-code
|
---|
376 | if (mem >= g.large) {
|
---|
377 | // if this is a new maximum, update the maximum and clear out the
|
---|
378 | // printed sub-codes from the previous maximum
|
---|
379 | if (mem > g.large) {
|
---|
380 | g.large = mem;
|
---|
381 | string_clear(&g.out);
|
---|
382 | }
|
---|
383 |
|
---|
384 | // compute the starting state for this sub-code
|
---|
385 | syms = 0;
|
---|
386 | left = 1 << g.max;
|
---|
387 | for (int bits = g.max; bits > g.root; bits--) {
|
---|
388 | syms += g.code[bits];
|
---|
389 | left -= g.code[bits];
|
---|
390 | assert((left & 1) == 0);
|
---|
391 | left >>= 1;
|
---|
392 | }
|
---|
393 |
|
---|
394 | // print the starting state and the resulting sub-code to g.out
|
---|
395 | string_printf(&g.out, "<%u, %u, %u>:",
|
---|
396 | syms, g.root + 1, ((1 << g.root) - left) << 1);
|
---|
397 | for (int bits = g.root + 1; bits <= g.max; bits++)
|
---|
398 | if (g.code[bits])
|
---|
399 | string_printf(&g.out, " %d[%d]", g.code[bits], bits);
|
---|
400 | string_printf(&g.out, "\n");
|
---|
401 | }
|
---|
402 |
|
---|
403 | // remove entries as we drop back down in the recursion
|
---|
404 | g.code[len] = 0;
|
---|
405 | return;
|
---|
406 | }
|
---|
407 |
|
---|
408 | // prune the tree if we can
|
---|
409 | if (been_here(syms, left, len, mem, rem))
|
---|
410 | return;
|
---|
411 |
|
---|
412 | // we need to use at least this many bit patterns so that the code won't be
|
---|
413 | // incomplete at the next length (more bit patterns than symbols)
|
---|
414 | int least = (left << 1) - syms;
|
---|
415 | if (least < 0)
|
---|
416 | least = 0;
|
---|
417 |
|
---|
418 | // we can use at most this many bit patterns, lest there not be enough
|
---|
419 | // available for the remaining symbols at the maximum length (if there were
|
---|
420 | // no limit to the code length, this would become: most = left - 1)
|
---|
421 | int most = (((code_t)left << (g.max - len)) - syms) /
|
---|
422 | (((code_t)1 << (g.max - len)) - 1);
|
---|
423 |
|
---|
424 | // occupy least table spaces, creating new sub-tables as needed
|
---|
425 | int use = least;
|
---|
426 | while (rem < use) {
|
---|
427 | use -= rem;
|
---|
428 | rem = 1 << (len - g.root);
|
---|
429 | mem += rem;
|
---|
430 | }
|
---|
431 | rem -= use;
|
---|
432 |
|
---|
433 | // examine codes from here, updating table space as we go
|
---|
434 | for (use = least; use <= most; use++) {
|
---|
435 | g.code[len] = use;
|
---|
436 | examine(syms - use, (left - use) << 1, len + 1,
|
---|
437 | mem + (rem ? 1 << (len - g.root) : 0), rem << 1);
|
---|
438 | if (rem == 0) {
|
---|
439 | rem = 1 << (len - g.root);
|
---|
440 | mem += rem;
|
---|
441 | }
|
---|
442 | rem--;
|
---|
443 | }
|
---|
444 |
|
---|
445 | // remove entries as we drop back down in the recursion
|
---|
446 | g.code[len] = 0;
|
---|
447 | }
|
---|
448 |
|
---|
449 | // Look at all sub-codes starting with root + 1 bits. Look at only the valid
|
---|
450 | // intermediate code states (syms, left, len). For each completed code,
|
---|
451 | // calculate the amount of memory required by inflate to build the decoding
|
---|
452 | // tables. Find the maximum amount of memory required and show the codes that
|
---|
453 | // require that maximum.
|
---|
454 | local void enough(int syms) {
|
---|
455 | // clear code
|
---|
456 | for (int n = 0; n <= g.max; n++)
|
---|
457 | g.code[n] = 0;
|
---|
458 |
|
---|
459 | // look at all (root + 1) bit and longer codes
|
---|
460 | string_clear(&g.out); // empty saved results
|
---|
461 | g.large = 1 << g.root; // base table
|
---|
462 | if (g.root < g.max) // otherwise, there's only a base table
|
---|
463 | for (int n = 3; n <= syms; n++)
|
---|
464 | for (int left = 2; left < n; left += 2) {
|
---|
465 | // look at all reachable (root + 1) bit nodes, and the
|
---|
466 | // resulting codes (complete at root + 2 or more)
|
---|
467 | size_t index = map(n, left, g.root + 1);
|
---|
468 | if (g.root + 1 < g.max && g.num[index]) // reachable node
|
---|
469 | examine(n, left, g.root + 1, 1 << g.root, 0);
|
---|
470 |
|
---|
471 | // also look at root bit codes with completions at root + 1
|
---|
472 | // bits (not saved in num, since complete), just in case
|
---|
473 | if (g.num[index - 1] && n <= left << 1)
|
---|
474 | examine((n - left) << 1, (n - left) << 1, g.root + 1,
|
---|
475 | 1 << g.root, 0);
|
---|
476 | }
|
---|
477 |
|
---|
478 | // done
|
---|
479 | printf("maximum of %d table entries for root = %d\n", g.large, g.root);
|
---|
480 | fputs(g.out.str, stdout);
|
---|
481 | }
|
---|
482 |
|
---|
483 | // Examine and show the total number of possible prefix codes for a given
|
---|
484 | // maximum number of symbols, initial root table size, and maximum code length
|
---|
485 | // in bits -- those are the command arguments in that order. The default values
|
---|
486 | // are 286, 9, and 15 respectively, for the deflate literal/length code. The
|
---|
487 | // possible codes are counted for each number of coded symbols from two to the
|
---|
488 | // maximum. The counts for each of those and the total number of codes are
|
---|
489 | // shown. The maximum number of inflate table entires is then calculated across
|
---|
490 | // all possible codes. Each new maximum number of table entries and the
|
---|
491 | // associated sub-code (starting at root + 1 == 10 bits) is shown.
|
---|
492 | //
|
---|
493 | // To count and examine prefix codes that are not length-limited, provide a
|
---|
494 | // maximum length equal to the number of symbols minus one.
|
---|
495 | //
|
---|
496 | // For the deflate literal/length code, use "enough". For the deflate distance
|
---|
497 | // code, use "enough 30 6".
|
---|
498 | int main(int argc, char **argv) {
|
---|
499 | // set up globals for cleanup()
|
---|
500 | g.code = NULL;
|
---|
501 | g.num = NULL;
|
---|
502 | g.done = NULL;
|
---|
503 | string_init(&g.out);
|
---|
504 |
|
---|
505 | // get arguments -- default to the deflate literal/length code
|
---|
506 | int syms = 286;
|
---|
507 | g.root = 9;
|
---|
508 | g.max = 15;
|
---|
509 | if (argc > 1) {
|
---|
510 | syms = atoi(argv[1]);
|
---|
511 | if (argc > 2) {
|
---|
512 | g.root = atoi(argv[2]);
|
---|
513 | if (argc > 3)
|
---|
514 | g.max = atoi(argv[3]);
|
---|
515 | }
|
---|
516 | }
|
---|
517 | if (argc > 4 || syms < 2 || g.root < 1 || g.max < 1) {
|
---|
518 | fputs("invalid arguments, need: [sym >= 2 [root >= 1 [max >= 1]]]\n",
|
---|
519 | stderr);
|
---|
520 | return 1;
|
---|
521 | }
|
---|
522 |
|
---|
523 | // if not restricting the code length, the longest is syms - 1
|
---|
524 | if (g.max > syms - 1)
|
---|
525 | g.max = syms - 1;
|
---|
526 |
|
---|
527 | // determine the number of bits in a code_t
|
---|
528 | int bits = 0;
|
---|
529 | for (code_t word = 1; word; word <<= 1)
|
---|
530 | bits++;
|
---|
531 |
|
---|
532 | // make sure that the calculation of most will not overflow
|
---|
533 | if (g.max > bits || (code_t)(syms - 2) >= ((code_t)-1 >> (g.max - 1))) {
|
---|
534 | fputs("abort: code length too long for internal types\n", stderr);
|
---|
535 | return 1;
|
---|
536 | }
|
---|
537 |
|
---|
538 | // reject impossible code requests
|
---|
539 | if ((code_t)(syms - 1) > ((code_t)1 << g.max) - 1) {
|
---|
540 | fprintf(stderr, "%d symbols cannot be coded in %d bits\n",
|
---|
541 | syms, g.max);
|
---|
542 | return 1;
|
---|
543 | }
|
---|
544 |
|
---|
545 | // allocate code vector
|
---|
546 | g.code = calloc(g.max + 1, sizeof(int));
|
---|
547 | assert(g.code != NULL && "out of memory");
|
---|
548 |
|
---|
549 | // determine size of saved results array, checking for overflows,
|
---|
550 | // allocate and clear the array (set all to zero with calloc())
|
---|
551 | if (syms == 2) // iff max == 1
|
---|
552 | g.num = NULL; // won't be saving any results
|
---|
553 | else {
|
---|
554 | g.size = syms >> 1;
|
---|
555 | int n = (syms - 1) >> 1;
|
---|
556 | assert(g.size <= (size_t)-1 / n && "overflow");
|
---|
557 | g.size *= n;
|
---|
558 | n = g.max - 1;
|
---|
559 | assert(g.size <= (size_t)-1 / n && "overflow");
|
---|
560 | g.size *= n;
|
---|
561 | g.num = calloc(g.size, sizeof(big_t));
|
---|
562 | assert(g.num != NULL && "out of memory");
|
---|
563 | }
|
---|
564 |
|
---|
565 | // count possible codes for all numbers of symbols, add up counts
|
---|
566 | big_t sum = 0;
|
---|
567 | for (int n = 2; n <= syms; n++) {
|
---|
568 | big_t got = count(n, 2, 1);
|
---|
569 | sum += got;
|
---|
570 | assert(got != (big_t)-1 && sum >= got && "overflow");
|
---|
571 | }
|
---|
572 | printf("%"PRIbig" total codes for 2 to %d symbols", sum, syms);
|
---|
573 | if (g.max < syms - 1)
|
---|
574 | printf(" (%d-bit length limit)\n", g.max);
|
---|
575 | else
|
---|
576 | puts(" (no length limit)");
|
---|
577 |
|
---|
578 | // allocate and clear done array for been_here()
|
---|
579 | if (syms == 2)
|
---|
580 | g.done = NULL;
|
---|
581 | else {
|
---|
582 | g.done = calloc(g.size, sizeof(struct tab));
|
---|
583 | assert(g.done != NULL && "out of memory");
|
---|
584 | }
|
---|
585 |
|
---|
586 | // find and show maximum inflate table usage
|
---|
587 | if (g.root > g.max) // reduce root to max length
|
---|
588 | g.root = g.max;
|
---|
589 | if ((code_t)syms < ((code_t)1 << (g.root + 1)))
|
---|
590 | enough(syms);
|
---|
591 | else
|
---|
592 | fputs("cannot handle minimum code lengths > root", stderr);
|
---|
593 |
|
---|
594 | // done
|
---|
595 | cleanup();
|
---|
596 | return 0;
|
---|
597 | }
|
---|