///////////////////////////////////////////////////////////////////////////////
//
/// \file       crc64.c
/// \brief      CRC64 calculation
///
/// There are two methods in this file. crc64_generic uses the
/// slice-by-four algorithm. This is the same idea that is
/// used in crc32_fast.c, but for CRC64 we use only four tables
/// instead of eight to avoid increasing CPU cache usage.
///
/// crc64_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions.
/// It was derived from
/// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
/// and the public domain code from https://github.com/rawrunprotected/crc
/// (URLs were checked on 2022-11-07).
///
/// FIXME: Builds for 32-bit x86 use crc64_x86.S by default instead
/// of this file and thus CLMUL version isn't available on 32-bit x86
/// unless configured with --disable-assembler. Even then the lookup table
/// isn't omitted in crc64_table.c since it doesn't know that assembly
/// code has been disabled.
//
//  Authors:    Lasse Collin
//              Ilya Kurdyukov
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
30 |
|
---|
31 | #include "check.h"
|
---|
32 |
|
---|
33 | #undef CRC_GENERIC
|
---|
34 | #undef CRC_CLMUL
|
---|
35 | #undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
---|
36 |
|
---|
37 | // If CLMUL cannot be used then only the generic slice-by-four is built.
|
---|
38 | #if !defined(HAVE_USABLE_CLMUL)
|
---|
39 | # define CRC_GENERIC 1
|
---|
40 |
|
---|
41 | // If CLMUL is allowed unconditionally in the compiler options then the
|
---|
42 | // generic version can be omitted. Note that this doesn't work with MSVC
|
---|
43 | // as I don't know how to detect the features here.
|
---|
44 | //
|
---|
45 | // NOTE: Keep this this in sync with crc64_table.c.
|
---|
46 | #elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
|
---|
47 | || (defined(__e2k__) && __iset__ >= 6)
|
---|
48 | # define CRC_CLMUL 1
|
---|
49 |
|
---|
50 | // Otherwise build both and detect at runtime which version to use.
|
---|
51 | #else
|
---|
52 | # define CRC_GENERIC 1
|
---|
53 | # define CRC_CLMUL 1
|
---|
54 |
|
---|
55 | /*
|
---|
56 | // The generic code is much faster with 1-8-byte inputs and has
|
---|
57 | // similar performance up to 16 bytes at least in microbenchmarks
|
---|
58 | // (it depends on input buffer alignment too). If both versions are
|
---|
59 | // built, this #define will use the generic version for inputs up to
|
---|
60 | // 16 bytes and CLMUL for bigger inputs. It saves a little in code
|
---|
61 | // size since the special cases for 0-16-byte inputs will be omitted
|
---|
62 | // from the CLMUL code.
|
---|
63 | # define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
|
---|
64 | */
|
---|
65 |
|
---|
66 | # if defined(_MSC_VER)
|
---|
67 | # include <intrin.h>
|
---|
68 | # elif defined(HAVE_CPUID_H)
|
---|
69 | # include <cpuid.h>
|
---|
70 | # endif
|
---|
71 | #endif
|
---|
72 |
|
---|
73 |
|
---|
74 | /////////////////////////////////
|
---|
75 | // Generic slice-by-four CRC64 //
|
---|
76 | /////////////////////////////////
|
---|
77 |
|
---|
78 | #ifdef CRC_GENERIC
|
---|
79 |
|
---|
80 | #include "crc_macros.h"
|
---|
81 |
|
---|
82 |
|
---|
83 | #ifdef WORDS_BIGENDIAN
|
---|
84 | # define A1(x) ((x) >> 56)
|
---|
85 | #else
|
---|
86 | # define A1 A
|
---|
87 | #endif
|
---|
88 |
|
---|
89 |
|
---|
90 | // See the comments in crc32_fast.c. They aren't duplicated here.
|
---|
91 | static uint64_t
|
---|
92 | crc64_generic(const uint8_t *buf, size_t size, uint64_t crc)
|
---|
93 | {
|
---|
94 | crc = ~crc;
|
---|
95 |
|
---|
96 | #ifdef WORDS_BIGENDIAN
|
---|
97 | crc = bswap64(crc);
|
---|
98 | #endif
|
---|
99 |
|
---|
100 | if (size > 4) {
|
---|
101 | while ((uintptr_t)(buf) & 3) {
|
---|
102 | crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
|
---|
103 | --size;
|
---|
104 | }
|
---|
105 |
|
---|
106 | const uint8_t *const limit = buf + (size & ~(size_t)(3));
|
---|
107 | size &= (size_t)(3);
|
---|
108 |
|
---|
109 | while (buf < limit) {
|
---|
110 | #ifdef WORDS_BIGENDIAN
|
---|
111 | const uint32_t tmp = (uint32_t)(crc >> 32)
|
---|
112 | ^ aligned_read32ne(buf);
|
---|
113 | #else
|
---|
114 | const uint32_t tmp = (uint32_t)crc
|
---|
115 | ^ aligned_read32ne(buf);
|
---|
116 | #endif
|
---|
117 | buf += 4;
|
---|
118 |
|
---|
119 | crc = lzma_crc64_table[3][A(tmp)]
|
---|
120 | ^ lzma_crc64_table[2][B(tmp)]
|
---|
121 | ^ S32(crc)
|
---|
122 | ^ lzma_crc64_table[1][C(tmp)]
|
---|
123 | ^ lzma_crc64_table[0][D(tmp)];
|
---|
124 | }
|
---|
125 | }
|
---|
126 |
|
---|
127 | while (size-- != 0)
|
---|
128 | crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
|
---|
129 |
|
---|
130 | #ifdef WORDS_BIGENDIAN
|
---|
131 | crc = bswap64(crc);
|
---|
132 | #endif
|
---|
133 |
|
---|
134 | return ~crc;
|
---|
135 | }
|
---|
136 | #endif
|
---|
137 |
|
---|
138 |
|
---|
139 | /////////////////////
|
---|
140 | // x86 CLMUL CRC64 //
|
---|
141 | /////////////////////
|
---|
142 |
|
---|
143 | #ifdef CRC_CLMUL
|
---|
144 |
|
---|
145 | #include <immintrin.h>
|
---|
146 |
|
---|
147 |
|
---|
148 | /*
|
---|
149 | // These functions were used to generate the constants
|
---|
150 | // at the top of crc64_clmul().
|
---|
151 | static uint64_t
|
---|
152 | calc_lo(uint64_t poly)
|
---|
153 | {
|
---|
154 | uint64_t a = poly;
|
---|
155 | uint64_t b = 0;
|
---|
156 |
|
---|
157 | for (unsigned i = 0; i < 64; ++i) {
|
---|
158 | b = (b >> 1) | (a << 63);
|
---|
159 | a = (a >> 1) ^ (a & 1 ? poly : 0);
|
---|
160 | }
|
---|
161 |
|
---|
162 | return b;
|
---|
163 | }
|
---|
164 |
|
---|
165 | static uint64_t
|
---|
166 | calc_hi(uint64_t poly, uint64_t a)
|
---|
167 | {
|
---|
168 | for (unsigned i = 0; i < 64; ++i)
|
---|
169 | a = (a >> 1) ^ (a & 1 ? poly : 0);
|
---|
170 |
|
---|
171 | return a;
|
---|
172 | }
|
---|
173 | */
|
---|
174 |
|
---|
175 |
|
---|
176 | #define MASK_L(in, mask, r) \
|
---|
177 | r = _mm_shuffle_epi8(in, mask)
|
---|
178 |
|
---|
179 | #define MASK_H(in, mask, r) \
|
---|
180 | r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign))
|
---|
181 |
|
---|
182 | #define MASK_LH(in, mask, low, high) \
|
---|
183 | MASK_L(in, mask, low); \
|
---|
184 | MASK_H(in, mask, high)
|
---|
185 |
|
---|
186 |
|
---|
187 | // MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC
|
---|
188 | // code when optimizations are enabled (release build). According to the bug
|
---|
189 | // report, the ebx register is corrupted and the calculated result is wrong.
|
---|
190 | // Trying to workaround the problem with "__asm mov ebx, ebx" didn't help.
|
---|
191 | // The following pragma works and performance is still good. x86-64 builds
|
---|
192 | // aren't affected by this problem.
|
---|
193 | //
|
---|
194 | // NOTE: Another pragma after the function restores the optimizations.
|
---|
195 | // If the #if condition here is updated, the other one must be updated too.
|
---|
196 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
|
---|
197 | && defined(_M_IX86)
|
---|
198 | # pragma optimize("g", off)
|
---|
199 | #endif
|
---|
200 |
|
---|
201 | // EDG-based compilers (Intel's classic compiler and compiler for E2K) can
|
---|
202 | // define __GNUC__ but the attribute must not be used with them.
|
---|
203 | // The new Clang-based ICX needs the attribute.
|
---|
204 | //
|
---|
205 | // NOTE: Build systems check for this too, keep them in sync with this.
|
---|
206 | #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
|
---|
207 | __attribute__((__target__("ssse3,sse4.1,pclmul")))
|
---|
208 | #endif
|
---|
209 | static uint64_t
|
---|
210 | crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
|
---|
211 | {
|
---|
212 | // The prototypes of the intrinsics use signed types while most of
|
---|
213 | // the values are treated as unsigned here. These warnings in this
|
---|
214 | // function have been checked and found to be harmless so silence them.
|
---|
215 | #if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
|
---|
216 | # pragma GCC diagnostic push
|
---|
217 | # pragma GCC diagnostic ignored "-Wsign-conversion"
|
---|
218 | # pragma GCC diagnostic ignored "-Wconversion"
|
---|
219 | #endif
|
---|
220 |
|
---|
221 | #ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
---|
222 | // The code assumes that there is at least one byte of input.
|
---|
223 | if (size == 0)
|
---|
224 | return crc;
|
---|
225 | #endif
|
---|
226 |
|
---|
227 | // const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial
|
---|
228 | const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1
|
---|
229 | const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1
|
---|
230 | const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1)
|
---|
231 | const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2)
|
---|
232 | const __m128i vfold0 = _mm_set_epi64x(p, mu);
|
---|
233 | const __m128i vfold1 = _mm_set_epi64x(k2, k1);
|
---|
234 |
|
---|
235 | // Create a vector with 8-bit values 0 to 15. This is used to
|
---|
236 | // construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8.
|
---|
237 | const __m128i vramp = _mm_setr_epi32(
|
---|
238 | 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);
|
---|
239 |
|
---|
240 | // This is used to inverse the control mask of _mm_shuffle_epi8
|
---|
241 | // so that bytes that wouldn't be picked with the original mask
|
---|
242 | // will be picked and vice versa.
|
---|
243 | const __m128i vsign = _mm_set1_epi8(0x80);
|
---|
244 |
|
---|
245 | // Memory addresses A to D and the distances between them:
|
---|
246 | //
|
---|
247 | // A B C D
|
---|
248 | // [skip_start][size][skip_end]
|
---|
249 | // [ size2 ]
|
---|
250 | //
|
---|
251 | // A and D are 16-byte aligned. B and C are 1-byte aligned.
|
---|
252 | // skip_start and skip_end are 0-15 bytes. size is at least 1 byte.
|
---|
253 | //
|
---|
254 | // A = aligned_buf will initially point to this address.
|
---|
255 | // B = The address pointed by the caller-supplied buf.
|
---|
256 | // C = buf + size == aligned_buf + size2
|
---|
257 | // D = buf + size + skip_end == aligned_buf + size2 + skip_end
|
---|
258 | const size_t skip_start = (size_t)((uintptr_t)buf & 15);
|
---|
259 | const size_t skip_end = (size_t)(-(uintptr_t)(buf + size) & 15);
|
---|
260 | const __m128i *aligned_buf = (const __m128i *)(
|
---|
261 | (uintptr_t)buf & ~(uintptr_t)15);
|
---|
262 |
|
---|
263 | // If size2 <= 16 then the whole input fits into a single 16-byte
|
---|
264 | // vector. If size2 > 16 then at least two 16-byte vectors must
|
---|
265 | // be processed. If size2 > 16 && size <= 16 then there is only
|
---|
266 | // one 16-byte vector's worth of input but it is unaligned in memory.
|
---|
267 | //
|
---|
268 | // NOTE: There is no integer overflow here if the arguments are valid.
|
---|
269 | // If this overflowed, buf + size would too.
|
---|
270 | size_t size2 = skip_start + size;
|
---|
271 |
|
---|
272 | // Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:
|
---|
273 | // The first skip_start or skip_end bytes in the vectors will have
|
---|
274 | // the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8
|
---|
275 | // will produce zeros for these positions. (Bitwise-xor of these
|
---|
276 | // masks with vsign will produce the opposite behavior.)
|
---|
277 | const __m128i mask_start
|
---|
278 | = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start));
|
---|
279 | const __m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end));
|
---|
280 |
|
---|
281 | // Get the first 1-16 bytes into data0. If loading less than 16 bytes,
|
---|
282 | // the bytes are loaded to the high bits of the vector and the least
|
---|
283 | // significant positions are filled with zeros.
|
---|
284 | const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf),
|
---|
285 | _mm_setzero_si128(), mask_start);
|
---|
286 | ++aligned_buf;
|
---|
287 |
|
---|
288 | #if defined(__i386__) || defined(_M_IX86)
|
---|
289 | const __m128i initial_crc = _mm_set_epi64x(0, ~crc);
|
---|
290 | #else
|
---|
291 | // GCC and Clang would produce good code with _mm_set_epi64x
|
---|
292 | // but MSVC needs _mm_cvtsi64_si128 on x86-64.
|
---|
293 | const __m128i initial_crc = _mm_cvtsi64_si128(~crc);
|
---|
294 | #endif
|
---|
295 |
|
---|
296 | __m128i v0, v1, v2, v3;
|
---|
297 |
|
---|
298 | #ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
---|
299 | if (size <= 16) {
|
---|
300 | // Right-shift initial_crc by 1-16 bytes based on "size"
|
---|
301 | // and store the result in v1 (high bytes) and v0 (low bytes).
|
---|
302 | //
|
---|
303 | // NOTE: The highest 8 bytes of initial_crc are zeros so
|
---|
304 | // v1 will be filled with zeros if size >= 8. The highest 8
|
---|
305 | // bytes of v1 will always become zeros.
|
---|
306 | //
|
---|
307 | // [ v1 ][ v0 ]
|
---|
308 | // [ initial_crc ] size == 1
|
---|
309 | // [ initial_crc ] size == 2
|
---|
310 | // [ initial_crc ] size == 15
|
---|
311 | // [ initial_crc ] size == 16 (all in v0)
|
---|
312 | const __m128i mask_low = _mm_add_epi8(
|
---|
313 | vramp, _mm_set1_epi8(size - 16));
|
---|
314 | MASK_LH(initial_crc, mask_low, v0, v1);
|
---|
315 |
|
---|
316 | if (size2 <= 16) {
|
---|
317 | // There are 1-16 bytes of input and it is all
|
---|
318 | // in data0. Copy the input bytes to v3. If there
|
---|
319 | // are fewer than 16 bytes, the low bytes in v3
|
---|
320 | // will be filled with zeros. That is, the input
|
---|
321 | // bytes are stored to the same position as
|
---|
322 | // (part of) initial_crc is in v0.
|
---|
323 | MASK_L(data0, mask_end, v3);
|
---|
324 | } else {
|
---|
325 | // There are 2-16 bytes of input but not all bytes
|
---|
326 | // are in data0.
|
---|
327 | const __m128i data1 = _mm_load_si128(aligned_buf);
|
---|
328 |
|
---|
329 | // Collect the 2-16 input bytes from data0 and data1
|
---|
330 | // to v2 and v3, and bitwise-xor them with the
|
---|
331 | // low bits of initial_crc in v0. Note that the
|
---|
332 | // the second xor is below this else-block as it
|
---|
333 | // is shared with the other branch.
|
---|
334 | MASK_H(data0, mask_end, v2);
|
---|
335 | MASK_L(data1, mask_end, v3);
|
---|
336 | v0 = _mm_xor_si128(v0, v2);
|
---|
337 | }
|
---|
338 |
|
---|
339 | v0 = _mm_xor_si128(v0, v3);
|
---|
340 | v1 = _mm_alignr_epi8(v1, v0, 8);
|
---|
341 | } else
|
---|
342 | #endif
|
---|
343 | {
|
---|
344 | const __m128i data1 = _mm_load_si128(aligned_buf);
|
---|
345 | MASK_LH(initial_crc, mask_start, v0, v1);
|
---|
346 | v0 = _mm_xor_si128(v0, data0);
|
---|
347 | v1 = _mm_xor_si128(v1, data1);
|
---|
348 |
|
---|
349 | #define FOLD \
|
---|
350 | v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x00)); \
|
---|
351 | v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x11));
|
---|
352 |
|
---|
353 | while (size2 > 32) {
|
---|
354 | ++aligned_buf;
|
---|
355 | size2 -= 16;
|
---|
356 | FOLD
|
---|
357 | v1 = _mm_load_si128(aligned_buf);
|
---|
358 | }
|
---|
359 |
|
---|
360 | if (size2 < 32) {
|
---|
361 | MASK_H(v0, mask_end, v2);
|
---|
362 | MASK_L(v0, mask_end, v0);
|
---|
363 | MASK_L(v1, mask_end, v3);
|
---|
364 | v1 = _mm_or_si128(v2, v3);
|
---|
365 | }
|
---|
366 |
|
---|
367 | FOLD
|
---|
368 | v1 = _mm_srli_si128(v0, 8);
|
---|
369 | #undef FOLD
|
---|
370 | }
|
---|
371 |
|
---|
372 | v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold1, 0x10), v1);
|
---|
373 | v0 = _mm_clmulepi64_si128(v1, vfold0, 0x00);
|
---|
374 | v2 = _mm_clmulepi64_si128(v0, vfold0, 0x10);
|
---|
375 | v0 = _mm_xor_si128(_mm_xor_si128(v2, _mm_slli_si128(v0, 8)), v1);
|
---|
376 |
|
---|
377 | #if defined(__i386__) || defined(_M_IX86)
|
---|
378 | return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
|
---|
379 | (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
|
---|
380 | #else
|
---|
381 | return ~(uint64_t)_mm_extract_epi64(v0, 1);
|
---|
382 | #endif
|
---|
383 |
|
---|
384 | #if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
|
---|
385 | # pragma GCC diagnostic pop
|
---|
386 | #endif
|
---|
387 | }
|
---|
388 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
|
---|
389 | && defined(_M_IX86)
|
---|
390 | # pragma optimize("", on)
|
---|
391 | #endif
|
---|
392 | #endif
|
---|
393 |
|
---|
394 |
|
---|
395 | ////////////////////////
|
---|
396 | // Detect CPU support //
|
---|
397 | ////////////////////////
|
---|
398 |
|
---|
399 | #if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
---|
400 | static inline bool
|
---|
401 | is_clmul_supported(void)
|
---|
402 | {
|
---|
403 | int success = 1;
|
---|
404 | uint32_t r[4]; // eax, ebx, ecx, edx
|
---|
405 |
|
---|
406 | #if defined(_MSC_VER)
|
---|
407 | // This needs <intrin.h> with MSVC. ICC has it as a built-in
|
---|
408 | // on all platforms.
|
---|
409 | __cpuid(r, 1);
|
---|
410 | #elif defined(HAVE_CPUID_H)
|
---|
411 | // Compared to just using __asm__ to run CPUID, this also checks
|
---|
412 | // that CPUID is supported and saves and restores ebx as that is
|
---|
413 | // needed with GCC < 5 with position-independent code (PIC).
|
---|
414 | success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
|
---|
415 | #else
|
---|
416 | // Just a fallback that shouldn't be needed.
|
---|
417 | __asm__("cpuid\n\t"
|
---|
418 | : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
|
---|
419 | : "a"(1), "c"(0));
|
---|
420 | #endif
|
---|
421 |
|
---|
422 | // Returns true if these are supported:
|
---|
423 | // CLMUL (bit 1 in ecx)
|
---|
424 | // SSSE3 (bit 9 in ecx)
|
---|
425 | // SSE4.1 (bit 19 in ecx)
|
---|
426 | const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
|
---|
427 | return success && (r[2] & ecx_mask) == ecx_mask;
|
---|
428 |
|
---|
429 | // Alternative methods that weren't used:
|
---|
430 | // - ICC's _may_i_use_cpu_feature: the other methods should work too.
|
---|
431 | // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
|
---|
432 | //
|
---|
433 | // CPUID decding is needed with MSVC anyway and older GCC. This keeps
|
---|
434 | // the feature checks in the build system simpler too. The nice thing
|
---|
435 | // about __builtin_cpu_supports would be that it generates very short
|
---|
436 | // code as is it only reads a variable set at startup but a few bytes
|
---|
437 | // doesn't matter here.
|
---|
438 | }
|
---|
439 |
|
---|
440 |
|
---|
// With __attribute__((__constructor__)) available, crc64_set_func() gets
// the attribute and crc64_func needs no initializer. Without it,
// crc64_func initially points to crc64_dispatch(), which runs the
// detection on the first call.
#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
#	define CRC64_FUNC_INIT
#	define CRC64_SET_FUNC_ATTR __attribute__((__constructor__))
#else
#	define CRC64_FUNC_INIT = &crc64_dispatch
#	define CRC64_SET_FUNC_ATTR
static uint64_t crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc);
#endif


// Pointer to the selected CRC64 method.
static uint64_t (*crc64_func)(const uint8_t *buf, size_t size, uint64_t crc)
		CRC64_FUNC_INIT;
454 |
|
---|
455 |
|
---|
456 | CRC64_SET_FUNC_ATTR
|
---|
457 | static void
|
---|
458 | crc64_set_func(void)
|
---|
459 | {
|
---|
460 | crc64_func = is_clmul_supported() ? &crc64_clmul : &crc64_generic;
|
---|
461 | return;
|
---|
462 | }
|
---|
463 |
|
---|
464 |
|
---|
465 | #ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
|
---|
466 | static uint64_t
|
---|
467 | crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc)
|
---|
468 | {
|
---|
469 | // When __attribute__((__constructor__)) isn't supported, set the
|
---|
470 | // function pointer without any locking. If multiple threads run
|
---|
471 | // the detection code in parallel, they will all end up setting
|
---|
472 | // the pointer to the same value. This avoids the use of
|
---|
473 | // mythread_once() on every call to lzma_crc64() but this likely
|
---|
474 | // isn't strictly standards compliant. Let's change it if it breaks.
|
---|
475 | crc64_set_func();
|
---|
476 | return crc64_func(buf, size, crc);
|
---|
477 | }
|
---|
478 | #endif
|
---|
479 | #endif
|
---|
480 |
|
---|
481 |
|
---|
482 | extern LZMA_API(uint64_t)
|
---|
483 | lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
---|
484 | {
|
---|
485 | #if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
---|
486 | // If CLMUL is available, it is the best for non-tiny inputs,
|
---|
487 | // being over twice as fast as the generic slice-by-four version.
|
---|
488 | // However, for size <= 16 it's different. In the extreme case
|
---|
489 | // of size == 1 the generic version can be five times faster.
|
---|
490 | // At size >= 8 the CLMUL starts to become reasonable. It
|
---|
491 | // varies depending on the alignment of buf too.
|
---|
492 | //
|
---|
493 | // The above doesn't include the overhead of mythread_once().
|
---|
494 | // At least on x86-64 GNU/Linux, pthread_once() is very fast but
|
---|
495 | // it still makes lzma_crc64(buf, 1, crc) 50-100 % slower. When
|
---|
496 | // size reaches 12-16 bytes the overhead becomes negligible.
|
---|
497 | //
|
---|
498 | // So using the generic version for size <= 16 may give better
|
---|
499 | // performance with tiny inputs but if such inputs happen rarely
|
---|
500 | // it's not so obvious because then the lookup table of the
|
---|
501 | // generic version may not be in the processor cache.
|
---|
502 | #ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
---|
503 | if (size <= 16)
|
---|
504 | return crc64_generic(buf, size, crc);
|
---|
505 | #endif
|
---|
506 |
|
---|
507 | /*
|
---|
508 | #ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
|
---|
509 | // See crc64_dispatch(). This would be the alternative which uses
|
---|
510 | // locking and doesn't use crc64_dispatch(). Note that on Windows
|
---|
511 | // this method needs Vista threads.
|
---|
512 | mythread_once(crc64_set_func);
|
---|
513 | #endif
|
---|
514 | */
|
---|
515 |
|
---|
516 | return crc64_func(buf, size, crc);
|
---|
517 |
|
---|
518 | #elif defined(CRC_CLMUL)
|
---|
519 | // If CLMUL is used unconditionally without runtime CPU detection
|
---|
520 | // then omitting the generic version and its 8 KiB lookup table
|
---|
521 | // makes the library smaller.
|
---|
522 | //
|
---|
523 | // FIXME: Lookup table isn't currently omitted on 32-bit x86,
|
---|
524 | // see crc64_table.c.
|
---|
525 | return crc64_clmul(buf, size, crc);
|
---|
526 |
|
---|
527 | #else
|
---|
528 | return crc64_generic(buf, size, crc);
|
---|
529 | #endif
|
---|
530 | }
|
---|