/*
 * Speed-optimized CRC64 using slicing-by-four algorithm
 *
 * This uses only i386 instructions, but it is optimized for i686 and later
 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
 *
 * Authors: Igor Pavlov (original CRC32 assembly code)
 *          Lasse Collin (CRC64 adaptation of the modified CRC32 code)
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 *
 * Syntax: AT&T (GNU as), run through the C preprocessor.
 *
 * This code needs lzma_crc64_table, which can be created using the
 * following C code:

uint64_t lzma_crc64_table[4][256];

void
init_table(void)
{
	// ECMA-182
	static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);

	for (size_t s = 0; s < 4; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly64;
				else
					r >>= 1;
			}

			lzma_crc64_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC64 function:
 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 */

/* When Intel CET is enabled, include <cet.h> in assembly code to mark
   Intel CET support. Otherwise _CET_ENDBR expands to nothing. */
#ifdef __CET__
# include <cet.h>
#else
# define _CET_ENDBR
#endif

/*
 * On some systems, the functions need to be prefixed. The prefix is
 * usually an underscore.
 *
 * MAKE_SYM goes through MAKE_SYM_CAT so that __USER_LABEL_PREFIX__ is
 * macro-expanded before it is pasted onto the symbol name.
 */
#ifndef __USER_LABEL_PREFIX__
# define __USER_LABEL_PREFIX__
#endif
#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
#define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64)
#define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)

/*
 * Solaris assembler doesn't have .p2align, and Darwin uses .align
 * differently than GNU/Linux and Solaris.
 *
 * ALIGN takes the same alignment twice: as a power of two (pow2) and as
 * a byte count (abs). Each platform picks the form its .align expects.
 */
#if defined(__APPLE__) || defined(__MSDOS__)
# define ALIGN(pow2, abs) .align pow2
#else
# define ALIGN(pow2, abs) .align abs
#endif

	.text
	.globl	LZMA_CRC64

#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
		&& !defined(__MSDOS__)
	.type	LZMA_CRC64, @function
#endif

	ALIGN(4, 16)
LZMA_CRC64:
	_CET_ENDBR
	/*
	 * uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
	 *
	 * All arguments are read from the stack and the 64-bit result is
	 * left in %edx:%eax. %ebx, %esi, %edi, and %ebp are pushed on entry
	 * and popped before returning.
	 *
	 * Register usage:
	 * %eax    crc LSB (low 32 bits)
	 * %edx    crc MSB (high 32 bits)
	 * %esi    buf
	 * %edi    size or buf + size
	 * %ebx    lzma_crc64_table
	 * %ebp    Table index
	 * %ecx    Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	/*
	 * Four saved registers (0x10 bytes) plus the return address
	 * (4 bytes) put the first argument at 0x14(%esp).
	 */
	movl	0x14(%esp), %esi /* buf */
	movl	0x18(%esp), %edi /* size */
	movl	0x1C(%esp), %eax /* crc LSB */
	movl	0x20(%esp), %edx /* crc MSB */

	/*
	 * Store the address of lzma_crc64_table to %ebx. This is needed to
	 * get position-independent code (PIC).
	 *
	 * The PIC macro is defined by libtool, while __PIC__ is defined
	 * by GCC but only on some systems. Testing for both makes it simpler
	 * to test this code without libtool, and keeps the code working also
	 * when built with libtool but using something else than GCC.
	 *
	 * I understood that libtool may define PIC on Windows even though
	 * the code in Windows DLLs is not PIC in sense that it is in ELF
	 * binaries, so we need a separate check to always use the non-PIC
	 * code on Windows.
	 */
#if (!defined(PIC) && !defined(__PIC__)) \
		|| (defined(_WIN32) || defined(__CYGWIN__))
	/* Not PIC */
	movl	$ LZMA_CRC64_TABLE, %ebx
#elif defined(__APPLE__)
	/*
	 * Mach-O: .L_get_pc returns the address of .L_pic in %ebx, which is
	 * then used as the base for reaching the non-lazy symbol pointer.
	 */
	call	.L_get_pc
.L_pic:
	leal	.L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx
	movl	(%ebx), %ebx
#else
	/* ELF: turn the return address into the GOT base, then load the
	   table address from its GOT entry. */
	call	.L_get_pc
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	movl	LZMA_CRC64_TABLE@GOT(%ebx), %ebx
#endif

	/* Complement the initial value. */
	notl	%eax
	notl	%edx

.L_align:
	/*
	 * Check if there is enough input to use slicing-by-four.
	 * We need eight bytes, because the loop pre-reads four bytes.
	 */
	cmpl	$8, %edi
	jb	.L_rest

	/* Check if we have reached alignment of four bytes. */
	testl	$3, %esi
	jz	.L_slice

	/*
	 * Calculate CRC of the next input byte:
	 *     index = (crc ^ *buf++) & 0xFF
	 *     crc = (crc >> 8) ^ lzma_crc64_table[0][index]
	 */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_align

.L_slice:
	/*
	 * If we get here, there's at least eight bytes of aligned input
	 * available. Make %edi multiple of four bytes. Store the possible
	 * remainder (0-3 bytes) over the "size" variable in the argument
	 * stack; that slot is only used as scratch space from here on.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-4, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 4 while running the main loop. The loop
	 * continues while %esi is below %edi. Because %esi advances in
	 * steps of four, the loop is left when %esi equals %edi, that is,
	 * when only the last four aligned bytes (already pre-read) remain.
	 */
	addl	%esi, %edi
	subl	$4, %edi

	/* Read in the first four aligned bytes. */
	movl	(%esi), %ecx

.L_loop:
	/*
	 * Process four bytes at once. Each of the four tables is
	 * 256 * 8 = 0x800 bytes, so table[n] starts at n * 0x800 and the
	 * high half of an entry is at offset +4:
	 *
	 *     tmp = (uint32_t)crc ^ next four input bytes
	 *     crc = (crc >> 32)
	 *             ^ table[3][tmp & 0xFF]
	 *             ^ table[2][(tmp >> 8) & 0xFF]
	 *             ^ table[1][(tmp >> 16) & 0xFF]
	 *             ^ table[0][tmp >> 24]
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Check for end of aligned input. */
	cmpl	%edi, %esi

	/*
	 * Pre-read the next four input bytes to %ecx. It is slightly
	 * faster to read them here than at the top of the loop. movl does
	 * not modify the flags, so jb still sees the result of the cmpl
	 * above.
	 */
	movl	(%esi), %ecx
	jb	.L_loop

	/*
	 * Process the remaining four bytes, which we have already
	 * copied to %ecx. This is the same step as the loop body.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Copy the number of remaining bytes to %edi. */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/* Calculate CRC of the next input byte (same step as in .L_align). */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax
	notl	%edx

	/* Restore the saved registers in reverse order of the pushes. */
	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

#if defined(PIC) || defined(__PIC__)
	ALIGN(4, 16)
.L_get_pc:
	/* Load the caller's return address, i.e. the address of the
	   instruction following the call, to %ebx. */
	movl	(%esp), %ebx
	ret
#endif

#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
	/* Mach-O PIC */
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
.L_lzma_crc64_table$non_lazy_ptr:
	.indirect_symbol LZMA_CRC64_TABLE
	.long 0

#elif defined(_WIN32) || defined(__CYGWIN__)
# ifdef DLL_EXPORT
	/* This is equivalent of __declspec(dllexport). */
	.section .drectve
	.ascii " -export:lzma_crc64"
# endif

#elif !defined(__MSDOS__)
	/* ELF */
	.size	LZMA_CRC64, .-LZMA_CRC64
#endif

/*
 * This is needed to support non-executable stack. It's ugly to
 * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
 * we are using GNU assembler.
 */
#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
	.section	.note.GNU-stack,"",@progbits
#endif