1 | /*
|
---|
2 | * Speed-optimized CRC32 using slicing-by-eight algorithm
|
---|
3 | *
|
---|
4 | * This uses only i386 instructions, but it is optimized for i686 and later
|
---|
5 | * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2). For i586
|
---|
6 | * (e.g. Pentium), slicing-by-four would be better, and even the C version
|
---|
7 | * of slicing-by-eight built with gcc -march=i586 tends to be a little bit
|
---|
8 | * better than this. Very few probably run this code on i586 or older x86
|
---|
9 | * so this shouldn't be a problem in practice.
|
---|
10 | *
|
---|
11 | * Authors: Igor Pavlov (original version)
|
---|
12 | * Lasse Collin (AT&T syntax, PIC support, better portability)
|
---|
13 | *
|
---|
14 | * This file has been put into the public domain.
|
---|
15 | * You can do whatever you want with this file.
|
---|
16 | *
|
---|
17 | * This code needs lzma_crc32_table, which can be created using the
|
---|
18 | * following C code:
|
---|
19 |
|
---|
20 | uint32_t lzma_crc32_table[8][256];
|
---|
21 |
|
---|
22 | void
|
---|
23 | init_table(void)
|
---|
24 | {
|
---|
25 | // IEEE-802.3
|
---|
26 | static const uint32_t poly32 = UINT32_C(0xEDB88320);
|
---|
27 |
|
---|
28 | // Castagnoli
|
---|
29 | // static const uint32_t poly32 = UINT32_C(0x82F63B78);
|
---|
30 |
|
---|
31 | // Koopman
|
---|
32 | // static const uint32_t poly32 = UINT32_C(0xEB31D82E);
|
---|
33 |
|
---|
34 | for (size_t s = 0; s < 8; ++s) {
|
---|
35 | for (size_t b = 0; b < 256; ++b) {
|
---|
36 | uint32_t r = s == 0 ? b : lzma_crc32_table[s - 1][b];
|
---|
37 |
|
---|
38 | for (size_t i = 0; i < 8; ++i) {
|
---|
39 | if (r & 1)
|
---|
40 | r = (r >> 1) ^ poly32;
|
---|
41 | else
|
---|
42 | r >>= 1;
|
---|
43 | }
|
---|
44 |
|
---|
45 | lzma_crc32_table[s][b] = r;
|
---|
46 | }
|
---|
47 | }
|
---|
48 | }
|
---|
49 |
|
---|
50 | * The prototype of the CRC32 function:
|
---|
51 | * extern uint32_t lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc);
|
---|
52 | */
|
---|
53 |
|
---|
54 | /* When Intel CET is enabled, include <cet.h> in assembly code to mark
|
---|
55 | Intel CET support. */
|
---|
56 | #ifdef __CET__
|
---|
57 | # include <cet.h>
|
---|
58 | #else
|
---|
59 | # define _CET_ENDBR
|
---|
60 | #endif
|
---|
61 |
|
---|
62 | /*
|
---|
63 | * On some systems, the functions need to be prefixed. The prefix is
|
---|
64 | * usually an underscore.
|
---|
65 | */
|
---|
66 | #ifndef __USER_LABEL_PREFIX__
|
---|
67 | # define __USER_LABEL_PREFIX__
|
---|
68 | #endif
|
---|
69 | #define MAKE_SYM_CAT(prefix, sym) prefix ## sym
|
---|
70 | #define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
|
---|
71 | #define LZMA_CRC32 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc32)
|
---|
72 | #define LZMA_CRC32_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc32_table)
|
---|
73 |
|
---|
74 | /*
|
---|
75 | * Solaris assembler doesn't have .p2align, and Darwin uses .align
|
---|
76 | * differently than GNU/Linux and Solaris.
|
---|
77 | */
|
---|
78 | #if defined(__APPLE__) || defined(__MSDOS__)
|
---|
79 | # define ALIGN(pow2, abs) .align pow2
|
---|
80 | #else
|
---|
81 | # define ALIGN(pow2, abs) .align abs
|
---|
82 | #endif
|
---|
83 |
|
---|
84 | .text
|
---|
85 | .globl LZMA_CRC32
|
---|
86 |
|
---|
87 | #if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
|
---|
88 | && !defined(__MSDOS__)
|
---|
89 | .type LZMA_CRC32, @function
|
---|
90 | #endif
|
---|
91 |
|
---|
92 | ALIGN(4, 16)
|
---|
93 | LZMA_CRC32:
|
---|
94 | _CET_ENDBR
|
---|
95 | /*
|
---|
96 | * Register usage:
|
---|
97 | * %eax crc
|
---|
98 | * %esi buf
|
---|
99 | * %edi size or buf + size
|
---|
100 | * %ebx lzma_crc32_table
|
---|
101 | * %ebp Table index
|
---|
102 | * %ecx Temporary
|
---|
103 | * %edx Temporary
|
---|
104 | */
|
---|
105 | pushl %ebx
|
---|
106 | pushl %esi
|
---|
107 | pushl %edi
|
---|
108 | pushl %ebp
|
---|
109 | movl 0x14(%esp), %esi /* buf */
|
---|
110 | movl 0x18(%esp), %edi /* size */
|
---|
111 | movl 0x1C(%esp), %eax /* crc */
|
---|
112 |
|
---|
113 | /*
|
---|
114 | * Store the address of lzma_crc32_table to %ebx. This is needed to
|
---|
115 | * get position-independent code (PIC).
|
---|
116 | *
|
---|
117 | * The PIC macro is defined by libtool, while __PIC__ is defined
|
---|
118 | * by GCC but only on some systems. Testing for both makes it simpler
|
---|
119 | * to test this code without libtool, and keeps the code working also
|
---|
120 | * when built with libtool but using something else than GCC.
|
---|
121 | *
|
---|
122 | * I understood that libtool may define PIC on Windows even though
|
---|
123 | * the code in Windows DLLs is not PIC in sense that it is in ELF
|
---|
124 | * binaries, so we need a separate check to always use the non-PIC
|
---|
125 | * code on Windows.
|
---|
126 | */
|
---|
127 | #if (!defined(PIC) && !defined(__PIC__)) \
|
---|
128 | || (defined(_WIN32) || defined(__CYGWIN__))
|
---|
129 | /* Not PIC */
|
---|
130 | movl $ LZMA_CRC32_TABLE, %ebx
|
---|
131 | #elif defined(__APPLE__)
|
---|
132 | /* Mach-O */
|
---|
133 | call .L_get_pc
|
---|
134 | .L_pic:
|
---|
135 | leal .L_lzma_crc32_table$non_lazy_ptr-.L_pic(%ebx), %ebx
|
---|
136 | movl (%ebx), %ebx
|
---|
137 | #else
|
---|
138 | /* ELF */
|
---|
139 | call .L_get_pc
|
---|
140 | addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
---|
141 | movl LZMA_CRC32_TABLE@GOT(%ebx), %ebx
|
---|
142 | #endif
|
---|
143 |
|
---|
144 | /* Complement the initial value. */
|
---|
145 | notl %eax
|
---|
146 |
|
---|
147 | ALIGN(4, 16)
|
---|
148 | .L_align:
|
---|
149 | /*
|
---|
150 | * Check if there is enough input to use slicing-by-eight.
|
---|
151 | * We need 16 bytes, because the loop pre-reads eight bytes.
|
---|
152 | */
|
---|
153 | cmpl $16, %edi
|
---|
154 | jb .L_rest
|
---|
155 |
|
---|
156 | /* Check if we have reached alignment of eight bytes. */
|
---|
157 | testl $7, %esi
|
---|
158 | jz .L_slice
|
---|
159 |
|
---|
160 | /* Calculate CRC of the next input byte. */
|
---|
161 | movzbl (%esi), %ebp
|
---|
162 | incl %esi
|
---|
163 | movzbl %al, %ecx
|
---|
164 | xorl %ecx, %ebp
|
---|
165 | shrl $8, %eax
|
---|
166 | xorl (%ebx, %ebp, 4), %eax
|
---|
167 | decl %edi
|
---|
168 | jmp .L_align
|
---|
169 |
|
---|
170 | ALIGN(2, 4)
|
---|
171 | .L_slice:
|
---|
172 | /*
|
---|
173 | * If we get here, there's at least 16 bytes of aligned input
|
---|
174 | * available. Make %edi multiple of eight bytes. Store the possible
|
---|
175 | * remainder over the "size" variable in the argument stack.
|
---|
176 | */
|
---|
177 | movl %edi, 0x18(%esp)
|
---|
178 | andl $-8, %edi
|
---|
179 | subl %edi, 0x18(%esp)
|
---|
180 |
|
---|
181 | /*
|
---|
182 | * Let %edi be buf + size - 8 while running the main loop. This way
|
---|
183 | * we can compare for equality to determine when exit the loop.
|
---|
184 | */
|
---|
185 | addl %esi, %edi
|
---|
186 | subl $8, %edi
|
---|
187 |
|
---|
188 | /* Read in the first eight aligned bytes. */
|
---|
189 | xorl (%esi), %eax
|
---|
190 | movl 4(%esi), %ecx
|
---|
191 | movzbl %cl, %ebp
|
---|
192 |
|
---|
193 | .L_loop:
|
---|
194 | movl 0x0C00(%ebx, %ebp, 4), %edx
|
---|
195 | movzbl %ch, %ebp
|
---|
196 | xorl 0x0800(%ebx, %ebp, 4), %edx
|
---|
197 | shrl $16, %ecx
|
---|
198 | xorl 8(%esi), %edx
|
---|
199 | movzbl %cl, %ebp
|
---|
200 | xorl 0x0400(%ebx, %ebp, 4), %edx
|
---|
201 | movzbl %ch, %ebp
|
---|
202 | xorl (%ebx, %ebp, 4), %edx
|
---|
203 | movzbl %al, %ebp
|
---|
204 |
|
---|
205 | /*
|
---|
206 | * Read the next four bytes, for which the CRC is calculated
|
---|
207 | * on the next iteration of the loop.
|
---|
208 | */
|
---|
209 | movl 12(%esi), %ecx
|
---|
210 |
|
---|
211 | xorl 0x1C00(%ebx, %ebp, 4), %edx
|
---|
212 | movzbl %ah, %ebp
|
---|
213 | shrl $16, %eax
|
---|
214 | xorl 0x1800(%ebx, %ebp, 4), %edx
|
---|
215 | movzbl %ah, %ebp
|
---|
216 | movzbl %al, %eax
|
---|
217 | movl 0x1400(%ebx, %eax, 4), %eax
|
---|
218 | addl $8, %esi
|
---|
219 | xorl %edx, %eax
|
---|
220 | xorl 0x1000(%ebx, %ebp, 4), %eax
|
---|
221 |
|
---|
222 | /* Check for end of aligned input. */
|
---|
223 | cmpl %edi, %esi
|
---|
224 | movzbl %cl, %ebp
|
---|
225 | jne .L_loop
|
---|
226 |
|
---|
227 | /*
|
---|
228 | * Process the remaining eight bytes, which we have already
|
---|
229 | * copied to %ecx and %edx.
|
---|
230 | */
|
---|
231 | movl 0x0C00(%ebx, %ebp, 4), %edx
|
---|
232 | movzbl %ch, %ebp
|
---|
233 | xorl 0x0800(%ebx, %ebp, 4), %edx
|
---|
234 | shrl $16, %ecx
|
---|
235 | movzbl %cl, %ebp
|
---|
236 | xorl 0x0400(%ebx, %ebp, 4), %edx
|
---|
237 | movzbl %ch, %ebp
|
---|
238 | xorl (%ebx, %ebp, 4), %edx
|
---|
239 | movzbl %al, %ebp
|
---|
240 |
|
---|
241 | xorl 0x1C00(%ebx, %ebp, 4), %edx
|
---|
242 | movzbl %ah, %ebp
|
---|
243 | shrl $16, %eax
|
---|
244 | xorl 0x1800(%ebx, %ebp, 4), %edx
|
---|
245 | movzbl %ah, %ebp
|
---|
246 | movzbl %al, %eax
|
---|
247 | movl 0x1400(%ebx, %eax, 4), %eax
|
---|
248 | addl $8, %esi
|
---|
249 | xorl %edx, %eax
|
---|
250 | xorl 0x1000(%ebx, %ebp, 4), %eax
|
---|
251 |
|
---|
252 | /* Copy the number of remaining bytes to %edi. */
|
---|
253 | movl 0x18(%esp), %edi
|
---|
254 |
|
---|
255 | .L_rest:
|
---|
256 | /* Check for end of input. */
|
---|
257 | testl %edi, %edi
|
---|
258 | jz .L_return
|
---|
259 |
|
---|
260 | /* Calculate CRC of the next input byte. */
|
---|
261 | movzbl (%esi), %ebp
|
---|
262 | incl %esi
|
---|
263 | movzbl %al, %ecx
|
---|
264 | xorl %ecx, %ebp
|
---|
265 | shrl $8, %eax
|
---|
266 | xorl (%ebx, %ebp, 4), %eax
|
---|
267 | decl %edi
|
---|
268 | jmp .L_rest
|
---|
269 |
|
---|
270 | .L_return:
|
---|
271 | /* Complement the final value. */
|
---|
272 | notl %eax
|
---|
273 |
|
---|
274 | popl %ebp
|
---|
275 | popl %edi
|
---|
276 | popl %esi
|
---|
277 | popl %ebx
|
---|
278 | ret
|
---|
279 |
|
---|
280 | #if defined(PIC) || defined(__PIC__)
|
---|
281 | ALIGN(4, 16)
|
---|
282 | .L_get_pc:
|
---|
283 | movl (%esp), %ebx
|
---|
284 | ret
|
---|
285 | #endif
|
---|
286 |
|
---|
287 | #if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
|
---|
288 | /* Mach-O PIC */
|
---|
289 | .section __IMPORT,__pointers,non_lazy_symbol_pointers
|
---|
290 | .L_lzma_crc32_table$non_lazy_ptr:
|
---|
291 | .indirect_symbol LZMA_CRC32_TABLE
|
---|
292 | .long 0
|
---|
293 |
|
---|
294 | #elif defined(_WIN32) || defined(__CYGWIN__)
|
---|
295 | # ifdef DLL_EXPORT
|
---|
296 | /* This is equivalent of __declspec(dllexport). */
|
---|
297 | .section .drectve
|
---|
298 | .ascii " -export:lzma_crc32"
|
---|
299 | # endif
|
---|
300 |
|
---|
301 | #elif !defined(__MSDOS__)
|
---|
302 | /* ELF */
|
---|
303 | .size LZMA_CRC32, .-LZMA_CRC32
|
---|
304 | #endif
|
---|
305 |
|
---|
306 | /*
|
---|
307 | * This is needed to support non-executable stack. It's ugly to
|
---|
308 | * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
|
---|
309 | * we are using GNU assembler.
|
---|
310 | */
|
---|
311 | #if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
|
---|
312 | .section .note.GNU-stack,"",@progbits
|
---|
313 | #endif
|
---|