VirtualBox

source: vbox/trunk/src/libs/liblzma-5.4.1/check/crc64_x86.S

Last change on this file was 98730, checked in by vboxsync, 21 months ago

libs/liblzma-5.4.1: Export to OSE, bugref:10254

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/*
2 * Speed-optimized CRC64 using slicing-by-four algorithm
3 *
4 * This uses only i386 instructions, but it is optimized for i686 and later
5 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
6 *
7 * Authors: Igor Pavlov (original CRC32 assembly code)
8 * Lasse Collin (CRC64 adaptation of the modified CRC32 code)
9 *
10 * This file has been put into the public domain.
11 * You can do whatever you want with this file.
12 *
13 * This code needs lzma_crc64_table, which can be created using the
14 * following C code:
15
16uint64_t lzma_crc64_table[4][256];
17
18void
19init_table(void)
20{
21 // ECMA-182
22 static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);
23
24 for (size_t s = 0; s < 4; ++s) {
25 for (size_t b = 0; b < 256; ++b) {
26 uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];
27
28 for (size_t i = 0; i < 8; ++i) {
29 if (r & 1)
30 r = (r >> 1) ^ poly64;
31 else
32 r >>= 1;
33 }
34
35 lzma_crc64_table[s][b] = r;
36 }
37 }
38}
39
40 * The prototype of the CRC64 function:
41 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
42 */
43
44/* When Intel CET is enabled, include <cet.h> in assembly code to mark
45 Intel CET support. When it is not enabled, _CET_ENDBR is defined as an
   empty macro so that its use at the function entry point expands to
   nothing (with CET enabled the macro is presumably provided by <cet.h>). */
46#ifdef __CET__
47# include <cet.h>
48#else
49# define _CET_ENDBR
50#endif
51
52/*
53 * On some systems, the functions need to be prefixed. The prefix is
54 * usually an underscore.
   *
   * __USER_LABEL_PREFIX__ is defined as empty here if it is not already
   * defined. MAKE_SYM goes through MAKE_SYM_CAT so that the prefix macro
   * is expanded before it is pasted onto the symbol name with ##.
   * LZMA_CRC64 and LZMA_CRC64_TABLE are the resulting (possibly prefixed)
   * names used for the function and for its lookup table.
55 */
56#ifndef __USER_LABEL_PREFIX__
57# define __USER_LABEL_PREFIX__
58#endif
59#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
60#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
61#define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64)
62#define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)
63
64/*
65 * Solaris assembler doesn't have .p2align, and Darwin uses .align
66 * differently than GNU/Linux and Solaris.
   *
   * ALIGN(pow2, abs) takes the same alignment twice: as a power-of-two
   * exponent (pow2) and as a byte count (abs), e.g. ALIGN(4, 16). When
   * __APPLE__ or __MSDOS__ is defined, .align is given the exponent;
   * otherwise it is given the byte count.
67 */
68#if defined(__APPLE__) || defined(__MSDOS__)
69# define ALIGN(pow2, abs) .align pow2
70#else
71# define ALIGN(pow2, abs) .align abs
72#endif
73
74 .text
75 .globl LZMA_CRC64 /* make the (possibly prefixed) function symbol global */
76
77#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
78 && !defined(__MSDOS__)
79 .type LZMA_CRC64, @function /* skipped on Apple, Windows, Cygwin and MS-DOS builds */
80#endif
81
82 ALIGN(4, 16) /* align the entry point to 16 bytes */
83LZMA_CRC64:
84 _CET_ENDBR
85 /*
   * lzma_crc64(buf, size, crc): update the 64-bit value "crc" with
   * "size" bytes starting at "buf". The arguments are read from the
   * stack and the result is left in %edx:%eax.
   *
   * Outline: input is consumed one byte at a time until buf is
   * four-byte aligned (.L_align), then four bytes per iteration
   * (.L_slice / .L_loop), then any remaining bytes one at a time
   * (.L_rest). Inputs shorter than eight bytes go straight to .L_rest.
   *
   * Each table entry is eight bytes (low dword at +0, high dword at +4),
   * indexed with a scale factor of 8. Each of the four sub-tables holds
   * 256 entries = 0x800 bytes, hence the displacements 0x0800, 0x1000
   * and 0x1800 used in the main loop.
   *
86 * Register usage:
87 * %eax crc LSB
88 * %edx crc MSB
89 * %esi buf
90 * %edi size or buf + size
91 * %ebx lzma_crc64_table
92 * %ebp Table index
93 * %ecx Temporary
94 */
95 pushl %ebx /* save the four registers that are overwritten below */
96 pushl %esi
97 pushl %edi
98 pushl %ebp
99 movl 0x14(%esp), %esi /* buf; 0x14 = four saved registers + return address */
100 movl 0x18(%esp), %edi /* size */
101 movl 0x1C(%esp), %eax /* crc LSB */
102 movl 0x20(%esp), %edx /* crc MSB */
103
104 /*
105 * Store the address of lzma_crc64_table to %ebx. This is needed to
106 * get position-independent code (PIC).
107 *
108 * The PIC macro is defined by libtool, while __PIC__ is defined
109 * by GCC but only on some systems. Testing for both makes it simpler
110 * to test this code without libtool, and keeps the code working also
111 * when built with libtool but using something else than GCC.
112 *
113 * I understood that libtool may define PIC on Windows even though
114 * the code in Windows DLLs is not PIC in sense that it is in ELF
115 * binaries, so we need a separate check to always use the non-PIC
116 * code on Windows.
117 */
118#if (!defined(PIC) && !defined(__PIC__)) \
119 || (defined(_WIN32) || defined(__CYGWIN__))
120 /* Not PIC */
121 movl $ LZMA_CRC64_TABLE, %ebx /* absolute address of the table */
122#elif defined(__APPLE__)
123 /* Mach-O */
124 call .L_get_pc /* %ebx = return address, i.e. the address of .L_pic */
125.L_pic:
126 leal .L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx /* address of the pointer slot */
127 movl (%ebx), %ebx /* load the table address stored in that slot */
128#else
129 /* ELF */
130 call .L_get_pc /* %ebx = address of the instruction after the call */
131 addl $_GLOBAL_OFFSET_TABLE_, %ebx /* turn it into the GOT base address */
132 movl LZMA_CRC64_TABLE@GOT(%ebx), %ebx /* load the table address from its GOT entry */
133#endif
134
135 /* Complement the initial value. */
136 notl %eax
137 notl %edx
138
139.L_align:
140 /*
141 * Check if there is enough input to use slicing-by-four.
142 * We need eight bytes, because the loop pre-reads four bytes.
143 */
144 cmpl $8, %edi
145 jb .L_rest /* fewer than eight bytes left: byte-at-a-time only */
146
147 /* Check if we have reached alignment of four bytes. */
148 testl $3, %esi
149 jz .L_slice
150
151 /* Calculate CRC of the next input byte. */
152 movzbl (%esi), %ebp /* %ebp = next input byte */
153 incl %esi
154 movzbl %al, %ecx /* %ecx = lowest byte of the CRC */
155 xorl %ecx, %ebp /* table index = input byte ^ lowest CRC byte */
156 shrdl $8, %edx, %eax /* low half of (crc >> 8): bits shift in from %edx */
157 xorl (%ebx, %ebp, 8), %eax /* fold in the low dword of the table entry */
158 shrl $8, %edx /* high half of (crc >> 8) */
159 xorl 4(%ebx, %ebp, 8), %edx /* fold in the high dword of the table entry */
160 decl %edi /* one byte fewer remaining */
161 jmp .L_align
162
163.L_slice:
164 /*
165 * If we get here, there are at least eight bytes of aligned input
166 * available. Make %edi multiple of four bytes. Store the possible
167 * remainder over the "size" variable in the argument stack.
168 */
169 movl %edi, 0x18(%esp) /* argument slot = size */
170 andl $-4, %edi /* %edi = size rounded down to a multiple of four */
171 subl %edi, 0x18(%esp) /* argument slot = leftover byte count (0..3) */
172
173 /*
174 * Let %edi be buf + size - 4 while running the main loop. This way
175 * comparing %esi against %edi tells us when to exit the loop.
176 */
177 addl %esi, %edi
178 subl $4, %edi
179
180 /* Read in the first four aligned bytes. */
181 movl (%esi), %ecx
182
183.L_loop:
184 xorl %eax, %ecx /* %ecx = input dword ^ low half of the CRC */
185 movzbl %cl, %ebp /* byte 0 indexes the sub-table at 0x1800 */
186 movl 0x1800(%ebx, %ebp, 8), %eax
187 xorl %edx, %eax /* fold the old high half of the CRC into the new low half */
188 movl 0x1804(%ebx, %ebp, 8), %edx
189 movzbl %ch, %ebp /* byte 1 indexes the sub-table at 0x1000 */
190 xorl 0x1000(%ebx, %ebp, 8), %eax
191 xorl 0x1004(%ebx, %ebp, 8), %edx
192 shrl $16, %ecx /* move bytes 2 and 3 down into %cl and %ch */
193 movzbl %cl, %ebp /* byte 2 indexes the sub-table at 0x0800 */
194 xorl 0x0800(%ebx, %ebp, 8), %eax
195 xorl 0x0804(%ebx, %ebp, 8), %edx
196 movzbl %ch, %ebp /* byte 3 indexes the sub-table at offset 0 */
197 addl $4, %esi
198 xorl (%ebx, %ebp, 8), %eax
199 xorl 4(%ebx, %ebp, 8), %edx
200
201 /* Check for end of aligned input. */
202 cmpl %edi, %esi
203
204 /*
205 * Copy the next four input bytes to %ecx. It is slightly faster to
206 * read them here than at the top of the loop.
207 */
208 movl (%esi), %ecx /* movl does not change the flags set by cmpl */
209 jb .L_loop /* keep looping while %esi is below %edi */
210
211 /*
212 * Process the remaining four bytes, which we have already
213 * copied to %ecx.
214 */
215 xorl %eax, %ecx /* same sequence as one iteration of .L_loop */
216 movzbl %cl, %ebp
217 movl 0x1800(%ebx, %ebp, 8), %eax
218 xorl %edx, %eax
219 movl 0x1804(%ebx, %ebp, 8), %edx
220 movzbl %ch, %ebp
221 xorl 0x1000(%ebx, %ebp, 8), %eax
222 xorl 0x1004(%ebx, %ebp, 8), %edx
223 shrl $16, %ecx
224 movzbl %cl, %ebp
225 xorl 0x0800(%ebx, %ebp, 8), %eax
226 xorl 0x0804(%ebx, %ebp, 8), %edx
227 movzbl %ch, %ebp
228 addl $4, %esi
229 xorl (%ebx, %ebp, 8), %eax
230 xorl 4(%ebx, %ebp, 8), %edx
231
232 /* Copy the number of remaining bytes to %edi. */
233 movl 0x18(%esp), %edi /* the leftover count stored at .L_slice */
234
235.L_rest:
236 /* Check for end of input. */
237 testl %edi, %edi
238 jz .L_return
239
240 /* Calculate CRC of the next input byte. */
241 movzbl (%esi), %ebp /* same byte-wise step as in .L_align */
242 incl %esi
243 movzbl %al, %ecx
244 xorl %ecx, %ebp
245 shrdl $8, %edx, %eax
246 xorl (%ebx, %ebp, 8), %eax
247 shrl $8, %edx
248 xorl 4(%ebx, %ebp, 8), %edx
249 decl %edi
250 jmp .L_rest
251
252.L_return:
253 /* Complement the final value. */
254 notl %eax
255 notl %edx
256
257 popl %ebp /* restore the saved registers in reverse push order */
258 popl %edi
259 popl %esi
260 popl %ebx
261 ret /* result is in %edx:%eax */
262
263#if defined(PIC) || defined(__PIC__)
264 ALIGN(4, 16)
265.L_get_pc: /* PIC helper: hands the caller its own return address in %ebx */
266 movl (%esp), %ebx /* top of stack is the return address pushed by call */
267 ret
268#endif
269
270#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
271 /* Mach-O PIC: pointer slot through which the function loads the table address. */
272 .section __IMPORT,__pointers,non_lazy_symbol_pointers
273.L_lzma_crc64_table$non_lazy_ptr:
274 .indirect_symbol LZMA_CRC64_TABLE
275 .long 0 /* placeholder value; presumably filled in at load time */
276
277#elif defined(_WIN32) || defined(__CYGWIN__)
278# ifdef DLL_EXPORT
279 /* This is the equivalent of __declspec(dllexport): the string below is
      placed in the .drectve section. */
280 .section .drectve
281 .ascii " -export:lzma_crc64"
282# endif
283
284#elif !defined(__MSDOS__)
285 /* ELF: record the function size as the current location minus its label.
      NOTE(review): an __APPLE__ build without PIC/__PIC__ would also reach
      this branch; confirm whether that configuration can occur. */
286 .size LZMA_CRC64, .-LZMA_CRC64
287#endif
288
289/*
290 * This is needed to support a non-executable stack. It's ugly to
291 * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
292 * we are using GNU assembler.
293 */
294#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
295 .section .note.GNU-stack,"",@progbits /* empty flags string */
296#endif
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette