VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/genasm-macosx/chacha-x86_64.S@ 94081

Last change on this file since 94081 was 83531, checked in by vboxsync, 4 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 20.8 KB
Line 
# Constant tables used by the ChaCha20 routines in this file.
# They are emitted into the .text section; the first group starts on a
# 64-byte boundary (.p2align 6).
1.text
2
3
4
5.p2align 6
# L$zero: four zero dwords. Not referenced by the code visible in this listing.
6L$zero:
7.long 0,0,0,0
# L$one: {1,0,0,0}. Added to the fourth state row to step the block counter
# held in dword 0 (scalar, SSSE3 and 128-byte paths).
8L$one:
9.long 1,0,0,0
# L$inc: per-lane counter offsets {0,1,2,3} used once by the 4x path so that
# each of the four lanes works on a different block counter.
10L$inc:
11.long 0,1,2,3
# L$four: {4,4,4,4}; advances all four lane counters between 4x passes.
12L$four:
13.long 4,4,4,4
# L$incy, L$eight: eight-dword tables. Not referenced by the code visible in
# this listing.
14L$incy:
15.long 0,2,4,6,1,3,5,7
16L$eight:
17.long 8,8,8,8,8,8,8,8
# L$rot16: pshufb byte-select mask. Within every dword, result byte i takes
# source byte (i+2) mod 4, which rotates each dword by 16 bits.
18L$rot16:
19.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
# L$rot24: pshufb byte-select mask. Result byte 0 takes source byte 3, byte 1
# takes byte 0, and so on: each dword is rotated left by 8 bits (equivalently
# right by 24). The vector paths use it where the scalar path uses roll $8.
20L$rot24:
21.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
# L$twoy: not referenced by the code visible in this listing.
22L$twoy:
23.long 2,0,0,0, 2,0,0,0
# Second group, again 64-byte aligned. L$zeroz, L$fourz, L$incz and L$sixteen
# are 64 bytes each and are not referenced by the code visible in this
# listing. Because each of them is exactly 64 bytes, L$sigma below also lands
# on a 64-byte boundary, which the movdqa loads of L$sigma depend on.
24.p2align 6
25L$zeroz:
26.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
27L$fourz:
28.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
29L$incz:
30.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
31L$sixteen:
32.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
# L$sigma: the ASCII bytes "expand 32-byte k" followed by a NUL. The first
# 16 bytes are loaded as state row 0 by the vector paths; the scalar path
# uses the same four dwords as immediates (0x61707865, ...).
33L$sigma:
34.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
# NUL-terminated ASCII identification string:
# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>". Data only.
35.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
# _ChaCha20_ctr32 -- exported entry point, plus the scalar (integer register)
# implementation. XORs a ChaCha20 keystream into a buffer.
#
# Registers as read by the code below:
#   %rdi  destination pointer (result bytes are stored here)
#   %rsi  source pointer (bytes XORed with the keystream)
#   %rdx  byte count; a count of zero returns immediately
#   %rcx  pointer to 32 key bytes (read as two unaligned 16-byte rows)
#   %r8   pointer to 16 bytes forming the fourth state row; dword 0 is the
#         block counter (it is the only dword stepped by L$one), the other
#         three dwords are presumably the nonce -- confirm against callers
# NOTE(review): this matches the System V AMD64 argument order for a C call
# of the shape (out, inp, len, key, counter) -- confirm against the header.
#
# Scalar-path stack frame (88 bytes below the six pushed registers):
#    0..15(%rsp)  keystream words 0..3, written only by the tail path
#   16..31(%rsp)  key words 0..3
#   32..47(%rsp)  key words 4..7; doubles as spill space for state words
#                 8..11 while the rounds run
#   48..63(%rsp)  counter row
#   64..87(%rsp)  saved length, source pointer, destination pointer
36.globl _ChaCha20_ctr32
37
38.p2align 6
39_ChaCha20_ctr32:
40
# Nothing to do for a zero byte count.
41 cmpq $0,%rdx
42 je L$no_data
# Load 8 capability bytes from _OPENSSL_ia32cap_P+4 and take the vector path
# when bit 9 (mask 512) of the low dword is set (presumably the SSSE3 flag,
# going by the target label name). %r10 is left untouched on that path
# because L$ChaCha20_4x tests further bits of it.
43 movq _OPENSSL_ia32cap_P+4(%rip),%r10
44 testl $512,%r10d
45 jnz L$ChaCha20_ssse3
46
# Scalar path: save the registers this routine uses as state words and as
# length/round counters.
47 pushq %rbx
48
49 pushq %rbp
50
51 pushq %r12
52
53 pushq %r13
54
55 pushq %r14
56
57 pushq %r15
58
# Reserve 64 bytes of state scratch plus three 8-byte save slots.
59 subq $64+24,%rsp
60
61L$ctr32_body:
62
63
# xmm1 = key bytes 0..15, xmm2 = key bytes 16..31, xmm3 = counter row,
# xmm4 = {1,0,0,0} (counter increment).
64 movdqu (%rcx),%xmm1
65 movdqu 16(%rcx),%xmm2
66 movdqu (%r8),%xmm3
67 movdqa L$one(%rip),%xmm4
68
69
# Keep a copy of the three input rows on the stack; %rbp carries the
# remaining byte count from here on.
70 movdqa %xmm1,16(%rsp)
71 movdqa %xmm2,32(%rsp)
72 movdqa %xmm3,48(%rsp)
73 movq %rdx,%rbp
74 jmp L$oop_outer
75
# One pass of L$oop_outer produces one 64-byte keystream block.
# State word to register mapping during the rounds:
#   x0..x3   = eax, ebx, ecx, edx   (the "expand 32-byte k" constants)
#   x4..x7   = r8d..r11d            (key words 0..3)
#   x8..x11  = esi/edi, two words at a time, the other two spilled to
#              32..47(%rsp)
#   x12..x15 = r12d..r15d           (counter row)
76.p2align 5
77L$oop_outer:
78 movl $0x61707865,%eax
79 movl $0x3320646e,%ebx
80 movl $0x79622d32,%ecx
81 movl $0x6b206574,%edx
82 movl 16(%rsp),%r8d
83 movl 20(%rsp),%r9d
84 movl 24(%rsp),%r10d
85 movl 28(%rsp),%r11d
# x12 comes from xmm3, which holds the current (already stepped) counter.
86 movd %xmm3,%r12d
87 movl 52(%rsp),%r13d
88 movl 56(%rsp),%r14d
89 movl 60(%rsp),%r15d
90
# Park length, source and destination: rbp, rsi and rdi are needed as the
# round counter and as x8/x9 below. ebp counts ten passes of L$oop.
91 movq %rbp,64+0(%rsp)
92 movl $10,%ebp
93 movq %rsi,64+8(%rsp)
# The bytes 66 48 0F 7E D6 encode movq %xmm2,%rsi: rsi = key words 4 and 5.
94.byte 102,72,15,126,214
95 movq %rdi,64+16(%rsp)
# Split them: esi keeps key word 4 (x8), edi receives key word 5 (x9).
96 movq %rsi,%rdi
97 shrq $32,%rdi
98 jmp L$oop
99
# Each pass of L$oop runs eight quarter-rounds: four on the columns, then
# four on the diagonals. Every quarter-round is the add/xor/rotate chain with
# rotate counts 16, 12, 8 and 7; two quarter-rounds are interleaved at a time.
100.p2align 5
101L$oop:
# Columns (x0,x4,x8,x12) and (x1,x5,x9,x13).
102 addl %r8d,%eax
103 xorl %eax,%r12d
104 roll $16,%r12d
105 addl %r9d,%ebx
106 xorl %ebx,%r13d
107 roll $16,%r13d
108 addl %r12d,%esi
109 xorl %esi,%r8d
110 roll $12,%r8d
111 addl %r13d,%edi
112 xorl %edi,%r9d
113 roll $12,%r9d
114 addl %r8d,%eax
115 xorl %eax,%r12d
116 roll $8,%r12d
117 addl %r9d,%ebx
118 xorl %ebx,%r13d
119 roll $8,%r13d
120 addl %r12d,%esi
121 xorl %esi,%r8d
122 roll $7,%r8d
123 addl %r13d,%edi
124 xorl %edi,%r9d
125 roll $7,%r9d
# Swap the c-row pair: spill x8/x9, bring in x10/x11.
126 movl %esi,32(%rsp)
127 movl %edi,36(%rsp)
128 movl 40(%rsp),%esi
129 movl 44(%rsp),%edi
# Columns (x2,x6,x10,x14) and (x3,x7,x11,x15).
130 addl %r10d,%ecx
131 xorl %ecx,%r14d
132 roll $16,%r14d
133 addl %r11d,%edx
134 xorl %edx,%r15d
135 roll $16,%r15d
136 addl %r14d,%esi
137 xorl %esi,%r10d
138 roll $12,%r10d
139 addl %r15d,%edi
140 xorl %edi,%r11d
141 roll $12,%r11d
142 addl %r10d,%ecx
143 xorl %ecx,%r14d
144 roll $8,%r14d
145 addl %r11d,%edx
146 xorl %edx,%r15d
147 roll $8,%r15d
148 addl %r14d,%esi
149 xorl %esi,%r10d
150 roll $7,%r10d
151 addl %r15d,%edi
152 xorl %edi,%r11d
153 roll $7,%r11d
# Diagonals (x0,x5,x10,x15) and (x1,x6,x11,x12); esi/edi still hold x10/x11.
154 addl %r9d,%eax
155 xorl %eax,%r15d
156 roll $16,%r15d
157 addl %r10d,%ebx
158 xorl %ebx,%r12d
159 roll $16,%r12d
160 addl %r15d,%esi
161 xorl %esi,%r9d
162 roll $12,%r9d
163 addl %r12d,%edi
164 xorl %edi,%r10d
165 roll $12,%r10d
166 addl %r9d,%eax
167 xorl %eax,%r15d
168 roll $8,%r15d
169 addl %r10d,%ebx
170 xorl %ebx,%r12d
171 roll $8,%r12d
172 addl %r15d,%esi
173 xorl %esi,%r9d
174 roll $7,%r9d
175 addl %r12d,%edi
176 xorl %edi,%r10d
177 roll $7,%r10d
# Swap the c-row pair back: spill x10/x11, reload x8/x9.
178 movl %esi,40(%rsp)
179 movl %edi,44(%rsp)
180 movl 32(%rsp),%esi
181 movl 36(%rsp),%edi
# Diagonals (x2,x7,x8,x13) and (x3,x4,x9,x14).
182 addl %r11d,%ecx
183 xorl %ecx,%r13d
184 roll $16,%r13d
185 addl %r8d,%edx
186 xorl %edx,%r14d
187 roll $16,%r14d
188 addl %r13d,%esi
189 xorl %esi,%r11d
190 roll $12,%r11d
191 addl %r14d,%edi
192 xorl %edi,%r8d
193 roll $12,%r8d
194 addl %r11d,%ecx
195 xorl %ecx,%r13d
196 roll $8,%r13d
197 addl %r8d,%edx
198 xorl %edx,%r14d
199 roll $8,%r14d
200 addl %r13d,%esi
201 xorl %esi,%r11d
202 roll $7,%r11d
203 addl %r14d,%edi
204 xorl %edi,%r8d
205 roll $7,%r8d
206 decl %ebp
207 jnz L$oop
# Rounds finished. Flush x9/x8 so that 32..47(%rsp) holds x8..x11, then
# restore length, source and destination. xmm1 is reset to key words 4..7
# and xmm3 is stepped to the next block counter.
208 movl %edi,36(%rsp)
209 movl %esi,32(%rsp)
210 movq 64(%rsp),%rbp
211 movdqa %xmm2,%xmm1
212 movq 64+8(%rsp),%rsi
213 paddd %xmm4,%xmm3
214 movq 64+16(%rsp),%rdi
215
# Feed-forward: add the block's input state to the permuted words.
# 48(%rsp) still holds the counter used for this block (it is rewritten
# only further down).
216 addl $0x61707865,%eax
217 addl $0x3320646e,%ebx
218 addl $0x79622d32,%ecx
219 addl $0x6b206574,%edx
220 addl 16(%rsp),%r8d
221 addl 20(%rsp),%r9d
222 addl 24(%rsp),%r10d
223 addl 28(%rsp),%r11d
224 addl 48(%rsp),%r12d
225 addl 52(%rsp),%r13d
226 addl 56(%rsp),%r14d
227 addl 60(%rsp),%r15d
# xmm1 = key words 4..7 + permuted x8..x11 (keystream words 8..11).
228 paddd 32(%rsp),%xmm1
229
# Fewer than 64 bytes left: finish byte by byte in L$tail.
230 cmpq $64,%rbp
231 jb L$tail
232
# Full block: XOR the 64 keystream bytes with the source.
233 xorl 0(%rsi),%eax
234 xorl 4(%rsi),%ebx
235 xorl 8(%rsi),%ecx
236 xorl 12(%rsi),%edx
237 xorl 16(%rsi),%r8d
238 xorl 20(%rsi),%r9d
239 xorl 24(%rsi),%r10d
240 xorl 28(%rsi),%r11d
241 movdqu 32(%rsi),%xmm0
242 xorl 48(%rsi),%r12d
243 xorl 52(%rsi),%r13d
244 xorl 56(%rsi),%r14d
245 xorl 60(%rsi),%r15d
246 leaq 64(%rsi),%rsi
247 pxor %xmm1,%xmm0
248
# Re-arm the stack copy for the next block: put key words 4..7 back into the
# spill area and store the stepped counter dword.
249 movdqa %xmm2,32(%rsp)
250 movd %xmm3,48(%rsp)
251
# Store the 64 result bytes.
252 movl %eax,0(%rdi)
253 movl %ebx,4(%rdi)
254 movl %ecx,8(%rdi)
255 movl %edx,12(%rdi)
256 movl %r8d,16(%rdi)
257 movl %r9d,20(%rdi)
258 movl %r10d,24(%rdi)
259 movl %r11d,28(%rdi)
260 movdqu %xmm0,32(%rdi)
261 movl %r12d,48(%rdi)
262 movl %r13d,52(%rdi)
263 movl %r14d,56(%rdi)
264 movl %r15d,60(%rdi)
265 leaq 64(%rdi),%rdi
266
# Loop while bytes remain.
267 subq $64,%rbp
268 jnz L$oop_outer
269
270 jmp L$done
271
# Partial final block (rbp is between 1 and 63 here): lay the whole 64-byte
# keystream block out at 0..63(%rsp); rbx becomes the byte index.
272.p2align 4
273L$tail:
274 movl %eax,0(%rsp)
275 movl %ebx,4(%rsp)
276 xorq %rbx,%rbx
277 movl %ecx,8(%rsp)
278 movl %edx,12(%rsp)
279 movl %r8d,16(%rsp)
280 movl %r9d,20(%rsp)
281 movl %r10d,24(%rsp)
282 movl %r11d,28(%rsp)
283 movdqa %xmm1,32(%rsp)
284 movl %r12d,48(%rsp)
285 movl %r13d,52(%rsp)
286 movl %r14d,56(%rsp)
287 movl %r15d,60(%rsp)
288
# XOR one byte per pass. rbx is stepped before the store, hence the -1
# displacement on the destination.
289L$oop_tail:
290 movzbl (%rsi,%rbx,1),%eax
291 movzbl (%rsp,%rbx,1),%edx
292 leaq 1(%rbx),%rbx
293 xorl %edx,%eax
294 movb %al,-1(%rdi,%rbx,1)
295 decq %rbp
296 jnz L$oop_tail
297
# Epilogue: rsi = address just above the six saved registers (88-byte frame
# plus 48 bytes of pushes); reload them by negative offset, then drop the
# whole frame by pointing rsp at rsi.
298L$done:
299 leaq 64+24+48(%rsp),%rsi
300
301 movq -48(%rsi),%r15
302
303 movq -40(%rsi),%r14
304
305 movq -32(%rsi),%r13
306
307 movq -24(%rsi),%r12
308
309 movq -16(%rsi),%rbp
310
311 movq -8(%rsi),%rbx
312
313 leaq (%rsi),%rsp
314
# The bytes F3 C3 encode a rep-prefixed ret.
315L$no_data:
316 .byte 0xf3,0xc3
317
318
319
# ChaCha20_ssse3 -- one 64-byte block per pass, the state held as four
# 16-byte rows in xmm0..xmm3. Reached by a jump from _ChaCha20_ctr32 with the
# argument registers untouched (rdi = destination, rsi = source, rdx = byte
# count, rcx = key pointer, r8 = counter row pointer) and nothing pushed.
#
# Stack frame (72 bytes): 0..63(%rsp) holds the input state rows, and is
# reused for the keystream block in the tail path. r9 keeps the entry rsp.
320.p2align 5
321ChaCha20_ssse3:
322
323L$ChaCha20_ssse3:
324 movq %rsp,%r9
325
# Exactly 128 bytes and more than 128 bytes have dedicated routines; shorter
# inputs fall through to the code below.
326 cmpq $128,%rdx
327 je L$ChaCha20_128
328 ja L$ChaCha20_4x
329
# Also entered from L$ChaCha20_4x (with r9 already holding the entry rsp)
# when that routine decides against the four-lane code.
330L$do_sse3_after_all:
331 subq $64+8,%rsp
# xmm0 = sigma row, xmm1/xmm2 = key rows, xmm3 = counter row,
# xmm6/xmm7 = pshufb masks for the 16-bit and 8-bit dword rotations.
332 movdqa L$sigma(%rip),%xmm0
333 movdqu (%rcx),%xmm1
334 movdqu 16(%rcx),%xmm2
335 movdqu (%r8),%xmm3
336 movdqa L$rot16(%rip),%xmm6
337 movdqa L$rot24(%rip),%xmm7
338
# Keep the input state for the feed-forward; r8 is reused as round counter.
339 movdqa %xmm0,0(%rsp)
340 movdqa %xmm1,16(%rsp)
341 movdqa %xmm2,32(%rsp)
342 movdqa %xmm3,48(%rsp)
343 movq $10,%r8
344 jmp L$oop_ssse3
345
# Start of every block after the first: step the counter row by {1,0,0,0},
# write it back, and reload the other rows from the stack copy.
346.p2align 5
347L$oop_outer_ssse3:
348 movdqa L$one(%rip),%xmm3
349 movdqa 0(%rsp),%xmm0
350 movdqa 16(%rsp),%xmm1
351 movdqa 32(%rsp),%xmm2
352 paddd 48(%rsp),%xmm3
353 movq $10,%r8
354 movdqa %xmm3,48(%rsp)
355 jmp L$oop_ssse3
356
# Ten passes; each pass applies the quarter-round to all four columns at
# once, rotates rows 1..3 so the diagonals line up as columns, applies the
# quarter-round again, and rotates the rows back.
357.p2align 5
358L$oop_ssse3:
359 paddd %xmm1,%xmm0
360 pxor %xmm0,%xmm3
# pshufb %xmm6,%xmm3 (rotate each dword by 16).
361.byte 102,15,56,0,222
362 paddd %xmm3,%xmm2
363 pxor %xmm2,%xmm1
# Rotate left by 12 built from shift right 20, shift left 12, or.
364 movdqa %xmm1,%xmm4
365 psrld $20,%xmm1
366 pslld $12,%xmm4
367 por %xmm4,%xmm1
368 paddd %xmm1,%xmm0
369 pxor %xmm0,%xmm3
# pshufb %xmm7,%xmm3 (rotate each dword left by 8).
370.byte 102,15,56,0,223
371 paddd %xmm3,%xmm2
372 pxor %xmm2,%xmm1
# Rotate left by 7 built from shift right 25, shift left 7, or.
373 movdqa %xmm1,%xmm4
374 psrld $25,%xmm1
375 pslld $7,%xmm4
376 por %xmm4,%xmm1
# Rotate row 2 by two dwords, row 1 and row 3 by one dword in opposite
# directions, turning diagonals into columns.
377 pshufd $78,%xmm2,%xmm2
378 pshufd $57,%xmm1,%xmm1
379 pshufd $147,%xmm3,%xmm3
380 nop
# Same quarter-round, now acting on the diagonals.
381 paddd %xmm1,%xmm0
382 pxor %xmm0,%xmm3
383.byte 102,15,56,0,222
384 paddd %xmm3,%xmm2
385 pxor %xmm2,%xmm1
386 movdqa %xmm1,%xmm4
387 psrld $20,%xmm1
388 pslld $12,%xmm4
389 por %xmm4,%xmm1
390 paddd %xmm1,%xmm0
391 pxor %xmm0,%xmm3
392.byte 102,15,56,0,223
393 paddd %xmm3,%xmm2
394 pxor %xmm2,%xmm1
395 movdqa %xmm1,%xmm4
396 psrld $25,%xmm1
397 pslld $7,%xmm4
398 por %xmm4,%xmm1
# Undo the row rotations (the $57 and $147 shuffles swap roles).
399 pshufd $78,%xmm2,%xmm2
400 pshufd $147,%xmm1,%xmm1
401 pshufd $57,%xmm3,%xmm3
402 decq %r8
403 jnz L$oop_ssse3
# Feed-forward: add the input state saved on the stack.
404 paddd 0(%rsp),%xmm0
405 paddd 16(%rsp),%xmm1
406 paddd 32(%rsp),%xmm2
407 paddd 48(%rsp),%xmm3
408
# Fewer than 64 bytes left: byte-wise tail.
409 cmpq $64,%rdx
410 jb L$tail_ssse3
411
# Full block: XOR 64 source bytes with the keystream and store them.
412 movdqu 0(%rsi),%xmm4
413 movdqu 16(%rsi),%xmm5
414 pxor %xmm4,%xmm0
415 movdqu 32(%rsi),%xmm4
416 pxor %xmm5,%xmm1
417 movdqu 48(%rsi),%xmm5
418 leaq 64(%rsi),%rsi
419 pxor %xmm4,%xmm2
420 pxor %xmm5,%xmm3
421
422 movdqu %xmm0,0(%rdi)
423 movdqu %xmm1,16(%rdi)
424 movdqu %xmm2,32(%rdi)
425 movdqu %xmm3,48(%rdi)
426 leaq 64(%rdi),%rdi
427
428 subq $64,%rdx
429 jnz L$oop_outer_ssse3
430
431 jmp L$done_ssse3
432
# Partial final block: overwrite the stack copy of the state with the
# keystream block (the state is no longer needed) and XOR byte by byte with
# r8 as index. r8 is stepped before the store, hence the -1 displacement.
433.p2align 4
434L$tail_ssse3:
435 movdqa %xmm0,0(%rsp)
436 movdqa %xmm1,16(%rsp)
437 movdqa %xmm2,32(%rsp)
438 movdqa %xmm3,48(%rsp)
439 xorq %r8,%r8
440
441L$oop_tail_ssse3:
442 movzbl (%rsi,%r8,1),%eax
443 movzbl (%rsp,%r8,1),%ecx
444 leaq 1(%r8),%r8
445 xorl %ecx,%eax
446 movb %al,-1(%rdi,%r8,1)
447 decq %rdx
448 jnz L$oop_tail_ssse3
449
# Release the frame by restoring the entry rsp kept in r9, then return
# (F3 C3 = rep-prefixed ret).
450L$done_ssse3:
451 leaq (%r9),%rsp
452
453L$ssse3_epilogue:
454 .byte 0xf3,0xc3
455
456
457
# ChaCha20_128 -- handles exactly 128 bytes by running two blocks through the
# rounds in an interleaved fashion. Only reached from L$ChaCha20_ssse3 when
# rdx equals 128, so there is no length loop and no tail handling here.
# Arguments are still in rdi (destination), rsi (source), rcx (key pointer)
# and r8 (counter row pointer).
#
# Row to register mapping:
#   block A: xmm8  (row 0), xmm9  (row 1), xmm2 (row 2), xmm3 (row 3)
#   block B: xmm10 (row 0), xmm11 (row 1), xmm0 (row 2), xmm1 (row 3)
#   xmm6/xmm7 = pshufb rotation masks, xmm4/xmm5 = temporaries.
458.p2align 5
459ChaCha20_128:
460
461L$ChaCha20_128:
# r9 keeps the entry rsp for the epilogue; 0..63(%rsp) stores the input
# state of block A for the feed-forward.
462 movq %rsp,%r9
463
464 subq $64+8,%rsp
465 movdqa L$sigma(%rip),%xmm8
466 movdqu (%rcx),%xmm9
467 movdqu 16(%rcx),%xmm2
468 movdqu (%r8),%xmm3
469 movdqa L$one(%rip),%xmm1
470 movdqa L$rot16(%rip),%xmm6
471 movdqa L$rot24(%rip),%xmm7
472
# Duplicate rows 0..2 for block B. Its counter row is block A's row plus
# {1,0,0,0}. r8 is reused as the round counter.
473 movdqa %xmm8,%xmm10
474 movdqa %xmm8,0(%rsp)
475 movdqa %xmm9,%xmm11
476 movdqa %xmm9,16(%rsp)
477 movdqa %xmm2,%xmm0
478 movdqa %xmm2,32(%rsp)
479 paddd %xmm3,%xmm1
480 movdqa %xmm3,48(%rsp)
481 movq $10,%r8
482 jmp L$oop_128
483
# Ten passes of column rounds followed by diagonal rounds; the instructions
# for block A and block B alternate.
484.p2align 5
485L$oop_128:
486 paddd %xmm9,%xmm8
487 pxor %xmm8,%xmm3
488 paddd %xmm11,%xmm10
489 pxor %xmm10,%xmm1
# pshufb %xmm6,%xmm3 and pshufb %xmm6,%xmm1 (rotate dwords by 16).
490.byte 102,15,56,0,222
491.byte 102,15,56,0,206
492 paddd %xmm3,%xmm2
493 paddd %xmm1,%xmm0
494 pxor %xmm2,%xmm9
495 pxor %xmm0,%xmm11
# Rotate row 1 of both blocks left by 12.
496 movdqa %xmm9,%xmm4
497 psrld $20,%xmm9
498 movdqa %xmm11,%xmm5
499 pslld $12,%xmm4
500 psrld $20,%xmm11
501 por %xmm4,%xmm9
502 pslld $12,%xmm5
503 por %xmm5,%xmm11
504 paddd %xmm9,%xmm8
505 pxor %xmm8,%xmm3
506 paddd %xmm11,%xmm10
507 pxor %xmm10,%xmm1
# pshufb %xmm7,%xmm3 and pshufb %xmm7,%xmm1 (rotate dwords left by 8).
508.byte 102,15,56,0,223
509.byte 102,15,56,0,207
510 paddd %xmm3,%xmm2
511 paddd %xmm1,%xmm0
512 pxor %xmm2,%xmm9
513 pxor %xmm0,%xmm11
# Rotate row 1 of both blocks left by 7.
514 movdqa %xmm9,%xmm4
515 psrld $25,%xmm9
516 movdqa %xmm11,%xmm5
517 pslld $7,%xmm4
518 psrld $25,%xmm11
519 por %xmm4,%xmm9
520 pslld $7,%xmm5
521 por %xmm5,%xmm11
# Rotate rows 1..3 of both blocks so the diagonals become columns.
522 pshufd $78,%xmm2,%xmm2
523 pshufd $57,%xmm9,%xmm9
524 pshufd $147,%xmm3,%xmm3
525 pshufd $78,%xmm0,%xmm0
526 pshufd $57,%xmm11,%xmm11
527 pshufd $147,%xmm1,%xmm1
# Diagonal quarter-rounds (same sequence as above).
528 paddd %xmm9,%xmm8
529 pxor %xmm8,%xmm3
530 paddd %xmm11,%xmm10
531 pxor %xmm10,%xmm1
532.byte 102,15,56,0,222
533.byte 102,15,56,0,206
534 paddd %xmm3,%xmm2
535 paddd %xmm1,%xmm0
536 pxor %xmm2,%xmm9
537 pxor %xmm0,%xmm11
538 movdqa %xmm9,%xmm4
539 psrld $20,%xmm9
540 movdqa %xmm11,%xmm5
541 pslld $12,%xmm4
542 psrld $20,%xmm11
543 por %xmm4,%xmm9
544 pslld $12,%xmm5
545 por %xmm5,%xmm11
546 paddd %xmm9,%xmm8
547 pxor %xmm8,%xmm3
548 paddd %xmm11,%xmm10
549 pxor %xmm10,%xmm1
550.byte 102,15,56,0,223
551.byte 102,15,56,0,207
552 paddd %xmm3,%xmm2
553 paddd %xmm1,%xmm0
554 pxor %xmm2,%xmm9
555 pxor %xmm0,%xmm11
556 movdqa %xmm9,%xmm4
557 psrld $25,%xmm9
558 movdqa %xmm11,%xmm5
559 pslld $7,%xmm4
560 psrld $25,%xmm11
561 por %xmm4,%xmm9
562 pslld $7,%xmm5
563 por %xmm5,%xmm11
# Rotate the rows back into column order.
564 pshufd $78,%xmm2,%xmm2
565 pshufd $147,%xmm9,%xmm9
566 pshufd $57,%xmm3,%xmm3
567 pshufd $78,%xmm0,%xmm0
568 pshufd $147,%xmm11,%xmm11
569 pshufd $57,%xmm1,%xmm1
570 decq %r8
571 jnz L$oop_128
# Feed-forward. Block A adds the saved state directly. Block B adds the same
# rows, except that its counter row receives an extra {1,0,0,0} to account
# for the counter+1 it started from.
572 paddd 0(%rsp),%xmm8
573 paddd 16(%rsp),%xmm9
574 paddd 32(%rsp),%xmm2
575 paddd 48(%rsp),%xmm3
576 paddd L$one(%rip),%xmm1
577 paddd 0(%rsp),%xmm10
578 paddd 16(%rsp),%xmm11
579 paddd 32(%rsp),%xmm0
580 paddd 48(%rsp),%xmm1
581
# XOR the 128 source bytes with the two keystream blocks.
582 movdqu 0(%rsi),%xmm4
583 movdqu 16(%rsi),%xmm5
584 pxor %xmm4,%xmm8
585 movdqu 32(%rsi),%xmm4
586 pxor %xmm5,%xmm9
587 movdqu 48(%rsi),%xmm5
588 pxor %xmm4,%xmm2
589 movdqu 64(%rsi),%xmm4
590 pxor %xmm5,%xmm3
591 movdqu 80(%rsi),%xmm5
592 pxor %xmm4,%xmm10
593 movdqu 96(%rsi),%xmm4
594 pxor %xmm5,%xmm11
595 movdqu 112(%rsi),%xmm5
596 pxor %xmm4,%xmm0
597 pxor %xmm5,%xmm1
598
# Store the 128 result bytes, release the frame and return
# (F3 C3 = rep-prefixed ret).
599 movdqu %xmm8,0(%rdi)
600 movdqu %xmm9,16(%rdi)
601 movdqu %xmm2,32(%rdi)
602 movdqu %xmm3,48(%rdi)
603 movdqu %xmm10,64(%rdi)
604 movdqu %xmm11,80(%rdi)
605 movdqu %xmm0,96(%rdi)
606 movdqu %xmm1,112(%rdi)
607 leaq (%r9),%rsp
608
609L$128_epilogue:
610 .byte 0xf3,0xc3
611
612
613
# ChaCha20_4x -- four blocks per pass. Every one of the 16 state words is
# held as a vector of four dwords, one lane per block, so each paddd/pxor
# advances four independent blocks. Reached from L$ChaCha20_ssse3 when rdx
# is above 128; rdi = destination, rsi = source, rdx = byte count,
# rcx = key pointer, r8 = counter row pointer, and r10 still holds the
# capability qword loaded at the _ChaCha20_ctr32 entry.
#
# Stack frame (328 bytes, r9 keeps the entry rsp):
#     0..63(%rsp)   scratch: spilled row-2 vectors during the rounds, then
#                   transposed keystream rows for the XOR and tail code
#    64..127(%rsp)  sigma words 0..3, each broadcast to four lanes
#   128..191        key words 0..3, broadcast   (addressed as N-256(%rcx))
#   192..255        key words 4..7, broadcast
#   256..271        block counters of the four lanes
#   272..319        remaining three dwords of the counter row, broadcast
#
# Vector registers during the rounds:
#   xmm8..xmm11  = words 0..3      xmm12..xmm15 = words 4..7
#   xmm4, xmm5   = two of words 8..11 (the other two live on the stack)
#   xmm0..xmm3   = words 12..15    xmm6, xmm7 = rotation masks / temporaries
614.p2align 5
615ChaCha20_4x:
616
617L$ChaCha20_4x:
618 movq %rsp,%r9
619
# Above 192 bytes always use the four-lane code. For 129..192 bytes, look at
# bits 22 and 26 (mask 71303168 = 0x4400000) of the capability dword: when
# bit 22 is set and bit 26 is clear, fall back to the one-block SSSE3 loop.
# NOTE(review): presumably this singles out a CPU family on which the 4x code
# does not pay off for short inputs -- confirm against the generator script.
620 movq %r10,%r11
621 cmpq $192,%rdx
622 ja L$proceed4x
623
624 andq $71303168,%r11
625 cmpq $4194304,%r11
626 je L$do_sse3_after_all
627
628L$proceed4x:
629 subq $0x140+8,%rsp
630 movdqa L$sigma(%rip),%xmm11
631 movdqu (%rcx),%xmm15
632 movdqu 16(%rcx),%xmm7
633 movdqu (%r8),%xmm3
# The key pointer has been consumed; rcx now addresses the upper part of the
# frame, and r10/r11 point at the two pshufb masks.
634 leaq 256(%rsp),%rcx
635 leaq L$rot16(%rip),%r10
636 leaq L$rot24(%rip),%r11
637
# Broadcast each sigma dword across a full vector and save it.
638 pshufd $0x00,%xmm11,%xmm8
639 pshufd $0x55,%xmm11,%xmm9
640 movdqa %xmm8,64(%rsp)
641 pshufd $0xaa,%xmm11,%xmm10
642 movdqa %xmm9,80(%rsp)
643 pshufd $0xff,%xmm11,%xmm11
644 movdqa %xmm10,96(%rsp)
645 movdqa %xmm11,112(%rsp)
646
# Same for key words 0..3.
647 pshufd $0x00,%xmm15,%xmm12
648 pshufd $0x55,%xmm15,%xmm13
649 movdqa %xmm12,128-256(%rcx)
650 pshufd $0xaa,%xmm15,%xmm14
651 movdqa %xmm13,144-256(%rcx)
652 pshufd $0xff,%xmm15,%xmm15
653 movdqa %xmm14,160-256(%rcx)
654 movdqa %xmm15,176-256(%rcx)
655
# Same for key words 4..7.
656 pshufd $0x00,%xmm7,%xmm4
657 pshufd $0x55,%xmm7,%xmm5
658 movdqa %xmm4,192-256(%rcx)
659 pshufd $0xaa,%xmm7,%xmm6
660 movdqa %xmm5,208-256(%rcx)
661 pshufd $0xff,%xmm7,%xmm7
662 movdqa %xmm6,224-256(%rcx)
663 movdqa %xmm7,240-256(%rcx)
664
# Counter row: the broadcast counter gets {0,1,2,3} added so the four lanes
# use consecutive block counters (it is saved at L$oop_enter4x); the other
# three dwords are broadcast and saved as they are.
665 pshufd $0x00,%xmm3,%xmm0
666 pshufd $0x55,%xmm3,%xmm1
667 paddd L$inc(%rip),%xmm0
668 pshufd $0xaa,%xmm3,%xmm2
669 movdqa %xmm1,272-256(%rcx)
670 pshufd $0xff,%xmm3,%xmm3
671 movdqa %xmm2,288-256(%rcx)
672 movdqa %xmm3,304-256(%rcx)
673
674 jmp L$oop_enter4x
675
# Start of every pass after the first: reload the broadcast input state and
# advance the four lane counters by 4.
676.p2align 5
677L$oop_outer4x:
678 movdqa 64(%rsp),%xmm8
679 movdqa 80(%rsp),%xmm9
680 movdqa 96(%rsp),%xmm10
681 movdqa 112(%rsp),%xmm11
682 movdqa 128-256(%rcx),%xmm12
683 movdqa 144-256(%rcx),%xmm13
684 movdqa 160-256(%rcx),%xmm14
685 movdqa 176-256(%rcx),%xmm15
686 movdqa 192-256(%rcx),%xmm4
687 movdqa 208-256(%rcx),%xmm5
688 movdqa 224-256(%rcx),%xmm6
689 movdqa 240-256(%rcx),%xmm7
690 movdqa 256-256(%rcx),%xmm0
691 movdqa 272-256(%rcx),%xmm1
692 movdqa 288-256(%rcx),%xmm2
693 movdqa 304-256(%rcx),%xmm3
694 paddd L$four(%rip),%xmm0
695
# Words 10 and 11 go to the stack to free xmm6/xmm7; xmm7 is preloaded with
# the 16-bit rotation mask. The lane counters of this pass are saved for the
# feed-forward. eax counts the ten round passes.
696L$oop_enter4x:
697 movdqa %xmm6,32(%rsp)
698 movdqa %xmm7,48(%rsp)
699 movdqa (%r10),%xmm7
700 movl $10,%eax
701 movdqa %xmm0,256-256(%rcx)
702 jmp L$oop4x
703
# One pass = four column quarter-rounds followed by four diagonal ones, two
# interleaved at a time. xmm6 and xmm7 alternate between shift temporary and
# rotation mask, which is why the masks are reloaded from (%r10) (16-bit)
# and (%r11) (8-bit) throughout.
704.p2align 5
705L$oop4x:
# Columns (0,4,8,12) and (1,5,9,13).
706 paddd %xmm12,%xmm8
707 paddd %xmm13,%xmm9
708 pxor %xmm8,%xmm0
709 pxor %xmm9,%xmm1
# pshufb %xmm7,%xmm0 and pshufb %xmm7,%xmm1.
710.byte 102,15,56,0,199
711.byte 102,15,56,0,207
712 paddd %xmm0,%xmm4
713 paddd %xmm1,%xmm5
714 pxor %xmm4,%xmm12
715 pxor %xmm5,%xmm13
716 movdqa %xmm12,%xmm6
717 pslld $12,%xmm12
718 psrld $20,%xmm6
719 movdqa %xmm13,%xmm7
720 pslld $12,%xmm13
721 por %xmm6,%xmm12
722 psrld $20,%xmm7
723 movdqa (%r11),%xmm6
724 por %xmm7,%xmm13
725 paddd %xmm12,%xmm8
726 paddd %xmm13,%xmm9
727 pxor %xmm8,%xmm0
728 pxor %xmm9,%xmm1
# pshufb %xmm6,%xmm0 and pshufb %xmm6,%xmm1.
729.byte 102,15,56,0,198
730.byte 102,15,56,0,206
731 paddd %xmm0,%xmm4
732 paddd %xmm1,%xmm5
733 pxor %xmm4,%xmm12
734 pxor %xmm5,%xmm13
735 movdqa %xmm12,%xmm7
736 pslld $7,%xmm12
737 psrld $25,%xmm7
738 movdqa %xmm13,%xmm6
739 pslld $7,%xmm13
740 por %xmm7,%xmm12
741 psrld $25,%xmm6
742 movdqa (%r10),%xmm7
743 por %xmm6,%xmm13
# Swap the row-2 pair: spill words 8/9, bring in words 10/11.
744 movdqa %xmm4,0(%rsp)
745 movdqa %xmm5,16(%rsp)
746 movdqa 32(%rsp),%xmm4
747 movdqa 48(%rsp),%xmm5
# Columns (2,6,10,14) and (3,7,11,15).
748 paddd %xmm14,%xmm10
749 paddd %xmm15,%xmm11
750 pxor %xmm10,%xmm2
751 pxor %xmm11,%xmm3
# pshufb %xmm7,%xmm2 and pshufb %xmm7,%xmm3.
752.byte 102,15,56,0,215
753.byte 102,15,56,0,223
754 paddd %xmm2,%xmm4
755 paddd %xmm3,%xmm5
756 pxor %xmm4,%xmm14
757 pxor %xmm5,%xmm15
758 movdqa %xmm14,%xmm6
759 pslld $12,%xmm14
760 psrld $20,%xmm6
761 movdqa %xmm15,%xmm7
762 pslld $12,%xmm15
763 por %xmm6,%xmm14
764 psrld $20,%xmm7
765 movdqa (%r11),%xmm6
766 por %xmm7,%xmm15
767 paddd %xmm14,%xmm10
768 paddd %xmm15,%xmm11
769 pxor %xmm10,%xmm2
770 pxor %xmm11,%xmm3
# pshufb %xmm6,%xmm2 and pshufb %xmm6,%xmm3.
771.byte 102,15,56,0,214
772.byte 102,15,56,0,222
773 paddd %xmm2,%xmm4
774 paddd %xmm3,%xmm5
775 pxor %xmm4,%xmm14
776 pxor %xmm5,%xmm15
777 movdqa %xmm14,%xmm7
778 pslld $7,%xmm14
779 psrld $25,%xmm7
780 movdqa %xmm15,%xmm6
781 pslld $7,%xmm15
782 por %xmm7,%xmm14
783 psrld $25,%xmm6
784 movdqa (%r10),%xmm7
785 por %xmm6,%xmm15
# Diagonals (0,5,10,15) and (1,6,11,12); xmm4/xmm5 still hold words 10/11.
786 paddd %xmm13,%xmm8
787 paddd %xmm14,%xmm9
788 pxor %xmm8,%xmm3
789 pxor %xmm9,%xmm0
# pshufb %xmm7,%xmm3 and pshufb %xmm7,%xmm0.
790.byte 102,15,56,0,223
791.byte 102,15,56,0,199
792 paddd %xmm3,%xmm4
793 paddd %xmm0,%xmm5
794 pxor %xmm4,%xmm13
795 pxor %xmm5,%xmm14
796 movdqa %xmm13,%xmm6
797 pslld $12,%xmm13
798 psrld $20,%xmm6
799 movdqa %xmm14,%xmm7
800 pslld $12,%xmm14
801 por %xmm6,%xmm13
802 psrld $20,%xmm7
803 movdqa (%r11),%xmm6
804 por %xmm7,%xmm14
805 paddd %xmm13,%xmm8
806 paddd %xmm14,%xmm9
807 pxor %xmm8,%xmm3
808 pxor %xmm9,%xmm0
# pshufb %xmm6,%xmm3 and pshufb %xmm6,%xmm0.
809.byte 102,15,56,0,222
810.byte 102,15,56,0,198
811 paddd %xmm3,%xmm4
812 paddd %xmm0,%xmm5
813 pxor %xmm4,%xmm13
814 pxor %xmm5,%xmm14
815 movdqa %xmm13,%xmm7
816 pslld $7,%xmm13
817 psrld $25,%xmm7
818 movdqa %xmm14,%xmm6
819 pslld $7,%xmm14
820 por %xmm7,%xmm13
821 psrld $25,%xmm6
822 movdqa (%r10),%xmm7
823 por %xmm6,%xmm14
# Swap the row-2 pair back: spill words 10/11, reload words 8/9.
824 movdqa %xmm4,32(%rsp)
825 movdqa %xmm5,48(%rsp)
826 movdqa 0(%rsp),%xmm4
827 movdqa 16(%rsp),%xmm5
# Diagonals (2,7,8,13) and (3,4,9,14).
828 paddd %xmm15,%xmm10
829 paddd %xmm12,%xmm11
830 pxor %xmm10,%xmm1
831 pxor %xmm11,%xmm2
# pshufb %xmm7,%xmm1 and pshufb %xmm7,%xmm2.
832.byte 102,15,56,0,207
833.byte 102,15,56,0,215
834 paddd %xmm1,%xmm4
835 paddd %xmm2,%xmm5
836 pxor %xmm4,%xmm15
837 pxor %xmm5,%xmm12
838 movdqa %xmm15,%xmm6
839 pslld $12,%xmm15
840 psrld $20,%xmm6
841 movdqa %xmm12,%xmm7
842 pslld $12,%xmm12
843 por %xmm6,%xmm15
844 psrld $20,%xmm7
845 movdqa (%r11),%xmm6
846 por %xmm7,%xmm12
847 paddd %xmm15,%xmm10
848 paddd %xmm12,%xmm11
849 pxor %xmm10,%xmm1
850 pxor %xmm11,%xmm2
# pshufb %xmm6,%xmm1 and pshufb %xmm6,%xmm2.
851.byte 102,15,56,0,206
852.byte 102,15,56,0,214
853 paddd %xmm1,%xmm4
854 paddd %xmm2,%xmm5
855 pxor %xmm4,%xmm15
856 pxor %xmm5,%xmm12
857 movdqa %xmm15,%xmm7
858 pslld $7,%xmm15
859 psrld $25,%xmm7
860 movdqa %xmm12,%xmm6
861 pslld $7,%xmm12
862 por %xmm7,%xmm15
863 psrld $25,%xmm6
864 movdqa (%r10),%xmm7
865 por %xmm6,%xmm12
866 decl %eax
867 jnz L$oop4x
868
# Feed-forward and transpose, four words at a time. Adding the saved input
# vectors gives keystream words lane by lane; the punpck sequence then turns
# "one word of four blocks" vectors into "four consecutive words of one
# block" vectors.
869 paddd 64(%rsp),%xmm8
870 paddd 80(%rsp),%xmm9
871 paddd 96(%rsp),%xmm10
872 paddd 112(%rsp),%xmm11
873
# Words 0..3 -> block rows in xmm8 (block 0), xmm9 (block 1), xmm6 (block 2),
# xmm11 (block 3).
874 movdqa %xmm8,%xmm6
875 punpckldq %xmm9,%xmm8
876 movdqa %xmm10,%xmm7
877 punpckldq %xmm11,%xmm10
878 punpckhdq %xmm9,%xmm6
879 punpckhdq %xmm11,%xmm7
880 movdqa %xmm8,%xmm9
881 punpcklqdq %xmm10,%xmm8
882 movdqa %xmm6,%xmm11
883 punpcklqdq %xmm7,%xmm6
884 punpckhqdq %xmm10,%xmm9
885 punpckhqdq %xmm7,%xmm11
886 paddd 128-256(%rcx),%xmm12
887 paddd 144-256(%rcx),%xmm13
888 paddd 160-256(%rcx),%xmm14
889 paddd 176-256(%rcx),%xmm15
890
# Park row 0 of blocks 0 and 1 on the stack and fetch the spilled words
# 10/11 into the registers just freed.
891 movdqa %xmm8,0(%rsp)
892 movdqa %xmm9,16(%rsp)
893 movdqa 32(%rsp),%xmm8
894 movdqa 48(%rsp),%xmm9
895
# Words 4..7 -> block rows in xmm12, xmm13, xmm10, xmm15 (blocks 0..3).
896 movdqa %xmm12,%xmm10
897 punpckldq %xmm13,%xmm12
898 movdqa %xmm14,%xmm7
899 punpckldq %xmm15,%xmm14
900 punpckhdq %xmm13,%xmm10
901 punpckhdq %xmm15,%xmm7
902 movdqa %xmm12,%xmm13
903 punpcklqdq %xmm14,%xmm12
904 movdqa %xmm10,%xmm15
905 punpcklqdq %xmm7,%xmm10
906 punpckhqdq %xmm14,%xmm13
907 punpckhqdq %xmm7,%xmm15
908 paddd 192-256(%rcx),%xmm4
909 paddd 208-256(%rcx),%xmm5
910 paddd 224-256(%rcx),%xmm8
911 paddd 240-256(%rcx),%xmm9
912
# Park row 0 of blocks 2 and 3 as well; 0..63(%rsp) now holds row 0 of
# blocks 0..3 in order.
913 movdqa %xmm6,32(%rsp)
914 movdqa %xmm11,48(%rsp)
915
# Words 8..11 -> block rows in xmm4, xmm5, xmm14, xmm9 (blocks 0..3).
916 movdqa %xmm4,%xmm14
917 punpckldq %xmm5,%xmm4
918 movdqa %xmm8,%xmm7
919 punpckldq %xmm9,%xmm8
920 punpckhdq %xmm5,%xmm14
921 punpckhdq %xmm9,%xmm7
922 movdqa %xmm4,%xmm5
923 punpcklqdq %xmm8,%xmm4
924 movdqa %xmm14,%xmm9
925 punpcklqdq %xmm7,%xmm14
926 punpckhqdq %xmm8,%xmm5
927 punpckhqdq %xmm7,%xmm9
928 paddd 256-256(%rcx),%xmm0
929 paddd 272-256(%rcx),%xmm1
930 paddd 288-256(%rcx),%xmm2
931 paddd 304-256(%rcx),%xmm3
932
# Words 12..15 -> block rows in xmm0, xmm1, xmm8, xmm3 (blocks 0..3).
933 movdqa %xmm0,%xmm8
934 punpckldq %xmm1,%xmm0
935 movdqa %xmm2,%xmm7
936 punpckldq %xmm3,%xmm2
937 punpckhdq %xmm1,%xmm8
938 punpckhdq %xmm3,%xmm7
939 movdqa %xmm0,%xmm1
940 punpcklqdq %xmm2,%xmm0
941 movdqa %xmm8,%xmm3
942 punpcklqdq %xmm7,%xmm8
943 punpckhqdq %xmm2,%xmm1
944 punpckhqdq %xmm7,%xmm3
# Fewer than 256 bytes left: partial handling in L$tail4x.
945 cmpq $256,%rdx
946 jb L$tail4x
947
# Full pass: XOR 256 source bytes with the four keystream blocks. Loads of
# the next group are interleaved with stores of the previous one.
# Block 0 keystream = 0(%rsp), xmm12, xmm4, xmm0.
948 movdqu 0(%rsi),%xmm6
949 movdqu 16(%rsi),%xmm11
950 movdqu 32(%rsi),%xmm2
951 movdqu 48(%rsi),%xmm7
952 pxor 0(%rsp),%xmm6
953 pxor %xmm12,%xmm11
954 pxor %xmm4,%xmm2
955 pxor %xmm0,%xmm7
956
# Block 1 keystream = 16(%rsp), xmm13, xmm5, xmm1.
957 movdqu %xmm6,0(%rdi)
958 movdqu 64(%rsi),%xmm6
959 movdqu %xmm11,16(%rdi)
960 movdqu 80(%rsi),%xmm11
961 movdqu %xmm2,32(%rdi)
962 movdqu 96(%rsi),%xmm2
963 movdqu %xmm7,48(%rdi)
964 movdqu 112(%rsi),%xmm7
965 leaq 128(%rsi),%rsi
966 pxor 16(%rsp),%xmm6
967 pxor %xmm13,%xmm11
968 pxor %xmm5,%xmm2
969 pxor %xmm1,%xmm7
970
# Block 2 keystream = 32(%rsp), xmm10, xmm14, xmm8.
971 movdqu %xmm6,64(%rdi)
972 movdqu 0(%rsi),%xmm6
973 movdqu %xmm11,80(%rdi)
974 movdqu 16(%rsi),%xmm11
975 movdqu %xmm2,96(%rdi)
976 movdqu 32(%rsi),%xmm2
977 movdqu %xmm7,112(%rdi)
978 leaq 128(%rdi),%rdi
979 movdqu 48(%rsi),%xmm7
980 pxor 32(%rsp),%xmm6
981 pxor %xmm10,%xmm11
982 pxor %xmm14,%xmm2
983 pxor %xmm8,%xmm7
984
# Block 3 keystream = 48(%rsp), xmm15, xmm9, xmm3.
985 movdqu %xmm6,0(%rdi)
986 movdqu 64(%rsi),%xmm6
987 movdqu %xmm11,16(%rdi)
988 movdqu 80(%rsi),%xmm11
989 movdqu %xmm2,32(%rdi)
990 movdqu 96(%rsi),%xmm2
991 movdqu %xmm7,48(%rdi)
992 movdqu 112(%rsi),%xmm7
993 leaq 128(%rsi),%rsi
994 pxor 48(%rsp),%xmm6
995 pxor %xmm15,%xmm11
996 pxor %xmm9,%xmm2
997 pxor %xmm3,%xmm7
998 movdqu %xmm6,64(%rdi)
999 movdqu %xmm11,80(%rdi)
1000 movdqu %xmm2,96(%rdi)
1001 movdqu %xmm7,112(%rdi)
1002 leaq 128(%rdi),%rdi
1003
1004 subq $256,%rdx
1005 jnz L$oop_outer4x
1006
1007 jmp L$done4x
1008
# Fewer than 256 bytes remain. Process as many whole 64-byte blocks as
# possible with vector XORs, then place the next unused keystream block at
# 0..63(%rsp) for the byte loop. Each branch target tests the flags of the
# cmpq that selected it, so the compares must stay directly paired with the
# je instructions further down (only SSE moves, pxor and lea sit in between,
# none of which write the flags).
1009L$tail4x:
1010 cmpq $192,%rdx
1011 jae L$192_or_more4x
1012 cmpq $128,%rdx
1013 jae L$128_or_more4x
1014 cmpq $64,%rdx
1015 jae L$64_or_more4x
1016
1017
# Under 64 bytes: complete block 0 on the stack (row 0 is already at
# 0(%rsp)); r10 becomes the byte index.
1018 xorq %r10,%r10
1019
1020 movdqa %xmm12,16(%rsp)
1021 movdqa %xmm4,32(%rsp)
1022 movdqa %xmm0,48(%rsp)
1023 jmp L$oop_tail4x
1024
# Between 64 and 127 bytes: one whole block via vectors.
1025.p2align 5
1026L$64_or_more4x:
1027 movdqu 0(%rsi),%xmm6
1028 movdqu 16(%rsi),%xmm11
1029 movdqu 32(%rsi),%xmm2
1030 movdqu 48(%rsi),%xmm7
1031 pxor 0(%rsp),%xmm6
1032 pxor %xmm12,%xmm11
1033 pxor %xmm4,%xmm2
1034 pxor %xmm0,%xmm7
1035 movdqu %xmm6,0(%rdi)
1036 movdqu %xmm11,16(%rdi)
1037 movdqu %xmm2,32(%rdi)
1038 movdqu %xmm7,48(%rdi)
# Equal flag is from cmpq $64,%rdx: exactly 64 bytes means finished.
1039 je L$done4x
1040
# Otherwise stage block 1 on the stack and fall into the byte loop.
1041 movdqa 16(%rsp),%xmm6
1042 leaq 64(%rsi),%rsi
1043 xorq %r10,%r10
1044 movdqa %xmm6,0(%rsp)
1045 movdqa %xmm13,16(%rsp)
1046 leaq 64(%rdi),%rdi
1047 movdqa %xmm5,32(%rsp)
1048 subq $64,%rdx
1049 movdqa %xmm1,48(%rsp)
1050 jmp L$oop_tail4x
1051
# Between 128 and 191 bytes: two whole blocks via vectors.
1052.p2align 5
1053L$128_or_more4x:
1054 movdqu 0(%rsi),%xmm6
1055 movdqu 16(%rsi),%xmm11
1056 movdqu 32(%rsi),%xmm2
1057 movdqu 48(%rsi),%xmm7
1058 pxor 0(%rsp),%xmm6
1059 pxor %xmm12,%xmm11
1060 pxor %xmm4,%xmm2
1061 pxor %xmm0,%xmm7
1062
1063 movdqu %xmm6,0(%rdi)
1064 movdqu 64(%rsi),%xmm6
1065 movdqu %xmm11,16(%rdi)
1066 movdqu 80(%rsi),%xmm11
1067 movdqu %xmm2,32(%rdi)
1068 movdqu 96(%rsi),%xmm2
1069 movdqu %xmm7,48(%rdi)
1070 movdqu 112(%rsi),%xmm7
1071 pxor 16(%rsp),%xmm6
1072 pxor %xmm13,%xmm11
1073 pxor %xmm5,%xmm2
1074 pxor %xmm1,%xmm7
1075 movdqu %xmm6,64(%rdi)
1076 movdqu %xmm11,80(%rdi)
1077 movdqu %xmm2,96(%rdi)
1078 movdqu %xmm7,112(%rdi)
# Equal flag is from cmpq $128,%rdx: exactly 128 bytes means finished.
1079 je L$done4x
1080
# Otherwise stage block 2 on the stack for the byte loop.
1081 movdqa 32(%rsp),%xmm6
1082 leaq 128(%rsi),%rsi
1083 xorq %r10,%r10
1084 movdqa %xmm6,0(%rsp)
1085 movdqa %xmm10,16(%rsp)
1086 leaq 128(%rdi),%rdi
1087 movdqa %xmm14,32(%rsp)
1088 subq $128,%rdx
1089 movdqa %xmm8,48(%rsp)
1090 jmp L$oop_tail4x
1091
# Between 192 and 255 bytes: three whole blocks via vectors.
1092.p2align 5
1093L$192_or_more4x:
1094 movdqu 0(%rsi),%xmm6
1095 movdqu 16(%rsi),%xmm11
1096 movdqu 32(%rsi),%xmm2
1097 movdqu 48(%rsi),%xmm7
1098 pxor 0(%rsp),%xmm6
1099 pxor %xmm12,%xmm11
1100 pxor %xmm4,%xmm2
1101 pxor %xmm0,%xmm7
1102
1103 movdqu %xmm6,0(%rdi)
1104 movdqu 64(%rsi),%xmm6
1105 movdqu %xmm11,16(%rdi)
1106 movdqu 80(%rsi),%xmm11
1107 movdqu %xmm2,32(%rdi)
1108 movdqu 96(%rsi),%xmm2
1109 movdqu %xmm7,48(%rdi)
1110 movdqu 112(%rsi),%xmm7
1111 leaq 128(%rsi),%rsi
1112 pxor 16(%rsp),%xmm6
1113 pxor %xmm13,%xmm11
1114 pxor %xmm5,%xmm2
1115 pxor %xmm1,%xmm7
1116
1117 movdqu %xmm6,64(%rdi)
1118 movdqu 0(%rsi),%xmm6
1119 movdqu %xmm11,80(%rdi)
1120 movdqu 16(%rsi),%xmm11
1121 movdqu %xmm2,96(%rdi)
1122 movdqu 32(%rsi),%xmm2
1123 movdqu %xmm7,112(%rdi)
1124 leaq 128(%rdi),%rdi
1125 movdqu 48(%rsi),%xmm7
1126 pxor 32(%rsp),%xmm6
1127 pxor %xmm10,%xmm11
1128 pxor %xmm14,%xmm2
1129 pxor %xmm8,%xmm7
1130 movdqu %xmm6,0(%rdi)
1131 movdqu %xmm11,16(%rdi)
1132 movdqu %xmm2,32(%rdi)
1133 movdqu %xmm7,48(%rdi)
# Equal flag is from cmpq $192,%rdx: exactly 192 bytes means finished.
1134 je L$done4x
1135
# Otherwise stage block 3 on the stack. rsi/rdi were already advanced by 128
# above, so 64 more brings them past the third block.
1136 movdqa 48(%rsp),%xmm6
1137 leaq 64(%rsi),%rsi
1138 xorq %r10,%r10
1139 movdqa %xmm6,0(%rsp)
1140 movdqa %xmm15,16(%rsp)
1141 leaq 64(%rdi),%rdi
1142 movdqa %xmm9,32(%rsp)
1143 subq $192,%rdx
1144 movdqa %xmm3,48(%rsp)
1145
# Byte loop over the remaining rdx bytes (1..63) using the keystream block
# staged at 0..63(%rsp). r10 is stepped before the store, hence the -1
# displacement on the destination.
1146L$oop_tail4x:
1147 movzbl (%rsi,%r10,1),%eax
1148 movzbl (%rsp,%r10,1),%ecx
1149 leaq 1(%r10),%r10
1150 xorl %ecx,%eax
1151 movb %al,-1(%rdi,%r10,1)
1152 decq %rdx
1153 jnz L$oop_tail4x
1154
# Release the frame by restoring the entry rsp kept in r9, then return
# (F3 C3 = rep-prefixed ret).
1155L$done4x:
1156 leaq (%r9),%rsp
1157
1158L$4x_epilogue:
1159 .byte 0xf3,0xc3
1160
1161
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette