VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 95308

Last change: r95308, checked in by vboxsync, 2 years ago

VMM/IEM: Implemented ANDN, BEXTR, SHLX, SARX, SHRX, RORX, TZCNT, and LZCNT. Fixed long-mode bug in 32-bit version of BSR and BSF (would clear the upper 32 bits of the destination register when ZF=1). bugref:9898

; $Id: IEMAllAImpl.asm 95308 2022-06-19 20:40:26Z vboxsync $
;; @file
; IEM - Instruction Implementation in Assembly.
;

;
; Copyright (C) 2011-2022 Oracle Corporation
;
; This file is part of VirtualBox Open Source Edition (OSE), as
; available from http://www.virtualbox.org. This file is free software;
; you can redistribute it and/or modify it under the terms of the GNU
; General Public License (GPL) as published by the Free Software
; Foundation, in version 2 as it comes in the "COPYING" file of the
; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
;


;*********************************************************************************************************************************
;*  Header Files                                                                                                                 *
;*********************************************************************************************************************************
%include "VBox/asmdefs.mac"
%include "VBox/err.mac"
%include "iprt/x86.mac"


;*********************************************************************************************************************************
;*  Defined Constants And Macros                                                                                                 *
;*********************************************************************************************************************************

;;
; RET XX / RET wrapper for fastcall.
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1
 %else
        ret
 %endif
%else
        ret
%endif
%endmacro
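;
; Illustrative expansion (comment only, a sketch): for a function taking 12
; bytes of arguments, 'RET_FASTCALL 12' yields 'ret 12' on 32-bit Windows
; (fastcall callee cleans the stack) and a plain 'ret' on all other targets.
;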

;;
; NAME for fastcall functions.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
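;
; Example (comment only): on 32-bit Windows, NAME_FASTCALL(iemAImpl_add_u8, 12, @)
; produces the fastcall-decorated symbol '@iemAImpl_add_u8@12'; on all other
; targets it degenerates to plain NAME(iemAImpl_add_u8).
;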

;;
; BEGINPROC for fastcall functions.
;
; @param 1        The function name (C).
; @param 2        The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
   export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro


;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0]
        mov     esi, [esp + 12 + 4 + 4]
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
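
;
; Quick-reference summary of the argument/temporary register mappings above:
;
;                 A0    A1    A2    A3      T0    T1    T2
;   AMD64/GCC     rdi   rsi   rdx   rcx     rax   r11   r10
;   AMD64/MSC     rcx   rdx   r8    r9      rax   r11   r10
;   x86           ecx   edx   ebx   esi     eax   edi   -
;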


;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1        The parameter (A0..A3) pointing to the eflags.
; @param 2        The set of modified flags.
; @param 3        The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
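;
; Net effect of IEM_MAYBE_LOAD_FLAGS, as a sketch:
;   EFLAGS = (EFLAGS & ~(%2 | %3)) | (*%1 & (%2 | %3))
; i.e. the host keeps its own flags except for the modified/undefined set,
; which is taken from the guest eflags.
;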

;;
; Update the flags.
;
; @remarks Clobbers T0, T1, stack.
; @param 1        The register pointing to the EFLAGS.
; @param 2        The mask of modified flags to save.
; @param 3        The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
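;
; Net effect of IEM_SAVE_FLAGS, as a sketch (the inverse of the load above):
;   *%1 = (*%1 & ~(%2 | %3)) | (EFLAGS & (%2 | %3))
;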

;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1        The register pointing to the EFLAGS.
; @param 2        The mask of modified flags to save.
; @param 3        Mask of additional flags to always clear.
; @param 4        Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro

;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where the result (%4 & %6) ends up in xAX,
; which is also T0.  So, we have to use T1 for the EFLAGS calculation and save
; T0/xAX while we extract the %2 flags from the CPU EFLAGS, or use T2 (AMD64 only).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1        The register pointing to the EFLAGS.
; @param 2        The mask of modified flags to save.
; @param 3        Mask of additional flags to always clear.
; @param 4        The result register to set SF by.
; @param 5        The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6        The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
 %else
        push    T0
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro

;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0.
; @param 1        The register pointing to the EFLAGS.
; @param 2        Mask of additional flags to always clear.
; @param 3        Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro

;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
; @param 1        The register pointing to the EFLAGS.
; @param 2        Mask of additional flags to always clear.
; @param 3        Mask of additional flags to always set.
; @param 4        The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro


;*********************************************************************************************************************************
;*  External Symbols                                                                                                             *
;*********************************************************************************************************************************
extern NAME(g_afParity)


;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1        The instruction mnemonic.
; @param 2        Non-zero if there should be a locked version.
; @param 3        The modified flags.
; @param 4        The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
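
;
; Each instantiation below generates workers with a C prototype on this
; (assumed) form, shown here for 'add':
;   IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *pu8Dst, uint8_t u8Src, uint32_t *pEFlags));
; plus the _u16/_u32/_u64 widths and, when requested, the _locked variants.
;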

; instr, lock, modified-flags, undefined-flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF


;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
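
;
; Illustrative expansion (comment only): with the GCC register mapping,
; IEMIMPL_VEX_BIN_OP andn generates iemAImpl_andn_u32 executing
; 'andn eax, esi, edx' (T0_32, A1_32, A2_32) and storing eax at [rdi] (A0),
; with EFLAGS merged via the IEM_*_FLAGS macros above.
;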


;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  No eflags are modified.
;
; @param 1        The instruction mnemonic.
; @param 2        The fallback instruction mnemonic, taking the count in cl.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
 %else
        xchg    A2, A0
        %2      A1, cl
        mov     [A2], A1                ; A2 holds the destination pointer after the xchg.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr
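;
; Note on the fallbacks (a sketch): the plain sar/shl/shr substitutes take the
; count in cl and clobber the host EFLAGS, but since nothing is saved back to
; the guest eflags this approximates the flag-preserving SARX/SHLX/SHRX
; semantics.
;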


;
; RORX uses an immediate byte for the shift count, so we only provide a
; fallback implementation for it.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
  %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1                ; A2 holds the destination pointer after the xchg.
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64


;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1        The instruction mnemonic.
; @param 2        Non-zero if there should be a locked version.
; @param 3        The modified flags.
; @param 4        The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)

;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems
; that both AMD and Intel just leave it as is.  The undefined EFLAGS differ
; between AMD and Intel, and according to https://www.sandpile.org/x86/flags.htm
; also between Intel microarchitectures.  We only implement the 'intel' and
; 'amd' variations with the behaviour of more recent CPUs (Intel 10980X and
; AMD 3990X).
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
; @param 4        Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
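
;
; Note: bsf/bsr pass 1 as the last argument (destination left untouched when
; ZF=1, matching the CPU behaviour described above), while tzcnt/lzcnt pass 0
; and always write the destination (the operand size when the source is zero).
;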


;
; IMUL is a similar yet different case (no lock, no memory destination).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param 1        EFLAGS that are modified.
; @param 2        Undefined EFLAGS.
; @param 3        Function suffix.
; @param 4        EFLAGS variation: 0 for native, 1 for intel (ignored),
;                 2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2


;
; XCHG for memory operands.  This implies locking.  No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register.  They all return void.
;
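; Assumed C prototype on the form:
;   IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *pu8Mem, uint8_t *pu8Reg));
;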
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        xchg    [A0], T0_8
        mov     [A1], T0_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif

; Unlocked variants for fDisregardLock mode.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        mov     T1_8, [A0]
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif


;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags.  They all return void.
;
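; Assumed C prototype on the form:
;   IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u8,(uint8_t *pu8Dst, uint8_t *pu8Reg, uint32_t *pEFlags));
;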
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        xadd    [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64


;
; CMPXCHG8B.
;
; These are tricky register-wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b

BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked

%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register-wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64


;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz      .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked

;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
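
; Assumed C prototype on the form (shown for 'inc'):
;   IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *pu8Dst, uint32_t *pEFlags));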


;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output). They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db      66h
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
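; Note: the 'db 66h' above forces an operand-size prefixed BSWAP, whose result
; is architecturally undefined; executing the very same encoding here simply
; reproduces whatever the host CPU does for a guest 16-bit BSWAP.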

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        mov     T1, [A0 + 4]
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0
        mov     [A0], T1
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64


;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
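
;
; Note on the xchg trick above (a sketch): the shift count must end up in cl.
; With the MSC and 32-bit x86 conventions A0 maps to rcx/ecx, so 'xchg A1, A0'
; moves the count into cl while the destination pointer moves over to A1.
;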

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)


;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The functions take the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_16, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
  %else
        xchg    A0, A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)


;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can use the same code path for
; div/idiv as well as for the mul/imul implementation.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
; @param 4        Name suffix.
; @param 5        EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
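
;
; Illustrative expansion (comment only): for 'mul' the _u8 worker loads al
; from [A0], executes 'mul A1_8' (ax = al * A1_8) and stores the full 16-bit
; product back at [A0]; rAX/rDX being fixed mul/div operands is also why T0
; (xAX) cannot serve as a temporary in these workers.
;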


BEGINCODE
;;
; Worker function for negating the 64-bit value held in T1:T0 (two 32-bit registers).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0
        push    0
        xchg    T0_32, [xSP]
        xchg    T1_32, [xSP + xCB]
        sub     T0_32, [xSP]
        sbb     T1_32, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
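; In effect: T1:T0 = 0 - T1:T0, computed with sub/sbb through two stack slots
; since no spare register is available for the borrow chain.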

%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit value held in T1:T0 (two 64-bit registers).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0
        push    0
        xchg    T0, [xSP]
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif


;;
; Macro for implementing division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
; @param 4        1 if signed, 0 if unsigned.
; @param 5        Function suffix.
; @param 6        EFLAGS variation: 0 for native, 1 for intel (ignored),
;                 2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, but we
        ; haven't found a simple way to check signed division yet, unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; Divide by zero check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, but we have
        ; not found a simple way to check signed division for overflow yet.
 %if %4 == 0
        cmp     [A1], A2_16
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; Divide by zero check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, but we have
        ; not found a simple way to check signed division for overflow yet.
 %if %4 == 0
        cmp     [A1], A2_32
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we can modify it.
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS A3, %2, %3
  %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2
  %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div,  0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0, 0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd,   2
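
;
; Sketch of what the six invocations above emit: for each of 'div' and 'idiv'
; there is a native, an _intel and an _amd EFLAGS variant of every width,
; e.g. for the unsigned 16-bit case:
;
;       iemAImpl_div_u16        ; native host flag behaviour
;       iemAImpl_div_u16_intel  ; Intel-style undefined flags
;       iemAImpl_div_u16_amd    ; AMD-style: AF set; PF, ZF and SF cleared
;
; All of them return 0 on success or -1 so the caller can raise #DE.
;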


;;
; Macro for implementing a memory fence operation.
;
; No return value, no operands or anything.
;
; @param 1 The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
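
;
; Sketch of the expansion for the first invocation above:
;
;       BEGINPROC_FASTCALL iemAImpl_lfence, 0
;               lfence
;               ret
;       ENDPROC iemAImpl_lfence
;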

;;
; Alternative for non-SSE2 hosts: XCHG with a memory operand has an implicit
; LOCK prefix and therefore acts as a full memory barrier.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]
        add     xSP, xCB
        ret
ENDPROC iemAImpl_alt_mem_fence


;;
; Initialize the FPU for the actual instruction being emulated; this means
; loading parts of the guest's control word and status word.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro


;;
; Initialize the FPU for the actual instruction being emulated; this means
; loading parts of the guest's control word and status word, and updating the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
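
;
; Note on the 0c000h constant above: the FTW image stored by FNSTENV uses two
; tag bits per register, with tag value 11b meaning 'empty'.  With TOP=7, ST0
; lives in physical register 7, whose tag occupies bits 15:14 of the tag word,
; so ORing in 0c000h marks ST0 as empty.
;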


;;
; @todo Move this somewhere better?
;
struc IEMFPURESULT
        .r80Result      resw 5
        .FSW            resw 1
endstruc


;;
; @todo Move this somewhere better?
;
struc IEMFPURESULTTWO
        .r80Result1     resw 5
        .FSW            resw 1
        .r80Result2     resw 5
endstruc
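
;
; All the FPU wrappers below follow roughly the same pattern, sketched here:
;
;       fninit                          ; start from a clean FPU state
;       fld     tword [...]             ; load the operand(s) onto the stack
;       FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0  ; bring in the guest FCW/FSW bits
;       <instruction>                   ; the emulated instruction
;       fnstsw  word [A1 ...]           ; capture the resulting FSW
;       fnclex                          ; clear pending exceptions so that
;       fstp    tword [A1 ...]          ;  the result can be stored safely
;       fninit                          ; leave a clean state behind
;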


;
;---------------------- 16-bit signed integer operations ----------------------
;


;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16


;;
; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16


;;
; Store an 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16


;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
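
;
; Sketch of the expansion for the first invocation above:
;
;       BEGINPROC_FASTCALL iemAImpl_fiadd_r80_by_i16, 16
;               ...
;               fld     tword [A2]      ; ST0 = *pr80Val
;               FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
;               fiadd   word [A3]       ; ST0 += *pi16Val
;               ...
;
; (pr80Val and pi16Val are just illustrative names for the A2/A3 arguments.)
;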


;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom



;
;---------------------- 32-bit signed integer operations ----------------------
;


;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32


;;
; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32


;;
; Store an 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32


;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr


;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom



;
;---------------------- 64-bit signed integer operations ----------------------
;


;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64


;;
; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64


;;
; Store an 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64



;
;---------------------- 32-bit floating point operations ----------------------
;

;;
; Converts a 32-bit floating point value to an 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32


;;
; Store an 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32


;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr


;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom



;
;---------------------- 64-bit floating point operations ----------------------
;

;;
; Converts a 64-bit floating point value to an 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64


;;
; Store an 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64


;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr

;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom



;
;---------------------- 80-bit floating point operations ----------------------
;

;;
; Loads an 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80


;;
; Store an 80-bit floating point register to memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80


;;
; Loads an 80-bit packed BCD value from memory, converting it to an 80-bit
; floating point register value.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80


;;
; Store an 80-bit floating point register to memory as packed BCD.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80


;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The instruction's explicit operands, if any.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0).
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
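
;
; Note on the second macro argument above: fadd and friends take the explicit
; two-register form (e.g. 'fadd st0, st1'), while fprem, fprem1 and fscale
; have no explicit operands and implicitly operate on ST0 and ST1, so they
; are passed an empty operand list {}.
;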


;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1


;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom


;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1 The instruction
;
; @returns EFLAGS in EAX.
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word [A1]
        pushf
        pop     xAX

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
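
;
; Note: fcomi/fucomi report the comparison result directly in the host
; EFLAGS (setting ZF, PF and CF; clearing OF, SF and AF); the pushf/pop xAX
; pair above captures those flags so the caller can merge them into the
; guest EFLAGS.
;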


;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos


;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1 The instruction
; @param 2 Non-zero to also restore FTW.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.



;;
; FPU instruction loading an 80-bit floating point constant.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz


;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULTTWO for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos




;---------------------- SSE and MMX Operations ----------------------

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;;
; Media instruction working on two full sized registers.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first media register size operand (input/output).
; @param A2 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
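
;
; Sketch of the 128-bit expansion for the first invocation above:
;
;       BEGINPROC_FASTCALL iemAImpl_pxor_u128, 12
;               PROLOGUE_3_ARGS
;               IEMIMPL_SSE_PROLOGUE
;               movdqu  xmm0, [A1]      ; load the destination operand
;               movdqu  xmm1, [A2]      ; load the source operand
;               pxor    xmm0, xmm1
;               movdqu  [A1], xmm0      ; write back the result
;               IEMIMPL_SSE_EPILOGUE
;               EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_pxor_u128
;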


;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first full sized media register operand (input/output).
; @param A2 Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movd    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movq    xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
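
;
; Note: punpcklqdq (and punpckhqdq below) only exist as SSE2 instructions -
; there is no MMX counterpart - so their MMX inclusion flag is 0 and no
; _u64 worker is generated for them.
;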


;;
; Media instruction working on one full sized and one half sized register (high half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first full sized media register operand (input/output).
; @param A2 Pointer to the second full sized media register operand, where we
;           will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0


;
; Shufflers with evil 8-bit immediates.
;

BEGINPROC_FASTCALL iemAImpl_pshufw, 16
        PROLOGUE_4_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        lea     T0, [A3 + A3*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_4_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw
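
;
; Sketch of the immediate dispatch used above: the 256 stubs following .imm0
; are each exactly 5 bytes (pshufw + ret), so the stub for immediate N lives
; at .imm0 + N * 5, computed as:
;
;       lea     T0, [A3 + A3*4]         ; T0 = N * 5
;       lea     T1, [.imm0 xWrtRIP]     ; T1 = &.imm0
;       lea     T1, [T1 + T0]           ; T1 = &.imm0 + N * 5
;       call    T1
;
; The two 'dw' lines after .immEnd assemble to out-of-range word values if
; the stub array size drifts from 256*5 bytes, producing a build warning.
;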


%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd


;
; Move byte mask.
;

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]
        pmovmskb T0, mm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]
        pmovmskb T0, xmm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128