VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 105486

Last change on this file since 105486 was 105486, checked in by vboxsync, 4 months ago

VMM/IEM: Fix for r164115, bugref:10652

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 221.4 KB
Line 
1; $Id: IEMAllAImpl.asm 105486 2024-07-24 14:26:58Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; This is handy for generating absolutely correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
;;
; Return from a fastcall function.
;
; Only 32-bit x86 Windows builds emit 'ret imm16' (releasing %1 bytes of stack
; on return); every other target emits a plain 'ret' and ignores %1.
;
; @param        1       Byte count to hand to 'ret' on 32-bit x86 Windows.
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
60
;;
; Symbol name of a fastcall function.
;
; The default is simply NAME(a_Name).  On 32-bit x86 Windows the definition is
; replaced by the decorated form <a_Prefix><a_Name>@<a_cbArgs>.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here), which is why the
;        prefix has to be passed in as an (ugly) extra argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix)   NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef  NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
75
;;
; Opens a fastcall function: declares the (possibly decorated) global symbol
; as a hidden function and emits IBT_ENDBRxx as the first thing in it.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86 (only used for the decorated
;                       name, see NAME_FASTCALL).
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW  NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
86
87
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_n_ARGS / EPILOGUE_n_ARGS[_EX] bracket a function taking n arguments.
; A0..A3 name the argument registers, T0..T2 are scratch registers and R0 is
; the return value register (same register as T0).  The _32/_16/_8 suffixed
; names are the narrower views of the same register.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: nothing is pushed or loaded by the prologues, so they are empty and
 ; every epilogue is a plain 'ret'.  The byte count accepted by the _EX
 ; variants is ignored here; it only matters for the 32-bit variants below.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; Accepts zero or one argument: the 32-bit variant and all the other _EX
 ; macros take a byte count, so code shared between the architectures must be
 ; able to pass one here too.  The argument-less form remains valid.
 %macro EPILOGUE_1_ARGS_EX 0-1
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 ; Argument registers when ASM_CALL64_GCC is defined.
 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 ; Argument registers when ASM_CALL64_MSC is defined.
 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Scratch registers.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

 ;
 ; Return value, same as T0 but to make it more obvious
 ; that this is a return value.
 ;
 %define R0         rax
 %define R0_32      eax
 %define R0_16      ax
 %define R0_8       al

%else
 ;
 ; x86 (32-bit): A0 and A1 are ecx and edx; A2 and A3 are fetched from the
 ; stack into ebx and esi by the prologues.  edi doubles as T1, which is why
 ; every prologue saves it.
 ;
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2: skip the saved ebx and the return address.
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2: skip the three saved registers and the return address.
        mov     esi, [esp + 12 + 4 + 4] ; A3: the stack slot following A2.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 ; No A3_8: esi has no byte-sized alias in 32-bit mode.
 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 ; No T1_8: edi has no byte-sized alias in 32-bit mode.
 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
287
288
;;
; Loads guest flags from the value in %1 into the CPU EFLAGS, but only when
; there are undefined (%3) or must-load (%4) flags to care about.
;
; When IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS is defined, the modified,
; undefined and all status flags are always merged in via pushf/popf.
; Otherwise nothing at all is emitted if both %3 and %4 are zero.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) holding the eflags value.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
; @param        4       The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; Host flags onto the stack.
        mov     T0_32, %1                                       ; Guest flags.
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; Drop the bits we are about to take from the guest.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; Keep only those bits of the guest value.
        or      [xSP], T0                                       ; Guest bits merged into the host flags.
        popf                                                    ; Make the mix the current flags.

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Only CF is wanted: BT copies the selected bit into CF.
        bt      %1, X86_EFL_CF_BIT
   %else
        ; An ADD produces OF and SAHF loads the rest.  ASSUMES T0_32 is eax!
        mov     eax, %1
    %if (%3 | %4) == X86_EFL_OF
        ; Only OF is wanted: move it to bit 31 and add 80000000h, which
        ; overflows exactly when that bit is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; OF plus others: swap so AH has flag bits 0..7 and AL bits 8..15,
        ; move OF to bit 7 of AL and add 80h to produce OF ...
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; ... then let SAHF load the other status flags from AH.
        sahf
    %else ; OF not needed; so al -> ah and load ah into eflags.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        ; Alternative (disabled) generic pushf/popf variant.
        pushf                                   ; Host flags onto the stack.
        mov     T0_32, %1                       ; Guest flags.
        and     dword [xSP], ~(%2 | %3)         ; Drop the modified and undefined bits from the host copy.
        and     T0_32, (%2 | %3)                ; Keep only those bits of the guest value.
        or      [xSP], T0                       ; Guest bits merged into the host flags.
        popf                                    ; Make the mix the current flags.
  %endif
 %endif
%endmacro
346
;;
; Unconditionally loads guest flags from the value in %1 into the CPU EFLAGS.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) holding the eflags value.
; @param        2       The set of flags to load.
; @param        3       The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; Host flags onto the stack.
        mov     T0_32, %1                                       ; Guest flags.
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; Drop the bits we are about to take from the guest.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; Keep only those bits of the guest value.
        or      [xSP], T0                                       ; Guest bits merged into the host flags.
        popf                                                    ; Make the mix the current flags.

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Only CF is wanted: BT copies the selected bit into CF.
        bt      %1, X86_EFL_CF_BIT
  %else
        mov     eax, %1                 ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Only OF is wanted: move it to bit 31 and add 80000000h, which
        ; overflows exactly when that bit is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; OF plus others: swap so AH has flag bits 0..7 and AL bits 8..15,
        ; move OF to bit 7 of AL and add 80h to produce OF ...
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; ... then let SAHF load the other status flags from AH.
        sahf
   %else ; OF not needed; so al -> ah and load ah into eflags.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        ; Alternative (disabled) generic pushf/popf variant.
        pushf                                   ; Host flags onto the stack.
        mov     T0_32, %1                       ; Guest flags.
        and     dword [xSP], ~(%2 | %3)         ; Drop the modified and undefined bits from the host copy.
        and     T0_32, (%2 | %3)                ; Keep only those bits of the guest value.
        or      [xSP], T0                       ; Guest bits merged into the host flags.
        popf                                    ; Make the mix the current flags.
 %endif
%endmacro
400
;;
; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
;
; The bits selected by %2 and %3 are taken from the CPU flags, the bits in %4
; are cleared, and everything else is kept from %1.  Emits nothing if all
; three masks are zero.
;
; @remarks      Clobbers T0, T1, %1, stack.
; @remarks      Uses a macro-local (%%) label for the OF fix-up so that the
;               macro can be expanded more than once under the same global
;               label without redefining a symbol.
; @param        1       The parameter (A0..A3) holding the OLD eflags value. Clobbered.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
; @param        4       The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_RETVAL 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, %1                       ; Copy of the old flags.  NOTE(review): T1 is not read again in this macro.
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS)   ; Old value minus modified, undefined, zeroed and status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)      ; CPU value: modified, undefined and status flags only.
  %else
        ; Get the wanted CPU flags into T0 the cheapest way the masks allow.
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                            ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf
        movzx   eax, ah                         ; ASSUMES T0_32 is eax!
   %elif 1 ; The locked functions are generally faster on 10980XE with this approach
        lahf                                    ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                         ; ASSUMES T0_32 is eax!
        jno     %%of_is_clear                   ; LAHF does not cover OF, so add it by hand.
        or      eax, X86_EFL_OF
%%of_is_clear:
   %else
        pushf                                   ; this is a bit slow
        pop     T0
   %endif
        and     %1, ~(%2 | %3 | %4)             ; Old value minus modified, undefined and zeroed flags.
        and     T0_32, (%2 | %3)                ; CPU value: modified and undefined flags only.
  %endif
        or      T0_32, %1                       ; Combined result is left in T0.  ASSUMES T0 = eax!
        ;mov     %1, T0_32                      ; save the flags.
 %endif
%endmacro
447
;;
; Builds the new EFLAGS value in %1 from the CPU EFLAGS plus fixed clear and
; set masks: bits in %2 come from the CPU, bits in %3 are cleared, bits in %4
; are set, the remainder is kept.  Emits nothing if all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The parameter (A0..A3) holding the eflags value.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
;; @todo make it stuff the result into EAX?
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = CPU flags.
        mov     T0_32, %1               ; T0 = incoming flags.
        and     T0_32, ~(%2 | %3)       ; Drop the modified and the always-cleared bits.
        and     T1_32, (%2)             ; Keep only the modified bits of the CPU flags.
        or      T0_32, T1_32            ; Merge the two.
  %if (%4) != 0
        or      T0_32, %4               ; Force the always-set bits.
  %endif
        mov     %1, T0_32               ; Store the result back into %1.
 %endif
%endmacro
472
;;
; Builds the new EFLAGS value in EAX (T0) from the CPU EFLAGS (mask %2), a
; clear mask (%3), the sign of a result register (%4, %5 bits wide) and a
; parity table lookup indexed by the low byte of %6.
;
; @note         %4 & %6 must not be RAX, EAX, or AX!  So, don't use with full MUL/IMUL.
;
; @remarks      Clobbers T0, T1, stack, %6, EFLAGS, %1.
; @param        1       The parameter (A0..A3) holding the eflags value.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
        pushf
        pop     T0                                              ; T0 = CPU flags.
        and     %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)        ; Drop modified, always-cleared, PF and SF from the old value.
        and     T0_32, (%2)                                     ; Keep only the modified bits of the CPU flags.
        or      T0_32, %1                                       ; Merge the two.

        ; SF first, as %4 is the same register as %6 (only %6 is always full
        ; width) and %6 gets trashed below.
        bt      %4, %5 - 1                                      ; Top bit of the result -> CF.
        jnc     %%sf_clear
        or      T0_32, X86_EFL_SF
 %%sf_clear:

        ; PF last: reduce %6 to a byte index and OR in the table entry.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T1, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T1 + %6]
 %else
        or      T0_8, [NAME(g_afParity) + %6]
 %endif

        ;mov     %1, T0_32               ; save the result.
        ; ASSUMES T0 = eax!
%endmacro
511
;;
; Applies fixed clear (%2) and set (%3) masks to the EFLAGS value in %1.
; Emits nothing if both masks are zero.
;
; @remarks      Clobbers T0.
; @param        1       The parameter (A0..A3) holding the eflags value.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, %1               ; Fetch the flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Strip the always-cleared bits.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Force the always-set bits.
  %endif
        mov     %1, T0_32               ; Store the result back into %1.
 %endif
%endmacro
532
;;
; Applies fixed clear (%2) and set (%3) masks to the EFLAGS value in %1 and
; replaces PF with the parity table entry selected by the low byte of %4.
;
; @remarks      Clobbers T0, %4, EFLAGS.  On AMD64 T2 is used for the table
;               address and thus clobbered as well.
; @param        1       The parameter (A0..A3) holding the eflags value.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, %1                       ; Fetch the flags.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; Strip PF and the always-cleared bits.
 %if (%3) != 0
        or      T0_32, %3                       ; Force the always-set bits.
 %endif
        and     %4, 0xff                        ; Reduce %4 to a byte-sized table index.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     %1, T0_32                       ; Store the result back into %1.
%endmacro
557
558
559;;;; OLD EFLAGS macros.
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564
;;
; Loads guest flags from the dword at [%1] into the CPU EFLAGS, but only when
; there are undefined (%3) or must-load (%4) flags to care about.
;
; When IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS is defined, the modified,
; undefined and all status flags are always merged in via pushf/popf.
; Otherwise nothing at all is emitted if both %3 and %4 are zero.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
; @param        4       The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; Host flags onto the stack.
        mov     T0_32, [%1]                                     ; Guest flags.
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; Drop the bits we are about to take from the guest.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; Keep only those bits of the guest value.
        or      [xSP], T0                                       ; Guest bits merged into the host flags.
        popf                                                    ; Make the mix the current flags.

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Only CF is wanted: BT copies the selected bit into CF.
        bt      dword [%1], X86_EFL_CF_BIT
   %else
        ; An ADD produces OF and SAHF loads the rest.  ASSUMES T0_32 is eax!
        mov     eax, [%1]
    %if (%3 | %4) == X86_EFL_OF
        ; Only OF is wanted: move it to bit 31 and add 80000000h, which
        ; overflows exactly when that bit is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; OF plus others: swap so AH has flag bits 0..7 and AL bits 8..15,
        ; move OF to bit 7 of AL and add 80h to produce OF ...
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; ... then let SAHF load the other status flags from AH.
        sahf
    %else ; OF not needed; so al -> ah and load ah into eflags.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        ; Alternative (disabled) generic pushf/popf variant.
        pushf                                   ; Host flags onto the stack.
        mov     T0_32, [%1]                     ; Guest flags.
        and     dword [xSP], ~(%2 | %3)         ; Drop the modified and undefined bits from the host copy.
        and     T0_32, (%2 | %3)                ; Keep only those bits of the guest value.
        or      [xSP], T0                       ; Guest bits merged into the host flags.
        popf                                    ; Make the mix the current flags.
  %endif
 %endif
%endmacro
622
;;
; Unconditionally loads guest flags from the dword at [%1] into the CPU EFLAGS.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of flags to load.
; @param        3       The set of undefined flags.
;
%macro IEM_LOAD_FLAGS_OLD 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; Host flags onto the stack.
        mov     T0_32, [%1]                                     ; Guest flags.
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; Drop the bits we are about to take from the guest.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; Keep only those bits of the guest value.
        or      [xSP], T0                                       ; Guest bits merged into the host flags.
        popf                                                    ; Make the mix the current flags.

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Only CF is wanted: BT copies the selected bit into CF.
        bt      dword [%1], X86_EFL_CF_BIT
  %else
        mov     eax, [%1]               ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Only OF is wanted: move it to bit 31 and add 80000000h, which
        ; overflows exactly when that bit is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; OF plus others: swap so AH has flag bits 0..7 and AL bits 8..15,
        ; move OF to bit 7 of AL and add 80h to produce OF ...
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; ... then let SAHF load the other status flags from AH.
        sahf
   %else ; OF not needed; so al -> ah and load ah into eflags.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        ; Alternative (disabled) generic pushf/popf variant.
        pushf                                   ; Host flags onto the stack.
        mov     T0_32, [%1]                     ; Guest flags.
        and     dword [xSP], ~(%2 | %3)         ; Drop the modified and undefined bits from the host copy.
        and     T0_32, (%2 | %3)                ; Keep only those bits of the guest value.
        or      [xSP], T0                       ; Guest bits merged into the host flags.
        popf                                    ; Make the mix the current flags.
 %endif
%endmacro
676
;;
; Updates the EFLAGS dword at [%1]: the bits selected by %2 and %3 are taken
; from the CPU flags, the bits in %4 are cleared, and everything else is kept.
; Emits nothing if all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @remarks      Uses a macro-local (%%) label for the OF fix-up so that the
;               macro can be expanded more than once under the same global
;               label without redefining a symbol.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
; @param        4       The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_OLD 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, [%1]                     ; T1 = old flags.
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS)    ; Old value minus modified, undefined, zeroed and status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; CPU value: modified, undefined and status flags only.
  %else
        ; Get the wanted CPU flags into T0 the cheapest way the masks allow.
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                            ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf
        movzx   eax, ah                         ; ASSUMES T0_32 is eax!
   %elif 1 ; The locked functions are generally faster on 10980XE with this approach
        lahf                                    ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                         ; ASSUMES T0_32 is eax!
        jno     %%of_is_clear                   ; LAHF does not cover OF, so add it by hand.
        or      eax, X86_EFL_OF
%%of_is_clear:
   %else
        pushf                                   ; this is a bit slow
        pop     T0
   %endif
        and     T1_32, ~(%2 | %3 | %4)          ; Old value minus modified, undefined and zeroed flags.
        and     T0_32, (%2 | %3)                ; CPU value: modified and undefined flags only.
  %endif
        or      T0_32, T1_32                    ; Merge the two.
        mov     [%1], T0_32                     ; Write the flags back.
 %endif
%endmacro
723
;;
; Rewrites the EFLAGS dword at [%1] from the CPU EFLAGS plus fixed clear and
; set masks: bits in %2 come from the CPU, bits in %3 are cleared, bits in %4
; are set, the remainder is kept.  Emits nothing if all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = CPU flags.
        mov     T0_32, [%1]             ; T0 = stored flags.
        and     T0_32, ~(%2 | %3)       ; Drop the modified and the always-cleared bits.
        and     T1_32, (%2)             ; Keep only the modified bits of the CPU flags.
        or      T0_32, T1_32            ; Merge the two.
  %if (%4) != 0
        or      T0_32, %4               ; Force the always-set bits.
  %endif
        mov     [%1], T0_32             ; Write the result back.
 %endif
%endmacro
747
;;
; Rewrites the EFLAGS dword at [%1] from the CPU EFLAGS (mask %2), a clear
; mask (%3), the sign of a result register (%4, %5 bits wide) and a parity
; table lookup indexed by the low byte of %6.
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks      Clobbers T0, T1, stack, %6, EFLAGS.  T2 as well on AMD64.
; @remarks      %6 must not be T1, as T1 accumulates the new flags while %6
;               is being used as table index.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
        ; Fetch the CPU flags without losing T0: AMD64 has T2 to spare, while
        ; 32-bit code must borrow T0 and therefore saves it on the stack.
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
 %else
        push    T0
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]                                     ; T1 = stored flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)     ; Drop modified, always-cleared, PF and SF.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                                     ; Keep only the modified bits of the CPU flags.
        or      T1_32, T2_32                                    ; Merge the two.
 %else
        and     T0_32, (%2)                                     ; Keep only the modified bits of the CPU flags.
        or      T1_32, T0_32                                    ; Merge the two.
        pop     T0                                              ; T0 holds the result again.
 %endif

        ; SF first, as %4 is likely to be referring to the same register as
        ; %6 does and %6 gets trashed below.
        bt      %4, %5 - 1                                      ; Top bit of the result -> CF.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; PF last: reduce %6 to a byte index and OR in the table entry.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        ; T1 is edi here and has no byte-sized alias (there is no T1_8 on
        ; x86), so fetch the table byte zero extended into %6 (which we are
        ; allowed to trash) and OR the full registers instead.
        movzx   %6, byte [NAME(g_afParity) + %6]
        or      T1_32, %6
 %endif

        mov     [%1], T1_32                                     ; Write the result back.
%endmacro
801
;;
; Applies fixed clear (%2) and set (%3) masks to the EFLAGS dword at [%1].
; Emits nothing if both masks are zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS_OLD 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Fetch the flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Strip the always-cleared bits.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Force the always-set bits.
  %endif
        mov     [%1], T0_32             ; Write the result back.
 %endif
%endmacro
822
;;
; Applies fixed clear (%2) and set (%3) masks to the EFLAGS dword at [%1] and
; replaces PF with the parity table entry selected by the low byte of %4.
;
; @remarks      Clobbers T0, %4, EFLAGS.  On AMD64 T2 is used for the table
;               address and thus clobbered as well.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
        mov     T0_32, [%1]                     ; Fetch the flags.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; Strip PF and the always-cleared bits.
 %if (%3) != 0
        or      T0_32, %3                       ; Force the always-set bits.
 %endif
        and     %4, 0xff                        ; Reduce %4 to a byte-sized table index.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; Write the result back.
%endmacro
847
848
849
;;
; Computes the address of one entry in a 256-entry table of imm8 instruction
; variants, i.e. the equivalent of `lea %1, [.imm0 + %2 * %3]`, using LEA
; combinations for the supported entry sizes (5 thru 12 bytes).
;
; Two assemble-time checks follow the address calculation.  Each emits nothing
; when the table layout is as expected and otherwise provokes the assembler
; warning quoted in the comments below.
;
; @param        1       The register to receive the jump target address (T1).
; @param        2       The register containing the imm8 index (A1 / A2 / A3).
; @param        3       Byte size of one instruction + ret (+ ?int3) in the table
; @remarks      Clobbers T0 for all entry sizes except 8.
; @note         Implicitly uses local symbols .imm0, .imm1, and .immEnd
;               (implementation artifacts of each instruction jump table).
;
%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
        lea     %1, [.imm0 xWrtRIP]
 %if %3 == 5
        lea     T0, [%2 + %2*4]         ; *5
        lea     %1, [%1 + T0]           ; *5 + .imm0
 %elif %3 == 6
        lea     T0, [%2 + %2*2]         ; *3
        lea     %1, [%1 + T0*2]         ; *6 + .imm0
 %elif %3 == 7
        lea     T0, [%2 + %2*2]         ; *3
        lea     T0, [T0 + %2*4]         ; *7
        lea     %1, [%1 + T0]           ; *7 + .imm0
 %elif %3 == 8
        lea     %1, [%1 + %2*8]         ; *8 + .imm0
 %elif %3 == 9
        lea     T0, [%2 + %2*8]         ; *9
        lea     %1, [%1 + T0]           ; *9 + .imm0
 %elif %3 == 10
        lea     T0, [%2 + %2*4]         ; *5
        lea     %1, [%1 + T0*2]         ; *10 + .imm0
 %elif %3 == 11
        lea     T0, [%2 + %2*4]         ; *5
        lea     T0, [%2 + T0*2]         ; *11
        lea     %1, [%1 + T0]           ; *11 + .imm0
 %elif %3 == 12
        lea     T0, [%2 + %2*2]         ; *3
        lea     %1, [%1 + T0*4]         ; *12 + .imm0
 %else
  %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
 %endif
        ; check size: 'warning: value does not fit in 8 bit field' if bad
        times (.imm1 - .imm0 + %3) %% %3 db 999 * \
              (.imm1 - .imm0 + %3)
        ; check alignment: 'warning: value does not fit in 8 bit field' if bad
        times ((.immEnd - .imm0) - 256 * %3) db 999 * \
              ((.immEnd - .imm0) - 256 * %3)
%endmacro
900
;;
; Front end for IEMIMPL_JUMP_TABLE_TARGET_INT: passes the arguments through,
; except that the entry size is increased by 4 bytes when
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK is defined.
;
; @param        1       The register to receive the jump target address (T1).
; @param        2       The register containing the imm8 index (A1 / A2 / A3).
; @param        3       Byte size of one instruction + ret (+ ?int3) in the table
;
%macro IEMIMPL_JUMP_TABLE_TARGET 3
 %ifndef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
 %else
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
 %endif
%endmacro
908
909
;;
; Calls one entry of a 256-entry table of imm8 instruction variants.
;
; The target address is computed by IEMIMPL_JUMP_TABLE_TARGET, i.e. the
; equivalent of `lea %1, [.imm0 + %2 * %3]`, and is then called indirectly
; with IBT_NOTRACK emitted right in front of the call.
;
; @param        1       The register to receive the jump target address (T1).
; @param        2       The register containing the imm8 index (A1 / A2 / A3).
; @param        3       Byte size of one instruction + ret (+ ?int3) in the table
;
%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
        IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3
        IBT_NOTRACK
        call    %1
%endmacro
926
927
928;*********************************************************************************************************************************
929;* External Symbols *
930;*********************************************************************************************************************************
931extern NAME(g_afParity)
932
933
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take the incoming eflags value in A0 (32-bit), a pointer
; to the destination memory operand in A1 and the source register operand in
; A2.  The updated eflags value is left in EAX by IEM_SAVE_FLAGS_RETVAL.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
; @param        5       The flags that must be loaded (ADC, SBC).
; @param        6       The flags that will be zeroed by the operation.
;
%macro IEMIMPL_BIN_OP 6
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?
;
; Same again, with a lock prefix on the instruction.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
1024
; Instantiations.  Columns:
;              instr, lock, modified-flags,                                                                     undefined,  must-load,  zeroed
IEMIMPL_BIN_OP add,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),      0,          0,          0
IEMIMPL_BIN_OP adc,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),      0,          X86_EFL_CF, 0
IEMIMPL_BIN_OP sub,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),      0,          0,          0
IEMIMPL_BIN_OP sbb,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),      0,          X86_EFL_CF, 0
IEMIMPL_BIN_OP cmp,   0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),      0,          0,          0
IEMIMPL_BIN_OP or,    1,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                             X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP xor,   1,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                             X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP and,   1,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                             X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP test,  0,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                             X86_EFL_AF, 0,          X86_EFL_OF | X86_EFL_CF
1035
1036
1037;;
1038; Macro for implementing a binary operator, VEX variant with separate input/output.
1039;
1040; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1041; where the 64-bit accesses requires hand coding.
1042;
1043; All the functions takes a pointer to the destination memory operand in A0,
1044; the first source register operand in A1, the second source register operand
1045; in A2 and a pointer to eflags in A3.
1046;
1047; @param 1 The instruction mnemonic.
1048; @param 2 The modified flags.
1049; @param 3 The undefined flags.
1050; @param 4 The zeroed flags.
1051;
%macro IEMIMPL_VEX_BIN_OP 4
;
; 32-bit operand size: A0 = result pointer, A1/A2 = sources, A3 = pointer to EFLAGS.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A3, %2, %3, 0   ;; @todo do we need to load undefined flags for any platform?
        %1      T0_32, A1_32, A2_32                     ; three-operand form, result lands in T0
        mov     [A0], T0_32                             ; store the result (mov leaves the flags alone)
        IEM_SAVE_FLAGS_OLD              A3, %2, %3, %4
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;
; 64-bit operand size, same argument layout; only emitted for AMD64 hosts.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A3, %2, %3, 0
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS_OLD              A3, %2, %3, %4
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
1073
; Instantiations of IEMIMPL_VEX_BIN_OP.
;                  instr, modified flags,                           undefined flags,                        zeroed flags
IEMIMPL_VEX_BIN_OP andn,  X86_EFL_SF | X86_EFL_ZF,                  X86_EFL_AF | X86_EFL_PF,                X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF,                               X86_EFL_SF | X86_EFL_AF | X86_EFL_PF,   X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bzhi,  X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF,     X86_EFL_AF | X86_EFL_PF,                X86_EFL_OF
1078
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A1,
; the source register operand in A2 and incoming EFLAGS in A0. Updated EFLAGS
; are returned in EAX.
;
; The functions take three arguments, so the three-argument prologue/epilogue
; is used, same as for every other (EFLAGS, pointer, value) style helper in
; this file.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 The zeroed flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 4
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
        mov     T0_32, [A1]                             ; current destination value
        %1      T0_32, A2_32
        mov     [A1], T0_32                             ; write the result back (mov leaves the flags alone)
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, 0
        mov     T0, [A1]
        %1      T0, A2
        mov     [A1], T0
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
1117
; Instantiations of IEMIMPL_VEX_BIN_OP_2.
;                    instr,  modified flags,                            undefined flags,            zeroed flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),    (X86_EFL_AF | X86_EFL_PF),  X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),    (X86_EFL_AF | X86_EFL_PF),  X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),    (X86_EFL_AF | X86_EFL_PF),  X86_EFL_OF
1122
1123
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  EFLAGS are not involved, so there is no fourth argument.
;
; The optional fallbacks use the legacy shift instruction given as parameter 2,
; which takes its count in CL.  With the GCC 64-bit convention the count is
; simply copied to CL; otherwise A0 and A2 are exchanged to get the count
; into CL, which leaves the destination pointer in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 Fallback instruction if applicable.
; @param 3 Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                        ; count -> CL
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                          ; count -> CL, destination pointer -> A2
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                        ; count -> CL
        %2      A1, cl
        mov     [A0], A1                        ; store the full 64-bit result
   %else
        xchg    A2, A0                          ; count -> CL, destination pointer -> A2
        %2      A1, cl
        mov     [A2], A1                        ; store the full 64-bit result; A0 holds the count here, not a pointer
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro
1188
; Instantiations of IEMIMPL_VEX_BIN_OP_NOEFL.
;                        instr, fallback instr, emit fallback (0 = no)
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
1195
1196
1197;
1198; RORX uses a immediate byte for the shift count, so we only do
1199; fallback implementation of that one.
1200;
; 32-bit RORX via plain ROR: A0 = destination pointer, A1 = value, A2 = rotate count.
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                        ; ROR takes its count in CL
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                          ; count -> CL, destination pointer -> A2
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
1214
%ifdef RT_ARCH_AMD64
; 64-bit RORX via plain ROR: A0 = destination pointer, A1 = value, A2 = rotate count.
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                        ; ROR takes its count in CL
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0                          ; count -> CL, destination pointer -> A2
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
%endif ; RT_ARCH_AMD64
1230
1231
1232;
1233; MULX
1234;
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = uSrc1 (must end up in EDX for MULX), A3 = uSrc2.
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX already - perfect, nothing to shuffle.
        mulx    T0_32, T1_32, A3_32             ; T0 = high half, T1 = low half
        mov     [A1], T1_32                     ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1 and A2 = low half pointer.
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32             ; T0 = high half, T1 = low half
        mov     [A2], T1_32                     ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
1251
1252
; MULX fallback using plain MUL (EDX:EAX = EAX * operand).
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = uSrc1, A3 = uSrc2.
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32                           ; edx:eax = uSrc2 * uSrc1
        mov     [A1], eax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2                          ; A1 (xDX) = uSrc1, A2 = low half pointer
        mov     eax, A3_32
        mul     A1_32                           ; edx:eax = uSrc2 * uSrc1; A2 holds a pointer now and must not be the multiplier
        mov     [A2], eax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
1271
1272%ifdef RT_ARCH_AMD64
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = uSrc1 (must end up in RDX for MULX), A3 = uSrc2.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX already - perfect, nothing to shuffle.
        mulx    T0, T1, A3                      ; T0 = high half, T1 = low half
        mov     [A1], T1                        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1 and A2 = low half pointer.
        xchg    A1, A2
        mulx    T0, T1, A3                      ; T0 = high half, T1 = low half
        mov     [A2], T1                        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64
1289
1290
; MULX fallback using plain MUL (RDX:RAX = RAX * operand).
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = uSrc1, A3 = uSrc2.
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2                              ; rdx:rax = uSrc2 * uSrc1
        mov     [A1], rax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2                          ; A1 (xDX) = uSrc1, A2 = low half pointer
        mov     rax, A3
        mul     A1                              ; rdx:rax = uSrc2 * uSrc1; A2 holds a pointer now and must not be the multiplier
        mov     [A2], rax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback
1309
1310%endif
1311
1312
1313;;
1314; Macro for implementing a bit operator.
1315;
1316; This will generate code for the 16, 32 and 64 bit accesses with locked
1317; variants, except on 32-bit system where the 64-bit accesses requires hand
1318; coding.
1319;
1320; All the functions takes a pointer to the destination memory operand in A1,
1321; the source register operand in A2 and incoming eflags in A0.
1322;
1323; @param 1 The instruction mnemonic.
1324; @param 2 Non-zero if there should be a locked version.
1325; @param 3 The modified flags.
1326; @param 4 The undefined flags.
1327;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
;
; Plain variants.  A0 = incoming EFLAGS, A1 = pointer to the destination, A2 = source operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %3, %4, 0
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL           A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %3, %4, 0
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL           A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %3, %4, 0
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL           A0_32, %3, %4, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?
;
; Locked variants: identical apart from the LOCK prefix on the instruction.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %3, %4, 0
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL           A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %3, %4, 0
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL           A0_32, %3, %4, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %3, %4, 0
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL           A0_32, %3, %4, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
1385
; Instantiations of IEMIMPL_BIT_OP.
; Undefined flags are passed thru here by the intel and amd CPUs we have, so
; the undefined-flags argument is 0 and the passed-thru set is only noted.
;              instr, locked, modified efl,  undefined eflags
IEMIMPL_BIT_OP bt,    0,      (X86_EFL_CF),  0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc,   1,      (X86_EFL_CF),  0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts,   1,      (X86_EFL_CF),  0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr,   1,      (X86_EFL_CF),  0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1392
1393;;
1394; Macro for implementing a bit search operator.
1395;
1396; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1397; system where the 64-bit accesses requires hand coding.
1398;
1399; All the functions takes a pointer to the destination memory operand in A1,
1400; the source register operand in A2 and the incoming eflags in A0.
1401;
1402; In the ZF case the destination register is 'undefined', however it seems that
1403; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1404; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1405; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1406; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1407;
1408; Intel: Clear all and calculate PF in addition to ZF.
1409; AMD: Passthru all flags other than ZF.
1410;
1411; @param 1 The instruction mnemonic.
1412; @param 2 The modified flags.
1413; @param 3 The undefined flags.
1414; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1415;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE

;
; 16-bit operand size.
; A0 = incoming EFLAGS, A1 = pointer to the destination, A2 = source operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0_16, A2_16
 %if %4 != 0
        jz      .skip_store                     ; ZF=1: leave the destination as it is
 %endif
        mov     [A1], T0_16
.skip_store:
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; Disabled per-vendor variants, kept for reference only.
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
;bad;        PROLOGUE_3_ARGS
;bad;        %1      T1_16, A1_16
;bad;        jz      .unchanged_dst
;bad;        mov     [A0], T1_16
;bad;        IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad;        EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad;%if %4 != 0
;bad;        mov     [A0], T1_16
;bad;%endif
;bad;        IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad;        EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
;bad;        PROLOGUE_3_ARGS
;bad;        %1      T0_16, A1_16
;bad;%if %4 != 0
;bad;        jz      .unchanged_dst
;bad;%endif
;bad;        mov     [A0], T0_16
;bad;.unchanged_dst:
;bad;        IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad;        EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd

;
; 32-bit operand size.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0_32, A2_32
 %if %4 != 0
        jz      .skip_store                     ; ZF=1: leave the destination as it is
 %endif
        mov     [A1], T0_32
.skip_store:
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; Disabled per-vendor variants, kept for reference only.
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
;bad;        PROLOGUE_3_ARGS
;bad;        %1      T1_32, A1_32
;bad;%if %4 != 0
;bad;        jz      .unchanged_dst
;bad;%endif
;bad;        mov     [A0], T1_32
;bad;        IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad;        EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad;        IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad;        EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
;bad;        PROLOGUE_3_ARGS
;bad;        %1      T0_32, A1_32
;bad;%if %4 != 0
;bad;        jz      .unchanged_dst
;bad;%endif
;bad;        mov     [A0], T0_32
;bad;.unchanged_dst:
;bad;        IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad;        EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64
;
; 64-bit operand size (AMD64 hosts only).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
        %1      T0, A2
  %if %4 != 0
        jz      .skip_store                     ; ZF=1: leave the destination as it is
  %endif
        mov     [A1], T0
.skip_store:
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, 0
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

; Disabled per-vendor variants, kept for reference only.
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
;bad;        PROLOGUE_3_ARGS
;bad;        %1      T1, A1
;bad;%if %4 != 0
;bad;        jz      .unchanged_dst
;bad;%endif
;bad;        mov     [A0], T1
;bad;        IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
;bad;        EPILOGUE_3_ARGS
;bad;.unchanged_dst:
;bad;        IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
;bad;        EPILOGUE_3_ARGS
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
;bad;
;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
;bad;        PROLOGUE_3_ARGS
;bad;        %1      T0, A1
;bad;%if %4 != 0
;bad;        jz      .unchanged_dst
;bad;%endif
;bad;        mov     [A0], T0
;bad;.unchanged_dst:
;bad;        IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
;bad;        EPILOGUE_3_ARGS_EX 8
;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro
1546
; Instantiations of IEMIMPL_BIT_OP2.  The last argument is non-zero when the destination is left alone for ZF=1.
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
1551
1552
1553;;
1554; Macro for implementing POPCNT.
1555;
1556; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1557; system where the 64-bit accesses requires hand coding.
1558;
1559; All the functions takes a pointer to the destination memory operand in A1,
1560; the source register operand in A2 and eflags in A0.
1561;
1562; ASSUMES Intel and AMD set EFLAGS the same way.
1563;
1564; ASSUMES the instruction does not support memory destination.
1565;
1566; @param 1 The instruction mnemonic.
1567; @param 2 The modified flags.
1568; @param 3 The undefined flags.
1569; @param 4 The zeroed flags.
1570;
%macro IEMIMPL_BIT_OP3 4
BEGINCODE
;
; A0 = incoming EFLAGS, A1 = pointer to the destination, A2 = source operand.
; The result is computed in T0 and then stored; the store does not touch the flags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, 0
        %1      T0_16, A2_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, 0
        %1      T0_32, A2_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %2, %3, 0
        %1      T0, A2
        mov     [A1], T0
        IEM_SAVE_FLAGS_RETVAL           A0_32, %2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; instr, modified flags, undefined flags, zeroed flags
IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1603
1604
1605;
1606; IMUL is also a similar but yet different case (no lock, no mem dst).
1607; The rDX:rAX variant of imul is handled together with mul further down.
1608;
BEGINCODE
;;
; Two-operand IMUL workers.
;
; A0 = incoming EFLAGS, A1 = pointer to the destination (also the first
; factor), A2 = second factor.  The product is written back through A1.
;
; @param 1 EFLAGS that are modified.
; @param 2 Undefined EFLAGS.
; @param 3 Function suffix.
; @param 4 EFLAGS variation: 0 for native, 1 for intel, 2 for AMD.
;          Only the value 1 changes the generated code: it selects
;          IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL; every other value uses
;          the plain IEM_SAVE_FLAGS_RETVAL.
;
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A2_16, word [A1]
        mov     [A1], A2_16
 %if %4 != 1
        IEM_SAVE_FLAGS_RETVAL           A0_32, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A2_32, dword [A1]
        mov     [A1], A2_32
 %if %4 != 1
        IEM_SAVE_FLAGS_RETVAL           A0_32, %1, %2, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
        imul    A2, qword [A1]
        mov     [A1], A2
  %if %4 != 1
        IEM_SAVE_FLAGS_RETVAL           A0_32, %1, %2, 0
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
  %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; The SF, ZF, AF and PF flags are "undefined" for IMUL.  AMD (3990x) leaves
; these flags as is, whereas Intel skylake (6700K and 10980XE (Cascade Lake))
; always clears AF and ZF and calculates SF and PF as per the lower half of
; the result.
;                modified flags,          undefined flags,                                       suffix, variation
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,   ,       0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                     _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                     _amd,   2
1663
1664
1665;
1666; XCHG for memory operands. This implies locking. No flag changes.
1667;
1668; Each function takes two arguments, first the pointer to the memory,
1669; then the pointer to the register. They all return void.
1670;
1671BEGINCODE
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]                      ; fetch the register value
        xchg    [A0], T0_8                      ; swap it with memory
        mov     [A1], T0_8                      ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1679
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]                     ; fetch the register value
        xchg    [A0], T0_16                     ; swap it with memory
        mov     [A1], T0_16                     ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1687
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]                     ; fetch the register value
        xchg    [A0], T0_32                     ; swap it with memory
        mov     [A1], T0_32                     ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1695
%ifdef RT_ARCH_AMD64
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                        ; fetch the register value
        xchg    [A0], T0                        ; swap it with memory
        mov     [A1], T0                        ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif ; RT_ARCH_AMD64
1705
1706; Unlocked variants for fDisregardLock mode.
1707
; Swap done with plain loads and stores instead of XCHG.
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]                      ; old memory value
        mov     T0_8, [A1]                      ; register value
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1716
; Swap done with plain loads and stores instead of XCHG.
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]                     ; old memory value
        mov     T0_16, [A1]                     ; register value
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1725
; Swap done with plain loads and stores instead of XCHG.
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]                     ; old memory value
        mov     T0_32, [A1]                     ; register value
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1734
%ifdef RT_ARCH_AMD64
; Swap done with plain loads and stores instead of XCHG.
; A0 = pointer to the memory operand, A1 = pointer to the register value.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                        ; old memory value
        mov     T0, [A1]                        ; register value
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif ; RT_ARCH_AMD64
1745
1746
1747;
1748; XADD for memory operands.
1749;
1750; Each function takes three arguments, first the pointer to the
1751; memory/register, then the pointer to the register, and finally a pointer to
1752; eflags. They all return void.
1753;
1754BEGINCODE
; A0 = pointer to the memory/register operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]                      ; register value
        xadd    [A0], T0_8
        mov     [A1], T0_8                      ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1764
; A0 = pointer to the memory/register operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]                     ; register value
        xadd    [A0], T0_16
        mov     [A1], T0_16                     ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1774
; A0 = pointer to the memory/register operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]                     ; register value
        xadd    [A0], T0_32
        mov     [A1], T0_32                     ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1784
%ifdef RT_ARCH_AMD64
; A0 = pointer to the memory/register operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]                        ; register value
        xadd    [A0], T0
        mov     [A1], T0                        ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1796
; LOCK-prefixed variant.  A0 = pointer to the memory operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]                      ; register value
        lock xadd [A0], T0_8
        mov     [A1], T0_8                      ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1806
; LOCK-prefixed variant.  A0 = pointer to the memory operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]                     ; register value
        lock xadd [A0], T0_16
        mov     [A1], T0_16                     ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1816
; LOCK-prefixed variant.  A0 = pointer to the memory operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]                     ; register value
        lock xadd [A0], T0_32
        mov     [A1], T0_32                     ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1826
%ifdef RT_ARCH_AMD64
; LOCK-prefixed variant.  A0 = pointer to the memory operand, A1 = pointer to the register, A2 = pointer to EFLAGS.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]                        ; register value
        lock xadd [A0], T0
        mov     [A1], T0                        ; value XADD left in the register
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1838
1839
1840;
1841; CMPXCHG8B.
1842;
1843; These are tricky register wise, so the code is duplicated for each calling
1844; convention.
1845;
1846; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1847;
1848; C-proto:
1849; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1850; uint32_t *pEFlags));
1851;
1852; Note! Identical to iemAImpl_cmpxchg16b.
1853;
1854BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; In: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        push    rbx                             ; rbx is used for the replacement value below

        mov     r10, rcx                        ; pu64Dst
        mov     r11, rdx                        ; pu64EaxEdx (is also T1)

        mov     ecx, [r8 + 4]                   ; ecx:ebx = replacement value
        mov     ebx, [r8]
        IEM_MAYBE_LOAD_FLAGS_OLD        r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]                      ; edx:eax = comparand; loaded after the flags as eax gets clobbered
        mov     edx, [r11 + 4]

        cmpxchg8b [r10]

        mov     [r11], eax                      ; write edx:eax back
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD              r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; In: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        push    rbx                             ; rbx is used for the replacement value below

        mov     r11, rdx                        ; pu64EbxEcx (is also T1)
        mov     r10, rcx                        ; pEFlags

        mov     ecx, [r11 + 4]                  ; ecx:ebx = replacement value
        mov     ebx, [r11]
        IEM_MAYBE_LOAD_FLAGS_OLD        r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]                      ; edx:eax = comparand; loaded after the flags as eax gets clobbered
        mov     edx, [rsi + 4]

        cmpxchg8b [rdi]

        mov     [rsi], eax                      ; write edx:eax back
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD              r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; In: ecx = pu64Dst, edx = pu64EaxEdx, stack = pu64EbxEcx and pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     esi, edx                        ; pu64EaxEdx
        mov     edi, ecx                        ; pu64Dst
        mov     ecx, [esp + 16 + 4 + 0]         ; pu64EbxEcx (16 bytes of pushes + return address)
        mov     ebp, [esp + 16 + 4 + 4]         ; pEFlags

        mov     ebx, [ecx]                      ; ebx first, the next load overwrites the pointer in ecx
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD        ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]                      ; edx:eax = comparand; loaded after the flags as eax gets clobbered
        mov     edx, [esi + 4]

        cmpxchg8b [edi]

        mov     [esi], eax                      ; write edx:eax back
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD              ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                               ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1929
; Same as iemAImpl_cmpxchg8b, but with a LOCK prefix on the CMPXCHG8B instruction.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; In: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        push    rbx                             ; rbx is used for the replacement value below

        mov     r10, rcx                        ; pu64Dst
        mov     r11, rdx                        ; pu64EaxEdx (is also T1)

        mov     ecx, [r8 + 4]                   ; ecx:ebx = replacement value
        mov     ebx, [r8]
        IEM_MAYBE_LOAD_FLAGS_OLD        r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]                      ; edx:eax = comparand; loaded after the flags as eax gets clobbered
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax                      ; write edx:eax back
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD              r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; In: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        push    rbx                             ; rbx is used for the replacement value below

        mov     r11, rdx                        ; pu64EbxEcx (is also T1)
        mov     r10, rcx                        ; pEFlags

        mov     ecx, [r11 + 4]                  ; ecx:ebx = replacement value
        mov     ebx, [r11]
        IEM_MAYBE_LOAD_FLAGS_OLD        r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]                      ; edx:eax = comparand; loaded after the flags as eax gets clobbered
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax                      ; write edx:eax back
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD              r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; In: ecx = pu64Dst, edx = pu64EaxEdx, stack = pu64EbxEcx and pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     esi, edx                        ; pu64EaxEdx
        mov     edi, ecx                        ; pu64Dst
        mov     ecx, [esp + 16 + 4 + 0]         ; pu64EbxEcx (16 bytes of pushes + return address)
        mov     ebp, [esp + 16 + 4 + 4]         ; pEFlags

        mov     ebx, [ecx]                      ; ebx first, the next load overwrites the pointer in ecx
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD        ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]                      ; edx:eax = comparand; loaded after the flags as eax gets clobbered
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax                      ; write edx:eax back
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD              ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                               ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
2004
2005%ifdef RT_ARCH_AMD64
2006
2007;
2008; CMPXCHG16B.
2009;
2010; These are tricky register wise, so the code is duplicated for each calling
2011; convention.
2012;
2013; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2014;
2015; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2017; uint32_t *pEFlags));
2018;
2019; Note! Identical to iemAImpl_cmpxchg8b.
2020;
2021BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; In: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx                             ; rbx is used for the replacement value below

        mov     r10, rcx                        ; pu128Dst
        mov     r11, rdx                        ; pu128RaxRdx (is also T1)

        mov     rcx, [r8 + 8]                   ; rcx:rbx = replacement value
        mov     rbx, [r8]
        IEM_MAYBE_LOAD_FLAGS_OLD        r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]                      ; rdx:rax = comparand; loaded after the flags as eax gets clobbered
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax                      ; write rdx:rax back
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS_OLD              r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; In: rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx                             ; rbx is used for the replacement value below

        mov     r11, rdx                        ; pu128RbxRcx (is also T1)
        mov     r10, rcx                        ; pEFlags

        mov     rcx, [r11 + 8]                  ; rcx:rbx = replacement value
        mov     rbx, [r11]
        IEM_MAYBE_LOAD_FLAGS_OLD        r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]                      ; rdx:rax = comparand; loaded after the flags as eax gets clobbered
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax                      ; write rdx:rax back
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS_OLD              r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
2066
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; Arguments: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        push    rbx                             ; rbx is an implicit cmpxchg16b operand; restored before returning.

        mov     r10, rcx                        ; r10 = pu128Dst     (rcx is needed for the exchange value)
        mov     r11, rdx                        ; r11 = pu128RaxRdx  (rdx is needed for the comparand; r11 is also T1)

        mov     rcx, [r8 + 8]                   ; rcx:rbx = value to store on a match
        mov     rbx, [r8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rdx, [r11 + 8]                  ; rdx:rax = comparand, loaded after T0 was clobbered above
        mov     rax, [r11]

        lock cmpxchg16b [r10]

        mov     [r11 + 8], rdx                  ; hand rdx:rax back before T1 (r11) is clobbered below
        mov     [r11], rax
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        push    rbx                             ; rbx is an implicit cmpxchg16b operand; restored before returning.

        mov     r11, rdx                        ; r11 = pu128RbxRcx  (is also T1)
        mov     r10, rcx                        ; r10 = pEFlags      (rcx is needed for the exchange value)

        mov     rcx, [r11 + 8]                  ; rcx:rbx = value to store on a match
        mov     rbx, [r11]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rdx, [rsi + 8]                  ; rdx:rax = comparand, loaded after T0 was clobbered above
        mov     rax, [rsi]

        lock cmpxchg16b [rdi]

        mov     [rsi + 8], rdx                  ; hand rdx:rax back to the caller
        mov     [rsi], rax
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
2111
2112%endif ; RT_ARCH_AMD64
2113
2114
2115;
2116; CMPXCHG.
2117;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
;       IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
2122;
2123BEGINCODE
;;
; Generates iemAImpl_cmpxchg_u8/u16/u32/u64 with the given name suffix.
;
; Each function loads the accumulator value from [A1], runs CMPXCHG against
; the destination at [A0] with the register operand in A2, writes the
; accumulator back to [A1] and saves the resulting status flags via A3.
;
; @param        1       Instruction prefix: empty or 'lock'.
; @param        2       Function name suffix: empty or '_locked'.
;
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                        ; pu64Dst
        mov     esi, edx                        ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0]         ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4]         ; pEFlags

        mov     ebx, [ecx]                      ; ecx:ebx = value to store on a match
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]                      ; edx:eax = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only produces ZF (set on match, clear on mismatch), whereas
        ; cmpxchg also sets CF, PF, AF, SF and OF like a CMP, so those have to
        ; be synthesized here.
        jnz     .cmpxchg8b_not_equal            ; ZF=0: the compare failed and edx:eax now holds the destination value.
        cmp     eax, eax                        ; Match: produce the flags of a zero difference (ZF=1, the rest as for equal operands).
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; [esi] still holds the original comparand at this point, edx:eax the
        ; destination value.  Compare the high dwords first and only look at
        ; the low dwords when the high ones are identical.
        ;; @todo FIXME - this is only an approximation of a 64-bit compare: ZF and CF
        ;;       come out right, but PF/AF (and SF/OF when the high dwords are equal)
        ;;       are derived from a single 32-bit compare.
        cmp     [esi + 4], edx
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
2213
2214
2215
;;
; Worker for IEMIMPL_UNARY_OP that emits exactly one function.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       The operand size keyword (byte, word, dword or qword).
; @param        5       The function name suffix (_u8, _u8_locked, ...).
; @param        6       Non-zero to emit the instruction with a lock prefix.
;
%macro IEMIMPL_UNARY_OP_ONE 6
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ %5, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
 %if %6 != 0
        lock %1 %4 [A0]
 %else
        %1      %4 [A0]
 %endif
        IEM_SAVE_FLAGS_OLD       A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ %5
%endmacro

;;
; Macro for implementing a unary operator.
;
; This generates the plain and locked functions for 8, 16 and 32 bit memory
; operands, plus the 64 bit ones when building for AMD64 hosts.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
IEMIMPL_UNARY_OP_ONE %1, %2, %3, byte,  _u8,         0

IEMIMPL_UNARY_OP_ONE %1, %2, %3, byte,  _u8_locked,  1

IEMIMPL_UNARY_OP_ONE %1, %2, %3, word,  _u16,        0

IEMIMPL_UNARY_OP_ONE %1, %2, %3, word,  _u16_locked, 1

IEMIMPL_UNARY_OP_ONE %1, %2, %3, dword, _u32,        0

IEMIMPL_UNARY_OP_ONE %1, %2, %3, dword, _u32_locked, 1

 %ifdef RT_ARCH_AMD64
IEMIMPL_UNARY_OP_ONE %1, %2, %3, qword, _u64,        0

IEMIMPL_UNARY_OP_ONE %1, %2, %3, qword, _u64_locked, 1
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
2304
2305
2306;
2307; BSWAP. No flag changes.
2308;
2309; Each function takes one argument, pointer to the value to bswap
2310; (input/output). They all return void.
2311;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]                     ; Fetch all 32 bits in case the CPU looks at the upper half too.
        db      66h                             ; Hand-coded operand size prefix turning the next instruction into a 16-bit BSWAP.
        bswap   T0_32
        mov     [A0], T0_32                     ; Store whatever the host produced for the whole dword.
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
2320
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]                     ; T0 = *pu32
        bswap   T0_32                           ; reverse the four bytes
        mov     [A0], T0_32                     ; *pu32 = T0
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
2328
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0, [A0]                        ; T0 = *pu64
        bswap   T0                              ; reverse all eight bytes in one go
        mov     [A0], T0
%else
        ; 32-bit host: byte swap each half and store the halves crosswise.
        mov     T1, [A0 + 4]                    ; T1 = high dword
        mov     T0, [A0]                        ; T0 = low dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0                    ; swapped low dword becomes the new high dword
        mov     [A0], T1                        ; swapped high dword becomes the new low dword
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
2347
2348
;;
; Worker for IEMIMPL_SHIFT_OP that emits exactly one function.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Force load flags.
; @param        5       The operand size keyword (byte, word, dword or qword).
; @param        6       The function name suffix (_u8, _u16, _u32 or _u64).
;
%macro IEMIMPL_SHIFT_OP_ONE 6
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ %6, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        ; CL is not used for any argument here, so the count is simply copied into it.
        IEM_MAYBE_LOAD_FLAGS    A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      %5 [A1], cl
        IEM_SAVE_FLAGS_RETVAL   A0_32, %2, %3, 0
 %else
        ; A0 is (E/R)CX here, so swap it with A2 to get the count into CL and
        ; the incoming flags into A2.
        xchg    A2, A0
        IEM_MAYBE_LOAD_FLAGS    A2_32, %2, %3, %4
        %1      %5 [A1], cl
        IEM_SAVE_FLAGS_RETVAL   A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ %6
%endmacro

;;
; Macro for implementing a shift operation.
;
; This generates the functions for 8, 16 and 32 bit memory operands, plus the
; 64 bit one when building for AMD64 hosts.
;
; All the functions take the incoming eflags value (fEFlagsIn) in A0, a
; pointer to the destination memory operand in A1 and the shift count in A2.
; The resulting eflags are handed back via IEM_SAVE_FLAGS_RETVAL.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Force load flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.  Specifically, that with
; GCC/64 we're free to use RCX/CL as it isn't used for any arguments.  While
; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
; so we have to switch it around with the shift count parameter registers.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, byte,  _u8

IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, word,  _u16

IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, dword, _u32

 %ifdef RT_ARCH_AMD64
IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, qword, _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others).  Thus,
; all modified and undefined flags have to be force loaded.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2451
2452
2453;;
2454; Macro for implementing a double precision shift operation.
2455;
2456; This will generate code for the 16, 32 and 64 bit accesses, except on
2457; 32-bit system where the 64-bit accesses requires hand coding.
2458;
2459; The functions takes the destination operand (r/m) in A0, the source (reg) in
2460; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2461;
2462; @param 1 The instruction mnemonic.
2463; @param 2 The modified flags.
2464; @param 3 The undefined flags.
2465; @param 4 The force loaded flags.
2466;
2467; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2468;
2469; @note the _intel and _amd variants are implemented in C.
2470;
%macro IEMIMPL_SHIFT_DBL_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                          ; Count into CL (A3 is RCX here); pEFlags parks in A2 meanwhile.
        %1      [A0], A1_16, cl
        xchg    A3, A2                          ; Undo, so A3 is the eflags pointer again for the save below.
 %else
        xchg    A0, A2                          ; Count into CL (A0 is (E/R)CX here); the destination pointer moves to A2.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                          ; Count into CL (A3 is RCX here); pEFlags parks in A2 meanwhile.
        %1      [A0], A1_32, cl
        xchg    A3, A2                          ; Undo, so A3 is the eflags pointer again for the save below.
 %else
        xchg    A0, A2                          ; Count into CL (A0 is (E/R)CX here); the destination pointer moves to A2.
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2                          ; Count into CL (A3 is RCX here); pEFlags parks in A2 meanwhile.
        %1      [A0], A1, cl
        xchg    A3, A2                          ; Undo, so A3 is the eflags pointer again for the save below.
  %else
        xchg    A0, A2                          ; Count into CL (A0 is RCX here); the destination pointer moves to A2.
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others).  Thus,
; all modified and undefined flags have to be force loaded.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2530
2531
2532;;
; Macro for implementing multiplication operations.
2534;
2535; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2536; 32-bit system where the 64-bit accesses requires hand coding.
2537;
2538; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2539; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2540; pointer to eflags in A3.
2541;
2542; The functions all return 0 so the caller can be used for div/idiv as well as
2543; for the mul/imul implementation.
2544;
2545; @param 1 The instruction mnemonic.
2546; @param 2 The modified flags.
2547; @param 3 The undefined flags.
2548; @param 4 Name suffix.
2549; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2550;
2551; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2552;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
;
; 8-bit: A0 = pointer to AX, A1 = operand, A2 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                        ; The full 16-bit product lands in AX.
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD       A2, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
 %endif
        xor     eax, eax                        ; Always return 0.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

;
; 16-bit: A0 = pointer to AX, A1 = pointer to DX, A2 = operand, A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16                           ; A1 survives the instruction here, so it is used directly.
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                          ; A1 lives in (E/R)DX, which the instruction overwrites.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
 %endif
        xor     eax, eax                        ; Always return 0.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

;
; 32-bit: A0 = pointer to EAX, A1 = pointer to EDX, A2 = operand, A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32                           ; A1 survives the instruction here, so it is used directly.
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                          ; A1 lives in (E/R)DX, which the instruction overwrites.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
 %endif
        xor     eax, eax                        ; Always return 0.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; 64-bit: A0 = pointer to RAX, A1 = pointer to RDX, A2 = operand, A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2                              ; A1 survives the instruction here, so it is used directly.
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                          ; A1 lives in RDX, which the instruction overwrites.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
  %endif
        xor     eax, eax                        ; Always return 0.
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
2649
2650
2651BEGINCODE
;;
; Worker function for negating the 64-bit number held in T1_32:T0_32
; (T1 = high dword, T0 = low dword).
;
; Only T0 and T1 are modified; no stack is used.  The status flags are left
; in an unspecified state, so callers must not rely on them.
;
; @uses     None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        neg     T1_32                           ; high = -high
        neg     T0_32                           ; low  = -low, CF = (low != 0)
        sbb     T1_32, 0                        ; propagate the borrow from the low half
        ret
ENDPROC   iemAImpl_negate_T0_T1_u32
2665
2666%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit number held in T1:T0
; (T1 = high qword, T0 = low qword).
;
; Only T0 and T1 are modified; no stack is used.  The status flags are left
; in an unspecified state, so callers must not rely on them.
;
; @uses     None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        neg     T1                              ; high = -high
        neg     T0                              ; low  = -low, CF = (low != 0)
        sbb     T1, 0                           ; propagate the borrow from the low half
        ret
ENDPROC   iemAImpl_negate_T0_T1_u64
2680%endif
2681
2682
2683;;
; Macro for implementing division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2691; pointer to eflags in A3.
2692;
2693; The functions all return 0 on success and -1 if a divide error should be
2694; raised by the caller.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
;
; 8-bit: A0 = pointer to AX, A1 = divisor, A2 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff                     ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8                  ; Overflows when AH >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]                ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                          ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                                   ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                              ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16                    ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0                              ; (pop leaves the compare result in the flags untouched)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f                      ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8                            ; A1 = |divisor|; the upper bits were zeroed by the AND above.
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                                 ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16                    ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                          ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD    A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD      A2, %2, %3, 0
 %endif
        xor     eax, eax                        ; Success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                         ; Tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

;
; 16-bit: A0 = pointer to AX, A1 = pointer to DX, A2 = divisor, A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_32, 0xffff                   ; Zero extend it for simpler sign overflow checks (see below).
                                                ; Must be a 32-bit AND: a 16-bit one leaves bits 16 thru 31
                                                ; untouched, and they take part in the 32-bit compares below.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16                     ; Overflows when DX >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]                ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]                     ; T0 = dividend
        mov     T1, A2                          ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                                   ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                              ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32                    ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0                              ; (pop leaves the compare result in the flags untouched)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff                   ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16                           ; T1 = |divisor|; relies on bits 16 thru 31 being zero.
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                                 ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32                    ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                          ; A2 lives in RDX, which is needed for the dividend.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                          ; A1 lives in (E/R)DX, which is needed for the dividend.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD    A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD      A3, %2, %3, 0
 %endif
        xor     eax, eax                        ; Success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                         ; Tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

;
; 32-bit: A0 = pointer to EAX, A1 = pointer to EDX, A2 = divisor, A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32                     ; Overflows when EDX >= divisor.
        jae     .div_overflow
 %else
        push    A2                              ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]                     ; T0 = dividend low
        mov     T1_32, [A1]                     ; T1 = dividend high
        ;test    A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                                   ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000               ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                              ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0                              ; (pop leaves the compare result in the flags untouched)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff               ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                                 ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000               ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                              ; Get the unmodified divisor back.
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                          ; A2 lives in RDX, which is needed for the dividend.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                          ; A1 lives in (E/R)DX, which is needed for the dividend.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD    A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD      A3, %2, %3, 0
 %endif
        xor     eax, eax                        ; Success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                              ; Balance the 'push A2' of the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1                         ; Tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; 64-bit: A0 = pointer to RAX, A1 = pointer to RDX, A2 = divisor, A3 = pointer to eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero

        ; Overflow check, same scheme as for the narrower variants.
  %if %4 == 0
        cmp     [A1], A2                        ; Overflows when RDX >= divisor.
        jae     .div_overflow
  %else
        push    A2                              ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                        ; T0 = dividend low
        mov     T1, [A1]                        ; T1 = dividend high
        ;test    A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                                   ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                          ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                              ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0                              ; (pop leaves the compare result in the flags untouched)
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                          ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                                 ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                          ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                              ; Get the unmodified divisor back.
  %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                          ; A2 lives in RDX, which is needed for the dividend.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                          ; A1 lives in RDX, which is needed for the dividend.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD    A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS_OLD      A3, %2, %3, 0
  %endif
        xor     eax, eax                        ; Success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                              ; Balance the 'push A2' of the signed overflow check.
  %endif
.div_zero:
        mov     eax, -1                         ; Tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,       , 0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1,       , 0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
3039
3040
3041;;
3042; Macro for implementing memory fence operation.
3043;
3044; No return value, no operands or anything.
3045;
3046; @param 1 The instruction.
3047;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                                      ; The fence instruction itself is all there is to it.
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

; One function per fence instruction: iemAImpl_lfence, iemAImpl_sfence and iemAImpl_mfence.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
3059
3060;;
3061; Alternative for non-SSE2 host.
3062;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                             ; Make a scratch stack slot holding the current xAX.
        xchg    xAX, [xSP]                      ; XCHG with a memory operand is implicitly locked; xAX ends up unchanged.
        add     xSP, xCB                        ; Drop the scratch slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
3069
3070
3071;;
3072; Initialize the FPU for the actual instruction being emulated, this means
3073; loading parts of the guest's control word and status word.
3074;
3075; @uses 24 bytes of stack. T0, T1
3076; @param 1 Expression giving the address of the FXSTATE of the guest.
3077;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                           ; Dump the host FPU environment so it can be patched up.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK           ; Condition code bits from the guest...
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK         ; ...combined with the TOP field of the host.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                           ; Activate the patched environment.
%endmacro
3096
3097
3098;;
3099; Initialize the FPU for the actual instruction being emulated, this means
3100; loading parts of the guest's control word, status word, and update the
3101; tag word for the top register if it's empty.
3102;
3103; ASSUMES actual TOP=7
3104;
3105; @uses The X86FSTENV32P image at [xSP] (28 bytes per the Intel SDM; callers here reserve 20h). T0, T1
3106; @param 1 Expression giving the address of the FXSTATE of the guest.
3107;
3108%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
3109 fnstenv [xSP] ; Save the current host FPU environment as the image to patch.
3110
3111 ; FCW - for exception, precision and rounding control.
3112 movzx T0_32, word [%1 + X86FXSTATE.FCW]
3113 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3114 mov [xSP + X86FSTENV32P.FCW], T0_16
3115
3116 ; FSW - for undefined C0, C1, C2, and C3.
3117 movzx T1_32, word [%1 + X86FXSTATE.FSW] ; T1 = guest condition code bits only.
3118 and T1_32, X86_FSW_C_MASK
3119 movzx T0_32, word [xSP + X86FSTENV32P.FSW] ; T0 = TOP from the saved (host) FSW only.
3120 and T0_32, X86_FSW_TOP_MASK
3121 or T0_32, T1_32
3122 mov [xSP + X86FSTENV32P.FSW], T0_16
3123
3124 ; FTW - Only for ST0 (in/out). T1 is set to the guest's TOP and used as the bit index.
3125 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3126 shr T1_32, X86_FSW_TOP_SHIFT
3127 and T1_32, X86_FSW_TOP_SMASK
3128 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
3129 jc %%st0_not_empty
3130 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
3131%%st0_not_empty:
3132
3133 fldenv [xSP] ; Load the patched environment.
3134%endmacro
3135
3136
3137;;
3138; Layout of the FPU result output: a 80-bit value followed by the FSW.
3139; Presumably mirrors the C-side IEMFPURESULT (verify). Need to move this as well somewhere better?
3140struc IEMFPURESULT
3141 .r80Result resw 5 ; 10 bytes: the 80-bit result value.
3142 .FSW resw 1 ; The output FPU status word.
3143endstruc
3144
3145
3146;;
3147; Layout of the FPU output for instructions producing two 80-bit values, with the FSW in between.
3148; Presumably mirrors the C-side IEMFPURESULTTWO (verify). Need to move this as well somewhere better?
3149struc IEMFPURESULTTWO
3150 .r80Result1 resw 5 ; 10 bytes: the first 80-bit result value.
3151 .FSW resw 1 ; The output FPU status word.
3152 .r80Result2 resw 5 ; 10 bytes: the second 80-bit result value.
3153endstruc
3154
3155
3156;
3157;---------------------- 16-bit signed integer operations ----------------------
3158;
3159
3160
3161;;
3162; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
3163;
3164; @param A0 FPU context (fxsave).
3165; @param A1 Pointer to a IEMFPURESULT for the output.
3166; @param A2 Pointer to the 16-bit signed integer value to convert.
3167;
3168BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
3169 PROLOGUE_3_ARGS
3170 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3171
3172 fninit ; Reset the host FPU first.
3173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3174 fild word [A2]
3175
3176 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3177 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3178 fstp tword [A1 + IEMFPURESULT.r80Result]
3179
3180 fninit ; Leave the host FPU reset.
3181 add xSP, 20h
3182 EPILOGUE_3_ARGS
3183ENDPROC iemAImpl_fild_r80_from_i16
3184
3185
3186;;
3187; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
3188; The conversion runs with the guest's FCW loaded from A0.
3189; @param A0 FPU context (fxsave).
3190; @param A1 Where to return the output FSW.
3191; @param A2 Where to store the 16-bit signed integer value.
3192; @param A3 Pointer to the 80-bit value.
3193;
3194BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
3195 PROLOGUE_4_ARGS
3196 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3197
3198 fninit ; Reset the host FPU first.
3199 fld tword [A3] ; ST0 = value to convert.
3200 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3201 fistp word [A2]
3202
3203 fnstsw word [A1] ; Capture the status word produced by the store.
3204
3205 fninit ; Leave the host FPU reset.
3206 add xSP, 20h
3207 EPILOGUE_4_ARGS
3208ENDPROC iemAImpl_fist_r80_to_i16
3209
3210
3211;;
3212; Store a 80-bit floating point value (register) as a 16-bit signed integer
3213; (memory) with truncation (uses fisttp).
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 16-bit signed integer value.
3218; @param A3 Pointer to the 80-bit value.
3219;
3220BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
3221 PROLOGUE_4_ARGS
3222 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3223
3224 fninit ; Reset the host FPU first.
3225 fld tword [A3] ; ST0 = value to convert.
3226 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3227 fisttp word [A2]
3228
3229 fnstsw word [A1] ; Capture the status word produced by the store.
3230
3231 fninit ; Leave the host FPU reset.
3232 add xSP, 20h
3233 EPILOGUE_4_ARGS
3234ENDPROC iemAImpl_fistt_r80_to_i16
3235
3236
3237;;
3238; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3239; Emits iemAImpl_<instruction>_r80_by_i16, returning the 80-bit result and FSW.
3240; @param 1 The instruction
3241;
3242; @param A0 FPU context (fxsave).
3243; @param A1 Pointer to a IEMFPURESULT for the output.
3244; @param A2 Pointer to the 80-bit value.
3245; @param A3 Pointer to the 16-bit value.
3246;
3247%macro IEMIMPL_FPU_R80_BY_I16 1
3248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3249 PROLOGUE_4_ARGS
3250 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3251
3252 fninit ; Reset the host FPU first.
3253 fld tword [A2] ; ST0 = the 80-bit operand.
3254 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3255 %1 word [A3]
3256
3257 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
3258 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3259 fstp tword [A1 + IEMFPURESULT.r80Result]
3260
3261 fninit ; Leave the host FPU reset.
3262 add xSP, 20h
3263 EPILOGUE_4_ARGS
3264ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3265%endmacro
3266
3267IEMIMPL_FPU_R80_BY_I16 fiadd
3268IEMIMPL_FPU_R80_BY_I16 fimul
3269IEMIMPL_FPU_R80_BY_I16 fisub
3270IEMIMPL_FPU_R80_BY_I16 fisubr
3271IEMIMPL_FPU_R80_BY_I16 fidiv
3272IEMIMPL_FPU_R80_BY_I16 fidivr
3273
3274
3275;;
3276; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3277; only returning FSW.
3278;
3279; @param 1 The instruction
3280;
3281; @param A0 FPU context (fxsave).
3282; @param A1 Where to store the output FSW.
3283; @param A2 Pointer to the 80-bit value.
3284; @param A3 Pointer to the 16-bit value.
3285;
3286%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
3287BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3288 PROLOGUE_4_ARGS
3289 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3290
3291 fninit ; Reset the host FPU first.
3292 fld tword [A2] ; ST0 = the 80-bit operand.
3293 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3294 %1 word [A3]
3295
3296 fnstsw word [A1] ; The status word is the only output.
3297
3298 fninit ; Leave the host FPU reset.
3299 add xSP, 20h
3300 EPILOGUE_4_ARGS
3301ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3302%endmacro
3303
3304IEMIMPL_FPU_R80_BY_I16_FSW ficom
3305
3306
3307
3308;
3309;---------------------- 32-bit signed integer operations ----------------------
3310;
3311
3312
3313;;
3314; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
3315;
3316; @param A0 FPU context (fxsave).
3317; @param A1 Pointer to a IEMFPURESULT for the output.
3318; @param A2 Pointer to the 32-bit signed integer value to convert.
3319;
3320BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
3321 PROLOGUE_3_ARGS
3322 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3323
3324 fninit ; Reset the host FPU first.
3325 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3326 fild dword [A2]
3327
3328 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3329 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3330 fstp tword [A1 + IEMFPURESULT.r80Result]
3331
3332 fninit ; Leave the host FPU reset.
3333 add xSP, 20h
3334 EPILOGUE_3_ARGS
3335ENDPROC iemAImpl_fild_r80_from_i32
3336
3337
3338;;
3339; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
3340; The conversion runs with the guest's FCW loaded from A0.
3341; @param A0 FPU context (fxsave).
3342; @param A1 Where to return the output FSW.
3343; @param A2 Where to store the 32-bit signed integer value.
3344; @param A3 Pointer to the 80-bit value.
3345;
3346BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
3347 PROLOGUE_4_ARGS
3348 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3349
3350 fninit ; Reset the host FPU first.
3351 fld tword [A3] ; ST0 = value to convert.
3352 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3353 fistp dword [A2]
3354
3355 fnstsw word [A1] ; Capture the status word produced by the store.
3356
3357 fninit ; Leave the host FPU reset.
3358 add xSP, 20h
3359 EPILOGUE_4_ARGS
3360ENDPROC iemAImpl_fist_r80_to_i32
3361
3362
3363;;
3364; Store a 80-bit floating point value (register) as a 32-bit signed integer
3365; (memory) with truncation (uses fisttp).
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Where to return the output FSW.
3369; @param A2 Where to store the 32-bit signed integer value.
3370; @param A3 Pointer to the 80-bit value.
3371;
3372BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
3373 PROLOGUE_4_ARGS
3374 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3375
3376 fninit ; Reset the host FPU first.
3377 fld tword [A3] ; ST0 = value to convert.
3378 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3379 fisttp dword [A2]
3380
3381 fnstsw word [A1] ; Capture the status word produced by the store.
3382
3383 fninit ; Leave the host FPU reset.
3384 add xSP, 20h
3385 EPILOGUE_4_ARGS
3386ENDPROC iemAImpl_fistt_r80_to_i32
3387
3388
3389;;
3390; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3391; Emits iemAImpl_<instruction>_r80_by_i32, returning the 80-bit result and FSW.
3392; @param 1 The instruction
3393;
3394; @param A0 FPU context (fxsave).
3395; @param A1 Pointer to a IEMFPURESULT for the output.
3396; @param A2 Pointer to the 80-bit value.
3397; @param A3 Pointer to the 32-bit value.
3398;
3399%macro IEMIMPL_FPU_R80_BY_I32 1
3400BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3401 PROLOGUE_4_ARGS
3402 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3403
3404 fninit ; Reset the host FPU first.
3405 fld tword [A2] ; ST0 = the 80-bit operand.
3406 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3407 %1 dword [A3]
3408
3409 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
3410 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3411 fstp tword [A1 + IEMFPURESULT.r80Result]
3412
3413 fninit ; Leave the host FPU reset.
3414 add xSP, 20h
3415 EPILOGUE_4_ARGS
3416ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3417%endmacro
3418
3419IEMIMPL_FPU_R80_BY_I32 fiadd
3420IEMIMPL_FPU_R80_BY_I32 fimul
3421IEMIMPL_FPU_R80_BY_I32 fisub
3422IEMIMPL_FPU_R80_BY_I32 fisubr
3423IEMIMPL_FPU_R80_BY_I32 fidiv
3424IEMIMPL_FPU_R80_BY_I32 fidivr
3425
3426
3427;;
3428; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3429; only returning FSW.
3430;
3431; @param 1 The instruction
3432;
3433; @param A0 FPU context (fxsave).
3434; @param A1 Where to store the output FSW.
3435; @param A2 Pointer to the 80-bit value.
3436; @param A3 Pointer to the 32-bit value.
3437;
3438%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
3439BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3440 PROLOGUE_4_ARGS
3441 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3442
3443 fninit ; Reset the host FPU first.
3444 fld tword [A2] ; ST0 = the 80-bit operand.
3445 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3446 %1 dword [A3]
3447
3448 fnstsw word [A1] ; The status word is the only output.
3449
3450 fninit ; Leave the host FPU reset.
3451 add xSP, 20h
3452 EPILOGUE_4_ARGS
3453ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3454%endmacro
3455
3456IEMIMPL_FPU_R80_BY_I32_FSW ficom
3457
3458
3459
3460;
3461;---------------------- 64-bit signed integer operations ----------------------
3462;
3463
3464
3465;;
3466; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
3467;
3468; @param A0 FPU context (fxsave).
3469; @param A1 Pointer to a IEMFPURESULT for the output.
3470; @param A2 Pointer to the 64-bit signed integer value to convert.
3471;
3472BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
3473 PROLOGUE_3_ARGS
3474 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3475
3476 fninit ; Reset the host FPU first.
3477 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3478 fild qword [A2]
3479
3480 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3481 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3482 fstp tword [A1 + IEMFPURESULT.r80Result]
3483
3484 fninit ; Leave the host FPU reset.
3485 add xSP, 20h
3486 EPILOGUE_3_ARGS
3487ENDPROC iemAImpl_fild_r80_from_i64
3488
3489
3490;;
3491; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3492; The conversion runs with the guest's FCW loaded from A0.
3493; @param A0 FPU context (fxsave).
3494; @param A1 Where to return the output FSW.
3495; @param A2 Where to store the 64-bit signed integer value.
3496; @param A3 Pointer to the 80-bit value.
3497;
3498BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3499 PROLOGUE_4_ARGS
3500 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3501
3502 fninit ; Reset the host FPU first.
3503 fld tword [A3] ; ST0 = value to convert.
3504 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3505 fistp qword [A2]
3506
3507 fnstsw word [A1] ; Capture the status word produced by the store.
3508
3509 fninit ; Leave the host FPU reset.
3510 add xSP, 20h
3511 EPILOGUE_4_ARGS
3512ENDPROC iemAImpl_fist_r80_to_i64
3513
3514
3515;;
3516; Store a 80-bit floating point value (register) as a 64-bit signed integer
3517; (memory) with truncation (uses fisttp).
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Where to return the output FSW.
3521; @param A2 Where to store the 64-bit signed integer value.
3522; @param A3 Pointer to the 80-bit value.
3523;
3524BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3525 PROLOGUE_4_ARGS
3526 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3527
3528 fninit ; Reset the host FPU first.
3529 fld tword [A3] ; ST0 = value to convert.
3530 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3531 fisttp qword [A2]
3532
3533 fnstsw word [A1] ; Capture the status word produced by the store.
3534
3535 fninit ; Leave the host FPU reset.
3536 add xSP, 20h
3537 EPILOGUE_4_ARGS
3538ENDPROC iemAImpl_fistt_r80_to_i64
3539
3540
3541
3542;
3543;---------------------- 32-bit floating point operations ----------------------
3544;
3545
3546;;
3547; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3548; The FSW resulting from the load is returned alongside the value.
3549; @param A0 FPU context (fxsave).
3550; @param A1 Pointer to a IEMFPURESULT for the output.
3551; @param A2 Pointer to the 32-bit floating point value to convert.
3552;
3553BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3554 PROLOGUE_3_ARGS
3555 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3556
3557 fninit ; Reset the host FPU first.
3558 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3559 fld dword [A2]
3560
3561 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3562 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3563 fstp tword [A1 + IEMFPURESULT.r80Result]
3564
3565 fninit ; Leave the host FPU reset.
3566 add xSP, 20h
3567 EPILOGUE_3_ARGS
3568ENDPROC iemAImpl_fld_r80_from_r32
3569
3570
3571;;
3572; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3573; The conversion runs with the guest's FCW loaded from A0.
3574; @param A0 FPU context (fxsave).
3575; @param A1 Where to return the output FSW.
3576; @param A2 Where to store the 32-bit value.
3577; @param A3 Pointer to the 80-bit value.
3578;
3579BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3580 PROLOGUE_4_ARGS
3581 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3582
3583 fninit ; Reset the host FPU first.
3584 fld tword [A3] ; ST0 = value to store.
3585 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3586 fst dword [A2] ; Non-popping store; the fninit below discards ST0.
3587
3588 fnstsw word [A1] ; Capture the status word produced by the store.
3589
3590 fninit ; Leave the host FPU reset.
3591 add xSP, 20h
3592 EPILOGUE_4_ARGS
3593ENDPROC iemAImpl_fst_r80_to_r32
3594
3595
3596;;
3597; FPU instruction working on one 80-bit and one 32-bit floating point value.
3598; Emits iemAImpl_<instruction>_r80_by_r32, returning the 80-bit result and FSW.
3599; @param 1 The instruction
3600;
3601; @param A0 FPU context (fxsave).
3602; @param A1 Pointer to a IEMFPURESULT for the output.
3603; @param A2 Pointer to the 80-bit value.
3604; @param A3 Pointer to the 32-bit value.
3605;
3606%macro IEMIMPL_FPU_R80_BY_R32 1
3607BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3608 PROLOGUE_4_ARGS
3609 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3610
3611 fninit ; Reset the host FPU first.
3612 fld tword [A2] ; ST0 = the 80-bit operand.
3613 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3614 %1 dword [A3]
3615
3616 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
3617 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3618 fstp tword [A1 + IEMFPURESULT.r80Result]
3619
3620 fninit ; Leave the host FPU reset.
3621 add xSP, 20h
3622 EPILOGUE_4_ARGS
3623ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3624%endmacro
3625
3626IEMIMPL_FPU_R80_BY_R32 fadd
3627IEMIMPL_FPU_R80_BY_R32 fmul
3628IEMIMPL_FPU_R80_BY_R32 fsub
3629IEMIMPL_FPU_R80_BY_R32 fsubr
3630IEMIMPL_FPU_R80_BY_R32 fdiv
3631IEMIMPL_FPU_R80_BY_R32 fdivr
3632
3633
3634;;
3635; FPU instruction working on one 80-bit and one 32-bit floating point value,
3636; only returning FSW.
3637;
3638; @param 1 The instruction
3639;
3640; @param A0 FPU context (fxsave).
3641; @param A1 Where to store the output FSW.
3642; @param A2 Pointer to the 80-bit value.
3643; @param A3 Pointer to the 32-bit value.
3644;
3645%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3647 PROLOGUE_4_ARGS
3648 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3649
3650 fninit ; Reset the host FPU first.
3651 fld tword [A2] ; ST0 = the 80-bit operand.
3652 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3653 %1 dword [A3]
3654
3655 fnstsw word [A1] ; The status word is the only output.
3656
3657 fninit ; Leave the host FPU reset.
3658 add xSP, 20h
3659 EPILOGUE_4_ARGS
3660ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3661%endmacro
3662
3663IEMIMPL_FPU_R80_BY_R32_FSW fcom
3664
3665
3666
3667;
3668;---------------------- 64-bit floating point operations ----------------------
3669;
3670
3671;;
3672; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3673; The FSW resulting from the load is returned alongside the value.
3674; @param A0 FPU context (fxsave).
3675; @param A1 Pointer to a IEMFPURESULT for the output.
3676; @param A2 Pointer to the 64-bit floating point value to convert.
3677;
3678BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3679 PROLOGUE_3_ARGS
3680 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3681
3682 fninit ; Reset the host FPU first.
3683 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3684 fld qword [A2]
3685
3686 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3687 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3688 fstp tword [A1 + IEMFPURESULT.r80Result]
3689
3690 fninit ; Leave the host FPU reset.
3691 add xSP, 20h
3692 EPILOGUE_3_ARGS
3693ENDPROC iemAImpl_fld_r80_from_r64
3694
3695
3696;;
3697; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3698; The conversion runs with the guest's FCW loaded from A0.
3699; @param A0 FPU context (fxsave).
3700; @param A1 Where to return the output FSW.
3701; @param A2 Where to store the 64-bit value.
3702; @param A3 Pointer to the 80-bit value.
3703;
3704BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3705 PROLOGUE_4_ARGS
3706 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3707
3708 fninit ; Reset the host FPU first.
3709 fld tword [A3] ; ST0 = value to store.
3710 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3711 fst qword [A2] ; Non-popping store; the fninit below discards ST0.
3712
3713 fnstsw word [A1] ; Capture the status word produced by the store.
3714
3715 fninit ; Leave the host FPU reset.
3716 add xSP, 20h
3717 EPILOGUE_4_ARGS
3718ENDPROC iemAImpl_fst_r80_to_r64
3719
3720
3721;;
3722; FPU instruction working on one 80-bit and one 64-bit floating point value.
3723; Emits iemAImpl_<instruction>_r80_by_r64, returning the 80-bit result and FSW.
3724; @param 1 The instruction
3725;
3726; @param A0 FPU context (fxsave).
3727; @param A1 Pointer to a IEMFPURESULT for the output.
3728; @param A2 Pointer to the 80-bit value.
3729; @param A3 Pointer to the 64-bit value.
3730;
3731%macro IEMIMPL_FPU_R80_BY_R64 1
3732BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3733 PROLOGUE_4_ARGS
3734 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3735
3736 fninit ; Reset the host FPU first.
3737 fld tword [A2] ; ST0 = the 80-bit operand.
3738 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3739 %1 qword [A3]
3740
3741 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
3742 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3743 fstp tword [A1 + IEMFPURESULT.r80Result]
3744
3745 fninit ; Leave the host FPU reset.
3746 add xSP, 20h
3747 EPILOGUE_4_ARGS
3748ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3749%endmacro
3750
3751IEMIMPL_FPU_R80_BY_R64 fadd
3752IEMIMPL_FPU_R80_BY_R64 fmul
3753IEMIMPL_FPU_R80_BY_R64 fsub
3754IEMIMPL_FPU_R80_BY_R64 fsubr
3755IEMIMPL_FPU_R80_BY_R64 fdiv
3756IEMIMPL_FPU_R80_BY_R64 fdivr
3757
3758;;
3759; FPU instruction working on one 80-bit and one 64-bit floating point value,
3760; only returning FSW.
3761; Emits iemAImpl_<instruction>_r80_by_r64.
3762; @param 1 The instruction
3763;
3764; @param A0 FPU context (fxsave).
3765; @param A1 Where to store the output FSW.
3766; @param A2 Pointer to the 80-bit value.
3767; @param A3 Pointer to the 64-bit value.
3768;
3769%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3770BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3771 PROLOGUE_4_ARGS
3772 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3773
3774 fninit ; Reset the host FPU first.
3775 fld tword [A2] ; ST0 = the 80-bit operand.
3776 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3777 %1 qword [A3]
3778
3779 fnstsw word [A1] ; The status word is the only output.
3780
3781 fninit ; Leave the host FPU reset.
3782 add xSP, 20h
3783 EPILOGUE_4_ARGS
3784ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3785%endmacro
3786
3787IEMIMPL_FPU_R80_BY_R64_FSW fcom
3788
3789
3790
3791;
3792;---------------------- 80-bit floating point operations ----------------------
3793;
3794
3795;;
3796; Loads a 80-bit floating point register value from memory.
3797; The FSW resulting from the load is returned alongside the value.
3798; @param A0 FPU context (fxsave).
3799; @param A1 Pointer to a IEMFPURESULT for the output.
3800; @param A2 Pointer to the 80-bit floating point value to load.
3801;
3802BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3803 PROLOGUE_3_ARGS
3804 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3805
3806 fninit ; Reset the host FPU first.
3807 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3808 fld tword [A2]
3809
3810 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3811 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3812 fstp tword [A1 + IEMFPURESULT.r80Result]
3813
3814 fninit ; Leave the host FPU reset.
3815 add xSP, 20h
3816 EPILOGUE_3_ARGS
3817ENDPROC iemAImpl_fld_r80_from_r80
3818
3819
3820;;
3821; Store a 80-bit floating point register to memory
3822; (implemented with fstp tword, run with the guest's FCW loaded from A0).
3823; @param A0 FPU context (fxsave).
3824; @param A1 Where to return the output FSW.
3825; @param A2 Where to store the 80-bit value.
3826; @param A3 Pointer to the 80-bit register value.
3827;
3828BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3829 PROLOGUE_4_ARGS
3830 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3831
3832 fninit ; Reset the host FPU first.
3833 fld tword [A3] ; ST0 = value to store.
3834 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3835 fstp tword [A2]
3836
3837 fnstsw word [A1] ; Capture the status word produced by the store.
3838
3839 fninit ; Leave the host FPU reset.
3840 add xSP, 20h
3841 EPILOGUE_4_ARGS
3842ENDPROC iemAImpl_fst_r80_to_r80
3843
3844
3845;;
3846; Loads an 80-bit floating point register value in BCD format from memory.
3847; Uses fbld; the resulting FSW is returned alongside the converted value.
3848; @param A0 FPU context (fxsave).
3849; @param A1 Pointer to a IEMFPURESULT for the output.
3850; @param A2 Pointer to the 80-bit BCD value to load.
3851;
3852BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3853 PROLOGUE_3_ARGS
3854 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3855
3856 fninit ; Reset the host FPU first.
3857 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3858 fbld tword [A2]
3859
3860 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
3861 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3862 fstp tword [A1 + IEMFPURESULT.r80Result]
3863
3864 fninit ; Leave the host FPU reset.
3865 add xSP, 20h
3866 EPILOGUE_3_ARGS
3867ENDPROC iemAImpl_fld_r80_from_d80
3868
3869
3870;;
3871; Store a 80-bit floating point register to memory as BCD
3872; (implemented with fbstp, run with the guest's FCW loaded from A0).
3873; @param A0 FPU context (fxsave).
3874; @param A1 Where to return the output FSW.
3875; @param A2 Where to store the 80-bit BCD value.
3876; @param A3 Pointer to the 80-bit register value.
3877;
3878BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3879 PROLOGUE_4_ARGS
3880 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3881
3882 fninit ; Reset the host FPU first.
3883 fld tword [A3] ; ST0 = value to store.
3884 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3885 fbstp tword [A2]
3886
3887 fnstsw word [A1] ; Capture the status word produced by the store.
3888
3889 fninit ; Leave the host FPU reset.
3890 add xSP, 20h
3891 EPILOGUE_4_ARGS
3892ENDPROC iemAImpl_fst_r80_to_d80
3893
3894
3895;;
3896; FPU instruction working on two 80-bit floating point values.
3897;
3898; @param 1 The instruction
3899; @param 2 The operand list for the instruction, e.g. {st0, st1}; may be empty.
3900; @param A0 FPU context (fxsave).
3901; @param A1 Pointer to a IEMFPURESULT for the output.
3902; @param A2 Pointer to the first 80-bit value (ST0)
3903; @param A3 Pointer to the second 80-bit value (STn).
3904;
3905%macro IEMIMPL_FPU_R80_BY_R80 2
3906BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3907 PROLOGUE_4_ARGS
3908 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3909
3910 fninit ; Reset the host FPU first.
3911 fld tword [A3] ; Pushed first, so it ends up in ST1.
3912 fld tword [A2] ; Pushed last, so it ends up in ST0.
3913 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3914 %1 %2
3915
3916 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
3917 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3918 fstp tword [A1 + IEMFPURESULT.r80Result] ; The result is taken from ST0.
3919
3920 fninit ; Leave the host FPU reset.
3921 add xSP, 20h
3922 EPILOGUE_4_ARGS
3923ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3924%endmacro
3925
3926IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3927IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3928IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3929IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3930IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3931IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3932IEMIMPL_FPU_R80_BY_R80 fprem, {}
3933IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3934IEMIMPL_FPU_R80_BY_R80 fscale, {}
3935
3936
3937;;
3938; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3939; storing the result in ST1 and popping the stack.
3940; After the pop the result is in ST0, which is where it is read from here.
3941; @param 1 The instruction
3942;
3943; @param A0 FPU context (fxsave).
3944; @param A1 Pointer to a IEMFPURESULT for the output.
3945; @param A2 Pointer to the first 80-bit value (ST1).
3946; @param A3 Pointer to the second 80-bit value (ST0).
3947;
3948%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3949BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3950 PROLOGUE_4_ARGS
3951 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3952
3953 fninit ; Reset the host FPU first.
3954 fld tword [A2] ; Pushed first, so it ends up in ST1.
3955 fld tword [A3] ; Pushed last, so it ends up in ST0.
3956 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3957 %1
3958
3959 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
3960 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
3961 fstp tword [A1 + IEMFPURESULT.r80Result]
3962
3963 fninit ; Leave the host FPU reset.
3964 add xSP, 20h
3965 EPILOGUE_4_ARGS
3966ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3967%endmacro
3968
3969IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3970IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3971IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3972
3973
3974;;
3975; FPU instruction working on two 80-bit floating point values, only
3976; returning FSW.
3977; The instruction is issued as "<instruction> st0, st1".
3978; @param 1 The instruction
3979;
3980; @param A0 FPU context (fxsave).
3981; @param A1 Pointer to a uint16_t for the resulting FSW.
3982; @param A2 Pointer to the first 80-bit value.
3983; @param A3 Pointer to the second 80-bit value.
3984;
3985%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3986BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3987 PROLOGUE_4_ARGS
3988 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
3989
3990 fninit ; Reset the host FPU first.
3991 fld tword [A3] ; Pushed first, so it ends up in ST1.
3992 fld tword [A2] ; Pushed last, so it ends up in ST0.
3993 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3994 %1 st0, st1
3995
3996 fnstsw word [A1] ; The status word is the only output.
3997
3998 fninit ; Leave the host FPU reset.
3999 add xSP, 20h
4000 EPILOGUE_4_ARGS
4001ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4002%endmacro
4003
4004IEMIMPL_FPU_R80_BY_R80_FSW fcom
4005IEMIMPL_FPU_R80_BY_R80_FSW fucom
4007
4008;;
4009; FPU instruction working on two 80-bit floating point values,
4010; returning FSW and EFLAGS (eax).
4011;
4012; @param 1 The instruction
4013;
4014; @returns EFLAGS in EAX.
4015; @param A0 FPU context (fxsave).
4016; @param A1 Pointer to a uint16_t for the resulting FSW.
4017; @param A2 Pointer to the first 80-bit value.
4018; @param A3 Pointer to the second 80-bit value.
4019;
4020%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
4021BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
4022 PROLOGUE_4_ARGS
4023 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
4024
4025 fninit ; Reset the host FPU first.
4026 fld tword [A3] ; Pushed first, so it ends up in ST1.
4027 fld tword [A2] ; Pushed last, so it ends up in ST0.
4028 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4029 %1 st1
4030
4031 fnstsw word [A1]
4032 pushf ; Grab the flags right after the compare, before the add below changes them.
4033 pop xAX ; xAX = the flags image, which is the return value.
4034
4035 fninit ; Leave the host FPU reset.
4036 add xSP, 20h
4037 EPILOGUE_4_ARGS
4038ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4039%endmacro
4040
4041IEMIMPL_FPU_R80_BY_R80_EFL fcomi
4042IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4043
4044
4045;;
4046; FPU instruction working on one 80-bit floating point value.
4047; Emits iemAImpl_<instruction>_r80, returning the 80-bit result and FSW.
4048; @param 1 The instruction
4049;
4050; @param A0 FPU context (fxsave).
4051; @param A1 Pointer to a IEMFPURESULT for the output.
4052; @param A2 Pointer to the 80-bit value.
4053;
4054%macro IEMIMPL_FPU_R80 1
4055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4056 PROLOGUE_3_ARGS
4057 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
4058
4059 fninit ; Reset the host FPU first.
4060 fld tword [A2] ; ST0 = the operand.
4061 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4062 %1
4063
4064 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word of the operation.
4065 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
4066 fstp tword [A1 + IEMFPURESULT.r80Result]
4067
4068 fninit ; Leave the host FPU reset.
4069 add xSP, 20h
4070 EPILOGUE_3_ARGS
4071ENDPROC iemAImpl_ %+ %1 %+ _r80
4072%endmacro
4073
4074IEMIMPL_FPU_R80 fchs
4075IEMIMPL_FPU_R80 fabs
4076IEMIMPL_FPU_R80 f2xm1
4077IEMIMPL_FPU_R80 fsqrt
4078IEMIMPL_FPU_R80 frndint
4079IEMIMPL_FPU_R80 fsin
4080IEMIMPL_FPU_R80 fcos
4081
4082
4083;;
4084; FPU instruction working on one 80-bit floating point value, only
4085; returning FSW.
4086;
4087; @param 1 The instruction
4088; @param 2 Non-zero to also restore FTW (tag for ST0 only, via the _AND_FTW_0 loader).
4089;
4090; @param A0 FPU context (fxsave).
4091; @param A1 Pointer to a uint16_t for the resulting FSW.
4092; @param A2 Pointer to the 80-bit value.
4093;
4094%macro IEMIMPL_FPU_R80_FSW 2
4095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4096 PROLOGUE_3_ARGS
4097 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
4098
4099 fninit ; Reset the host FPU first.
4100 fld tword [A2] ; ST0 = the operand; after fninit this load leaves TOP=7.
4101%if %2 != 0
4102 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
4103%else
4104 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4105%endif
4106 %1
4107
4108 fnstsw word [A1] ; The status word is the only output.
4109
4110 fninit ; Leave the host FPU reset.
4111 add xSP, 20h
4112 EPILOGUE_3_ARGS
4113ENDPROC iemAImpl_ %+ %1 %+ _r80
4114%endmacro
4115
4116IEMIMPL_FPU_R80_FSW ftst, 0
4117IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4118
4119
4120
4121;;
4122; FPU instruction loading a 80-bit floating point constant.
4123; Emits iemAImpl_<instruction>, returning the constant and the resulting FSW.
4124; @param 1 The instruction
4125;
4126; @param A0 FPU context (fxsave).
4127; @param A1 Pointer to a IEMFPURESULT for the output.
4128;
4129%macro IEMIMPL_FPU_R80_CONST 1
4130BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
4131 PROLOGUE_2_ARGS
4132 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
4133
4134 fninit ; Reset the host FPU first.
4135 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4136 %1
4137
4138 fnstsw word [A1 + IEMFPURESULT.FSW] ; Capture the status word produced by the load.
4139 fnclex ; Clear exception flags before the store (presumably so fstp cannot fault).
4140 fstp tword [A1 + IEMFPURESULT.r80Result]
4141
4142 fninit ; Leave the host FPU reset.
4143 add xSP, 20h
4144 EPILOGUE_2_ARGS
4145ENDPROC iemAImpl_ %+ %1
4146%endmacro
4147
4148IEMIMPL_FPU_R80_CONST fld1
4149IEMIMPL_FPU_R80_CONST fldl2t
4150IEMIMPL_FPU_R80_CONST fldl2e
4151IEMIMPL_FPU_R80_CONST fldpi
4152IEMIMPL_FPU_R80_CONST fldlg2
4153IEMIMPL_FPU_R80_CONST fldln2
4154IEMIMPL_FPU_R80_CONST fldz
4155
4156
4157;;
4158; FPU instruction working on one 80-bit floating point value, outputting two.
4159; The value left in ST0 goes to r80Result2, the one below it to r80Result1.
4160; @param 1 The instruction
4161;
4162; @param A0 FPU context (fxsave).
4163; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4164; @param A2 Pointer to the 80-bit value.
4165;
4166%macro IEMIMPL_FPU_R80_R80 1
4167BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
4168 PROLOGUE_3_ARGS
4169 sub xSP, 20h ; Scratch area for the fnstenv/fldenv image.
4170
4171 fninit ; Reset the host FPU first.
4172 fld tword [A2] ; ST0 = the operand.
4173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4174 %1
4175
4176 fnstsw word [A1 + IEMFPURESULTTWO.FSW] ; Capture the status word of the operation.
4177 fnclex ; Clear exception flags before each store (presumably so fstp cannot fault).
4178 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
4179 fnclex
4180 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
4181
4182 fninit ; Leave the host FPU reset.
4183 add xSP, 20h
4184 EPILOGUE_3_ARGS
4185ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
4186%endmacro
4187
4188IEMIMPL_FPU_R80_R80 fptan
4189IEMIMPL_FPU_R80_R80 fxtract
4190IEMIMPL_FPU_R80_R80 fsincos
4191
4192
4193
4194
4195;---------------------- SSE and MMX Operations ----------------------
4196
4197;; Currently empty placeholder macros. @todo what do we need to do for MMX?
4198%macro IEMIMPL_MMX_PROLOGUE 0
4199%endmacro
4200%macro IEMIMPL_MMX_EPILOGUE 0
4201%endmacro
4202
4203;; Currently empty placeholder macros. @todo what do we need to do for SSE?
4204%macro IEMIMPL_SSE_PROLOGUE 0
4205%endmacro
4206%macro IEMIMPL_SSE_EPILOGUE 0
4207%endmacro
4208
4209;; Currently empty placeholder macros. @todo what do we need to do for AVX?
4210%macro IEMIMPL_AVX_PROLOGUE 0
4211%endmacro
4212%macro IEMIMPL_AVX_EPILOGUE 0
4213%endmacro
4214
4215
4216;;
4217; Media instruction working on two full sized registers.
4218; Emits iemAImpl_<instruction>_u128 and, if requested, iemAImpl_<instruction>_u64.
4219; @param 1 The instruction
4220; @param 2 Whether there is an MMX variant (1) or not (0).
4221;
4222; @param A0 FPU context (fxsave). Not referenced by the code below.
4223; @param A1 Pointer to the first media register size operand (input/output).
4224; @param A2 Pointer to the second media register size operand (input).
4225;
4226; @todo r=aeichner Currently unused, can probably be removed.
4227;
4228%macro IEMIMPL_MEDIA_F2 2
4229%if %2 != 0
4230BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
4231 PROLOGUE_3_ARGS
4232 IEMIMPL_MMX_PROLOGUE
4233
4234 movq mm0, [A1] ; mm0 = first operand (also the destination).
4235 movq mm1, [A2] ; mm1 = second operand.
4236 %1 mm0, mm1
4237 movq [A1], mm0 ; Write the result back over the first operand.
4238
4239 IEMIMPL_MMX_EPILOGUE
4240 EPILOGUE_3_ARGS
4241ENDPROC iemAImpl_ %+ %1 %+ _u64
4242%endif
4243
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_SSE_PROLOGUE
4247
4248 movdqu xmm0, [A1] ; xmm0 = first operand (also the destination); unaligned load.
4249 movdqu xmm1, [A2] ; xmm1 = second operand.
4250 %1 xmm0, xmm1
4251 movdqu [A1], xmm0 ; Write the result back over the first operand.
4252
4253 IEMIMPL_SSE_EPILOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256%endmacro
4257
4258;;
4259; Media instruction working on two full sized registers, but no FXSAVE state argument.
4260; Emits iemAImpl_<instruction>_u128 and, if requested, iemAImpl_<instruction>_u64.
4261; @param 1 The instruction
4262; @param 2 Whether there is an MMX variant (1) or not (0).
4263;
4264; @param A0 Pointer to the first media register size operand (input/output).
4265; @param A1 Pointer to the second media register size operand (input).
4266;
4267%macro IEMIMPL_MEDIA_OPT_F2 2
4268%if %2 != 0
4269BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4270 PROLOGUE_2_ARGS
4271 IEMIMPL_MMX_PROLOGUE
4272
4273 movq mm0, [A0] ; mm0 = first operand (also the destination).
4274 movq mm1, [A1] ; mm1 = second operand.
4275 %1 mm0, mm1
4276 movq [A0], mm0 ; Write the result back over the first operand.
4277
4278 IEMIMPL_MMX_EPILOGUE
4279 EPILOGUE_2_ARGS
4280ENDPROC iemAImpl_ %+ %1 %+ _u64
4281%endif
4282
4283BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4284 PROLOGUE_2_ARGS
4285 IEMIMPL_SSE_PROLOGUE
4286
4287 movdqu xmm0, [A0] ; xmm0 = first operand (also the destination); unaligned load.
4288 movdqu xmm1, [A1] ; xmm1 = second operand.
4289 %1 xmm0, xmm1
4290 movdqu [A0], xmm0 ; Write the result back over the first operand.
4291
4292 IEMIMPL_SSE_EPILOGUE
4293 EPILOGUE_2_ARGS
4294ENDPROC iemAImpl_ %+ %1 %+ _u128
4295%endmacro
4296
4297IEMIMPL_MEDIA_OPT_F2 pshufb, 1
4298IEMIMPL_MEDIA_OPT_F2 pand, 1
4299IEMIMPL_MEDIA_OPT_F2 pandn, 1
4300IEMIMPL_MEDIA_OPT_F2 por, 1
4301IEMIMPL_MEDIA_OPT_F2 pxor, 1
4302IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
4303IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
4304IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
4305IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
4306IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
4307IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
4308IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
4309IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
4310IEMIMPL_MEDIA_OPT_F2 paddb, 1
4311IEMIMPL_MEDIA_OPT_F2 paddw, 1
4312IEMIMPL_MEDIA_OPT_F2 paddd, 1
4313IEMIMPL_MEDIA_OPT_F2 paddq, 1
4314IEMIMPL_MEDIA_OPT_F2 paddsb, 1
4315IEMIMPL_MEDIA_OPT_F2 paddsw, 1
4316IEMIMPL_MEDIA_OPT_F2 paddusb, 1
4317IEMIMPL_MEDIA_OPT_F2 paddusw, 1
4318IEMIMPL_MEDIA_OPT_F2 psubb, 1
4319IEMIMPL_MEDIA_OPT_F2 psubw, 1
4320IEMIMPL_MEDIA_OPT_F2 psubd, 1
4321IEMIMPL_MEDIA_OPT_F2 psubq, 1
4322IEMIMPL_MEDIA_OPT_F2 psubsb, 1
4323IEMIMPL_MEDIA_OPT_F2 psubsw, 1
4324IEMIMPL_MEDIA_OPT_F2 psubusb, 1
4325IEMIMPL_MEDIA_OPT_F2 psubusw, 1
4326IEMIMPL_MEDIA_OPT_F2 pmullw, 1
4327IEMIMPL_MEDIA_OPT_F2 pmulld, 0
4328IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
4329IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
4330IEMIMPL_MEDIA_OPT_F2 pminub, 1
4331IEMIMPL_MEDIA_OPT_F2 pminuw, 0
4332IEMIMPL_MEDIA_OPT_F2 pminud, 0
4333IEMIMPL_MEDIA_OPT_F2 pminsb, 0
4334IEMIMPL_MEDIA_OPT_F2 pminsw, 1
4335IEMIMPL_MEDIA_OPT_F2 pminsd, 0
4336IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
4337IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
4338IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
4339IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
4340IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
4341IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
4342IEMIMPL_MEDIA_OPT_F2 pabsb, 1
4343IEMIMPL_MEDIA_OPT_F2 pabsw, 1
4344IEMIMPL_MEDIA_OPT_F2 pabsd, 1
4345IEMIMPL_MEDIA_OPT_F2 psignb, 1
4346IEMIMPL_MEDIA_OPT_F2 psignw, 1
4347IEMIMPL_MEDIA_OPT_F2 psignd, 1
4348IEMIMPL_MEDIA_OPT_F2 phaddw, 1
4349IEMIMPL_MEDIA_OPT_F2 phaddd, 1
4350IEMIMPL_MEDIA_OPT_F2 phsubw, 1
4351IEMIMPL_MEDIA_OPT_F2 phsubd, 1
4352IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
4353IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
4354IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
4355IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
4356IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
4357IEMIMPL_MEDIA_OPT_F2 packsswb, 1
4358IEMIMPL_MEDIA_OPT_F2 packssdw, 1
4359IEMIMPL_MEDIA_OPT_F2 packuswb, 1
4360IEMIMPL_MEDIA_OPT_F2 packusdw, 0
4361IEMIMPL_MEDIA_OPT_F2 psllw, 1
4362IEMIMPL_MEDIA_OPT_F2 pslld, 1
4363IEMIMPL_MEDIA_OPT_F2 psllq, 1
4364IEMIMPL_MEDIA_OPT_F2 psrlw, 1
4365IEMIMPL_MEDIA_OPT_F2 psrld, 1
4366IEMIMPL_MEDIA_OPT_F2 psrlq, 1
4367IEMIMPL_MEDIA_OPT_F2 psraw, 1
4368IEMIMPL_MEDIA_OPT_F2 psrad, 1
4369IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
4370IEMIMPL_MEDIA_OPT_F2 pavgb, 1
4371IEMIMPL_MEDIA_OPT_F2 pavgw, 1
4372IEMIMPL_MEDIA_OPT_F2 psadbw, 1
4373IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
4374IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
4375IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
4376IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
4377IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
4378IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
4379IEMIMPL_MEDIA_OPT_F2 aesimc, 0
4380IEMIMPL_MEDIA_OPT_F2 aesenc, 0
4381IEMIMPL_MEDIA_OPT_F2 aesdec, 0
4382IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
4383IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
4384IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
4385IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
4386IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
4387IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
4388IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4389
4390
;;
; Template for media instructions that combine a full sized register with the
; lower half of a second register (punpckl*).
;
; Emits iemAImpl_<instr>_u128 and, when requested, the MMX worker
; iemAImpl_<instr>_u64.
;
; @param    1       The instruction mnemonic.
; @param    2       Non-zero to also emit the MMX (64-bit) worker, 0 for SSE only.
;
; @param    A0      Pointer to the full sized media register operand (input/output).
; @param    A1      Pointer to the second media register operand (input).  The
;                   instruction only consumes its lower half, but the worker
;                   loads a full register from it.
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; source operand
        movq    mm0, [A0]               ; destination operand
        %1      mm0, mm1
        movq    [A0], mm0               ; result replaces the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source operand
        movdqu  xmm0, [A0]              ; destination operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; result replaces the first operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4434
4435
;;
; Media instruction working on two half sized input registers (lower half) and a
; full sized destination register (vpunpckl*).
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4480
4481
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; The worker bodies are the same as for the lower-half template, so this simply
; forwards to IEMIMPL_MEDIA_F1L1; the separate name only documents the intent.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half template (previously IEMIMPL_MEDIA_F1L1 was
; used directly, leaving IEMIMPL_MEDIA_F1H1 unused).  The expansion is unchanged.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
4500
4501
;;
; Template for three-operand media instructions taking the upper halves of two
; source registers and producing a full sized destination (vpunpckh*).
;
; The workers are the same as for the lower-half variant, so this just
; expands IEMIMPL_MEDIA_F1L1L1.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first source register operand; only its upper half
;                   is consumed by the instruction, but it is loaded in full.
; @param    A2      Pointer to the second source register operand; only its upper half
;                   is consumed by the instruction, but it is loaded in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
        IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4522
4523
;
; Shufflers with evil 8-bit immediates.
;

;;
; PSHUFW (MMX) worker.
;
; The immediate must be encoded in the instruction, so the function contains one
; stub per possible immediate value (.imm0 thru .imm255) and dispatches to the
; right one via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the destination media register (output).
; @param    A1      Pointer to the source media register (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (was the no-op 'movq mm0, mm0'; now matches the SSE/AVX workers)
        ; NOTE(review): the last argument is presumably the byte size of one stub
        ; without the IBT prefix (4 byte pshufw + 1 byte ret) - confirm against the macro.
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pshufw_u64
4550
4551
;;
; Template for the SSE shuffle workers taking an 8-bit immediate
; (pshufhw, pshuflw, pshufd).
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the destination media register (output).
; @param    A1      Pointer to the source media register (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, xmm1              ; paranoia!
        movzx   A2, A2_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4581
4582
;;
; Template for the 256-bit AVX shuffle workers taking an 8-bit immediate
; (vpshufhw, vpshuflw, vpshufd).
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the destination media register (output).
; @param    A1      Pointer to the source media register (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; was IEMIMPL_SSE_PROLOGUE; this worker uses ymm registers

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4611
4612
;
; Shifts with evil 8-bit immediates.
;

;;
; Template for the MMX shift-by-immediate workers (iemAImpl_<instr>_imm_u64).
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the media register operand (input/output).
; @param    A1      The 8-bit shift count; only the low byte is used.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; value to shift
        movzx   A1, A1_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
        movq    [A0], mm0               ; shifted value replaces the operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4649
4650
;;
; Template for the SSE shift-by-immediate workers (iemAImpl_<instr>_imm_u128).
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the media register operand (input/output).
; @param    A1      The 8-bit shift count; only the low byte is used.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; value to shift
        movzx   A1, A1_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
        movdqu  [A0], xmm0              ; shifted value replaces the operand

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4685
4686
;
; Move byte mask.
;
; All three workers share the same contract:
;
; @param    A0      Pointer to the 64-bit destination receiving the mask (output).
; @param    A1      Pointer to the source media register (input).
;
; T0 is stored at its natural width; on 32-bit hosts the upper dword of the
; destination is zeroed explicitly.
;

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1                ; gather the byte sign bits
        mov      [A0], T0
 %ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; clear the high half of the 64-bit result
 %endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1               ; gather the byte sign bits
        mov      [A0], T0
 %ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; clear the high half of the 64-bit result
 %endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128

BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1              ; gather the byte sign bits
        mov       [A0], T0
 %ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0     ; clear the high half of the 64-bit result
 %endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4732
4733
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA).  Not
;                   referenced by the worker bodies.
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
; @todo r=aeichner Not used right now
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4773
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Always emits iemAImpl_<instr>_u128; iemAImpl_<instr>_u256 is emitted only when
; the second macro parameter is 1.
;
; @param    1       The instruction
; @param    2       Flag whether to add a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpshufb, 1
IEMIMPL_MEDIA_OPT_F3 vpand, 1
IEMIMPL_MEDIA_OPT_F3 vpminub, 1
IEMIMPL_MEDIA_OPT_F3 vpminuw, 1
IEMIMPL_MEDIA_OPT_F3 vpminud, 1
IEMIMPL_MEDIA_OPT_F3 vpminsb, 1
IEMIMPL_MEDIA_OPT_F3 vpminsw, 1
IEMIMPL_MEDIA_OPT_F3 vpminsd, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxub, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxuw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxud, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsb, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsd, 1
IEMIMPL_MEDIA_OPT_F3 vpandn, 1
IEMIMPL_MEDIA_OPT_F3 vpor, 1
IEMIMPL_MEDIA_OPT_F3 vpxor, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqb, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqw, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqd, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqq, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtb, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtw, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtd, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtq, 1
IEMIMPL_MEDIA_OPT_F3 vpaddb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddd, 1
IEMIMPL_MEDIA_OPT_F3 vpaddq, 1
IEMIMPL_MEDIA_OPT_F3 vpsubb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubw, 1
IEMIMPL_MEDIA_OPT_F3 vpsubd, 1
IEMIMPL_MEDIA_OPT_F3 vpsubq, 1
IEMIMPL_MEDIA_OPT_F3 vpacksswb, 1
IEMIMPL_MEDIA_OPT_F3 vpackssdw, 1
IEMIMPL_MEDIA_OPT_F3 vpackuswb, 1
IEMIMPL_MEDIA_OPT_F3 vpackusdw, 1
IEMIMPL_MEDIA_OPT_F3 vpmullw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulld, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhuw, 1
IEMIMPL_MEDIA_OPT_F3 vpavgb, 1
IEMIMPL_MEDIA_OPT_F3 vpavgw, 1
IEMIMPL_MEDIA_OPT_F3 vpsignb, 1
IEMIMPL_MEDIA_OPT_F3 vpsignw, 1
IEMIMPL_MEDIA_OPT_F3 vpsignd, 1
IEMIMPL_MEDIA_OPT_F3 vphaddw, 1
IEMIMPL_MEDIA_OPT_F3 vphaddd, 1
IEMIMPL_MEDIA_OPT_F3 vphsubw, 1
IEMIMPL_MEDIA_OPT_F3 vphsubd, 1
IEMIMPL_MEDIA_OPT_F3 vphaddsw, 1
IEMIMPL_MEDIA_OPT_F3 vphsubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw, 1
IEMIMPL_MEDIA_OPT_F3 vpsadbw, 1
IEMIMPL_MEDIA_OPT_F3 vpmuldq, 1
IEMIMPL_MEDIA_OPT_F3 vpmuludq, 1
IEMIMPL_MEDIA_OPT_F3 vunpcklps, 1
IEMIMPL_MEDIA_OPT_F3 vunpcklpd, 1
IEMIMPL_MEDIA_OPT_F3 vunpckhps, 1
IEMIMPL_MEDIA_OPT_F3 vunpckhpd, 1
IEMIMPL_MEDIA_OPT_F3 vpsubsb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpsubusb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubusw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddusb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddusw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddsb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddsw, 1
IEMIMPL_MEDIA_OPT_F3 vpermilps, 1
IEMIMPL_MEDIA_OPT_F3 vpermilpd, 1
IEMIMPL_MEDIA_OPT_F3 vpmaddwd, 1
IEMIMPL_MEDIA_OPT_F3 vpsrlvd, 1
IEMIMPL_MEDIA_OPT_F3 vpsrlvq, 1
IEMIMPL_MEDIA_OPT_F3 vpsravd, 1
IEMIMPL_MEDIA_OPT_F3 vpsllvd, 1
IEMIMPL_MEDIA_OPT_F3 vpsllvq, 1

; 128-bit only workers (no _u256 variant is generated).
IEMIMPL_MEDIA_OPT_F3 vaesenc, 0
IEMIMPL_MEDIA_OPT_F3 vaesenclast, 0
IEMIMPL_MEDIA_OPT_F3 vaesdec, 0
IEMIMPL_MEDIA_OPT_F3 vaesdeclast, 0
4897
4898
;;
; VAESIMC instruction.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the second media register size operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vaesimc_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded instruction, so use the AVX bracket (was SSE)

        ; The previous load of [A0] into xmm0 was dead: vaesimc overwrites xmm0.
        vmovdqu xmm1, [A1]              ; VEX load; avoids mixing legacy SSE and VEX encodings
        vaesimc xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vaesimc_u128
4917
4918
;;
; VAESKEYGENASSIST instruction.
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the second media register size operand (input).
; @param    A2      8-bit immediate for the round constant.
;
BEGINPROC_FASTCALL iemAImpl_vaeskeygenassist_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        ; The previous load of [A0] into xmm0 was dead: every stub overwrites xmm0.
        vmovdqu xmm1, [A1]              ; VEX load; avoids mixing legacy SSE and VEX encodings
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vaeskeygenassist xmm0, xmm1, bImm
        ret
        int3                            ; keeps the per-stub size passed to the jump table macro unchanged
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vaeskeygenassist_u128
4949
4950
;;
; VPERMQ instruction.
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the second media register size operand (input).
; @param    A2      8-bit immediate selecting the source quadwords (only the
;                   low byte is used).
;
BEGINPROC_FASTCALL iemAImpl_vpermq_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], ymm0              ; ymm0 was produced by the selected stub

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermq  ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermq_u256
4980
4981
;;
; VPERMPD instruction.
;
; One stub per immediate value is emitted after the function body and selected
; at runtime through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the second media register size operand (input).
; @param    A2      8-bit immediate selecting the source double precision
;                   elements (only the low byte is used).
;
BEGINPROC_FASTCALL iemAImpl_vpermpd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], ymm0              ; ymm0 was produced by the selected stub

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermpd ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermpd_u256
5011
5012
;;
; VPERMPS worker (256-bit).
;
; @param    A0      Pointer to the destination media register (output).
; @param    A1      Pointer to the operand passed as the first source of vpermps (input).
; @param    A2      Pointer to the operand passed as the second source of vpermps (input).
;
BEGINPROC_FASTCALL iemAImpl_vpermps_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A2]              ; second source
        vmovdqu ymm0, [A1]              ; first source, also receives the result
        vpermps ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermps_u256
5032
5033
;;
; VPERMD worker (256-bit).
;
; @param    A0      Pointer to the destination media register (output).
; @param    A1      Pointer to the operand passed as the first source of vpermd (input).
; @param    A2      Pointer to the operand passed as the second source of vpermd (input).
;
BEGINPROC_FASTCALL iemAImpl_vpermd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A2]              ; second source
        vmovdqu ymm0, [A1]              ; first source, also receives the result
        vpermd  ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermd_u256
5053
5054
;;
; Media instruction working on one full sized source register, one full sized destination
; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
; of either 16, 32, or 64, it acts like the max shift size)
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.  In both, the second
; source is loaded as 128 bits into xmm1.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_SHIFT_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu xmm1, [A2]              ; the shift count operand is always an xmm register
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
5104
5105
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Always emits iemAImpl_<instr>_u128; iemAImpl_<instr>_u256 is emitted only when
; the second macro parameter is 1.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
5148
5149
;
; The SSE 4.2 crc32
;
; Each worker reads the 32-bit value at [A0], runs crc32 on it with the source
; operand and stores the 32-bit result back to [A0].
;
; @param    A0      Pointer to the 32-bit destination (input/output).
; @param    A1      The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_8
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8

BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_16
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16

BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_32
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0, A1                  ; 64-bit source form; only the low 32 bits are stored below
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
5197
5198
;
; PTEST (SSE 4.1)
;
; Runs ptest on the two operands and hands the resulting flags to
; IEM_SAVE_FLAGS_OLD; no media register is written.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
5218
;;
; VPTEST (AVX), 256-bit.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; was IEMIMPL_SSE_PROLOGUE; this worker uses ymm registers

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
5231
5232
;;
; Template for the vtestp{s,d} workers.
;
; Emits iemAImpl_<instr>_u128 and iemAImpl_<instr>_u256.  Each runs the
; instruction on the two operands and hands the resulting flags to
; IEM_SAVE_FLAGS_OLD; no media register is written.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
%macro IEMIMPL_VTESTP_S_D 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm1, [A1]
        vmovdqu xmm0, [A0]
        %1      xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, [A0]
        %1      ymm0, ymm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_VTESTP_S_D vtestps
IEMIMPL_VTESTP_S_D vtestpd
5271
5272
;;
; Template for the [v]pmov{s,z}x* instructions
;
; Emits three workers per instruction: the SSE iemAImpl_<instr>_u128 and the
; AVX iemAImpl_v<instr>_u128 / iemAImpl_v<instr>_u256.
;
; @param    1       The instruction (without the 'v' prefix).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      The source operand value (input) for the two _u128 workers;
;                   the _u256 worker instead loads 128 bits from the address in A1.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; legacy encoding (was vmovdqu): the SSE worker must not need AVX

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; NOTE(review): left as legacy movd; whether the assembler accepts vmovd
        ; with a 64-bit A1 has not been verified here.
        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; VEX load (was movdqu); ymm0 is fully overwritten below
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
5332
5333
;;
; Loads a working MXCSR derived from the guest value: the guest's FZ, rounding
; control and DAZ bits are kept, and all exception mask bits are set.
;
; The host MXCSR is saved in a 4 byte stack slot which is left allocated when
; the macro ends (xSP is 4 lower than on entry); SSE_AVX_ST_MXCSR reloads it
; from there and releases the slot.
;
; @uses     4 bytes of stack to save the original value (plus 4 more
;           temporarily while loading the new value), T0.
; @param    1       Expression giving the register holding the guest's MXCSR.
;
%macro SSE_AVX_LD_MXCSR 1
        ; Build the value to load first (reads %1, writes only T0).
        mov     T0_32, %1
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; Save the current MXCSR in the slot that stays on the stack.
        sub     xSP, 4
        stmxcsr [xSP]

        ; ldmxcsr needs a memory operand, so go through a temporary slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5353
5354
;;
; Restores the SSE MXCSR register with the original value.
;
; Reads the current MXCSR, merges its exception flags into the guest value
; given in %2, returns the result in %1, and then reloads MXCSR from the 4 byte
; stack slot at [xSP] (the one SSE_AVX_LD_MXCSR leaves behind).
;
; @uses     4 bytes of stack temporarily to read MXCSR; T1, T2 (and %1).
; @param    1       Expression giving the register to return the new guest's MXCSR value.
; @param    2       Expression giving the register holding original guest's MXCSR value.
;
; @note     Restores the stack pointer (releases the 4 byte save slot).
; @note     The branch target is a macro-local label (%%), so the macro does not
;           add a name to the enclosing procedure's local label namespace and
;           can be expanded more than once per procedure.
;
%macro SSE_AVX_ST_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]
        mov     %1, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        and     %1, X86_MXCSR_XCPT_FLAGS
        ;
        ; If PE is set together with OE/UE and neither are masked
        ; PE needs to be cleared because on real hardware
        ; an exception is generated with only OE/UE being set,
        ; but because we mask all exceptions PE will get set as well.
        ;
        mov     T2_32, %1
        and     T2_32, X86_MXCSR_OE | X86_MXCSR_UE     ; T2 = OE/UE flags just raised
        mov     T1_32, %2
        and     T1_32, X86_MXCSR_OM | X86_MXCSR_UM     ; T1 = guest OM/UM mask bits ...
        shr     T1_32, X86_MXCSR_XCPT_MASK_SHIFT       ; ... moved down to the flag bit positions
        not     T1_32                                  ; set bit = exception is unmasked
        and     T2_32, T1_32                           ; raised AND unmasked
        test    T2_32, X86_MXCSR_OE | X86_MXCSR_UE
        jz      %%excp_masked
        btr     %1, X86_MXCSR_PE_BIT
%%excp_masked:
        or      %1, %2

        ldmxcsr [xSP]                   ; reload the MXCSR saved by SSE_AVX_LD_MXCSR
        add     xSP, 4
%endmacro
5393
5394
;;
; Floating point instruction working on two full sized registers.
;
; Always emits the SSE worker iemAImpl_<instr>_u128.  Depending on the second
; macro parameter it also emits iemAImpl_v<instr>_u128 and iemAImpl_v<instr>_u256.
;
; @param    1       The instruction
; @param    2       Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @returns  R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second media register size operand (input).
;
; NOTE(review): the fastcall byte count is 12 here although four arguments are
; taken (the _r32 template passes 16) - left unchanged, confirm against the C prototypes.
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
        ; AVX variants using the three operand form.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
        ; AVX variants using the two operand form.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
5519
5520
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Emits two workers per instruction: iemAImpl_<insn>_u128_r32 using the legacy
; SSE encoding and iemAImpl_v<insn>_u128_r32 using the VEX encoding (three
; operand form, the first source register doubles as the destination).
;
; @param    1       The instruction
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (only read here; the result goes to A1).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu      xmm0, [A2]
        movd        xmm1, [A3]                  ; only the low 32 bits are fetched
        %1          xmm0, xmm1
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu     xmm0, [A2]
        vmovd       xmm1, [A3]                  ; only the low 32 bits are fetched
        v %+ %1     xmm0, xmm0, xmm1
        vmovdqu     [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE                    ; pairs with IEMIMPL_AVX_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
5574
5575
;;
; Scalar double precision worker template: a full sized first operand combined
; with a 64-bit second operand.
;
; Instantiates iemAImpl_<insn>_u128_r64 (legacy SSE encoding) and
; iemAImpl_v<insn>_u128_r64 (VEX encoding).
;
; @param    1       The instruction
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq        xmm1, [A3]                  ; the 64-bit scalar operand
        movdqu      xmm0, [A2]                  ; the full width operand
        %1          xmm0, xmm1
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovq       xmm1, [A3]                  ; the 64-bit scalar operand
        vmovdqu     xmm0, [A2]                  ; the full width operand
        v %+ %1     xmm0, xmm0, xmm1
        vmovdqu     [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5627
5628
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; Emits three workers: iemAImpl_<insn>_u128 (legacy SSE encoding),
; iemAImpl_v<insn>_u128_u128 (VEX, 128-bit source) and
; iemAImpl_v<insn>_u128_u256 (VEX, 256-bit variant).
;
; 1       The instruction name.
; 2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).  This is the
;                   conversion source for the VEX workers.
; @param    A3      Pointer to the second media register size operand (input).  Only read by
;                   the legacy SSE worker.
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu      xmm0, [A2]
        movdqu      xmm1, [A3]
        %1          xmm0, xmm1
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu     xmm1, [A2]
        v %+ %1     xmm0, xmm1
        vmovdqu     [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_v %+ %1 %+ _u128_u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

 %if %2 == 0
        ; Narrowing conversion: the source is a full 256-bit register, so all of
        ; it must be loaded and the ymm source form of the instruction used.
        ; (Loading just xmm1 and using the xmm form only converted the low half.)
        vmovdqu     ymm1, [A2]
        v %+ %1     xmm0, ymm1
 %else
        ; Widening conversion: 128-bit source, 256-bit result.
        vmovdqu     xmm1, [A2]
        v %+ %1     ymm0, xmm1
 %endif
        ; NOTE(review): 32 bytes are stored even when the result is 128-bit (the
        ; VEX encoded instruction zeroes the upper half of ymm0) - confirm that
        ; every caller passes a 256-bit destination buffer.
        vmovdqu     [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_v %+ %1 %+ _u128_u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps,  0
IEMIMPL_CVT_F2 cvttpd2dq, 0
IEMIMPL_CVT_F2 cvtpd2dq,  0
5693
;IEMIMPL_CVT_F2 cvtps2pd, 1 - inefficient.

;;
; cvtps2pd instruction, hand written instead of going through IEMIMPL_CVT_F2.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the source operand (input), used directly as memory operand.
;
BEGINPROC_FASTCALL iemAImpl_cvtps2pd_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtps2pd    xmm0, [A2]
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtps2pd_u128
5708
5709
;;
; vcvtps2pd instruction - 128-bit variant (64-bit source, 128-bit result).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u128_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vcvtps2pd   xmm0, qword [A2]            ; the source is read straight from memory
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_vcvtps2pd_u128_u64
5730
5731
;;
; vcvtps2pd instruction - 256-bit variant (128-bit source, 256-bit result).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u256_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; The source must end up in the register the conversion reads.  The old
        ; code loaded xmm0 and then converted the never initialised xmm1.
        vmovdqu     xmm1, [A2]
        vcvtps2pd   ymm0, xmm1
        vmovdqu     [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_vcvtps2pd_u256_u128
5753
5754
;;
; vcvtdq2pd instruction - 128-bit variant (64-bit source, 128-bit result).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u128_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vcvtdq2pd   xmm0, qword [A2]            ; the source is read straight from memory
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_vcvtdq2pd_u128_u64
5775
5776
;;
; vcvtdq2pd instruction - 256-bit variant (128-bit source, 256-bit result).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u256_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; The source must end up in the register the conversion reads.  The old
        ; code loaded xmm0 and then converted the never initialised xmm1.
        vmovdqu     xmm1, [A2]
        vcvtdq2pd   ymm0, xmm1
        vmovdqu     [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_vcvtdq2pd_u256_u128
5798
5799
;;
; shufps instruction with an 8-bit immediate.
;
; The immediate can only be encoded as a constant in the instruction, so there
; is one small stub per possible value below and the worker calls the one
; matching A2.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm0, [A0]
        movdqu      xmm1, [A1]
        movzx       A2, A2_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu      [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 stubs.  Do not change their layout without adjusting the size
        ; argument given to IEMIMPL_CALL_JUMP_TABLE_TARGET above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_shufps_u128
5830
5831
;;
; shufpd instruction with an 8-bit immediate.
;
; Same scheme as for shufps: one stub per possible immediate value, selected
; at runtime via A2.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm0, [A0]
        movdqu      xmm1, [A1]
        movzx       A2, A2_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu      [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 stubs.  Unlike the shufps ones these carry no int3 filler.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_shufpd_u128
5861
5862
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Emits a 128-bit and a 256-bit worker, each followed by its own set of 256
; per-immediate stubs.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu      xmm0, [A1]
        movdqu      xmm1, [A2]
        movzx       A3, A3_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        movdqu      [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stubs for the 128-bit form, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu     ymm0, [A1]
        vmovdqu     ymm1, [A2]
        movzx       A3, A3_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        vmovdqu     [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stubs for the 256-bit form, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5923
5924
;;
; One of the [p]blendv{b,ps,pd} variants
;
; These instructions take their mask implicitly from XMM0, which is why the
; mask operand is loaded into that register here.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm0, [A2]                  ; This is implicit
        movdqu      xmm1, [A0]
        movdqu      xmm2, [A1]                  ; @todo Do I need to save the original value here first?
        %1          xmm1, xmm2
        movdqu      [A0], xmm1

        IEMIMPL_SSE_EPILOGUE                    ; pairs with IEMIMPL_SSE_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5953
5954
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits a 128-bit and a 256-bit worker.  The VEX forms take the mask as an
; explicit fourth register operand.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu     xmm0, [A1]
        vmovdqu     xmm1, [A2]
        vmovdqu     xmm2, [A3]                  ; the mask
        %1          xmm0, xmm0, xmm1, xmm2
        vmovdqu     [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with IEMIMPL_AVX_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu     ymm0, [A1]
        vmovdqu     ymm1, [A2]
        vmovdqu     ymm2, [A3]                  ; the mask
        %1          ymm0, ymm0, ymm1, ymm2
        vmovdqu     [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pairs with IEMIMPL_AVX_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5997
5998
;;
; palignr mm1, mm2/m64 instruction.
;
; The MMX form; the immediate is dispatched through 256 per-value stubs.
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      The second register sized operand (input), passed by value.
; @param    A2      The 8-bit immediate.
;
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq        mm0, [A0]
        movq        mm1, A1
        movzx       A2, A2_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movq        [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; One stub per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr     mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_palignr_u64
6027
6028
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Each stub consists of the instruction followed by a ret and an int3.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm0, [A0]
        movdqu      xmm1, [A1]
        movzx       A2, A2_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        movdqu      [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; One stub per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
6073
6074
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Parameters 2 and 3 allow leaving out the worker for an operand size the
; instruction does not exist for.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param    3       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu      xmm0, [A1]
        movdqu      xmm1, [A2]
        movzx       A3, A3_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu      [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stubs for the 128-bit form, one per immediate value.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          xmm0, xmm0, xmm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu     ymm0, [A1]
        vmovdqu     ymm1, [A2]
        movzx       A3, A3_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu     [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stubs for the 256-bit form, one per immediate value.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          ymm0, ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw,   1, 1
6152
6153
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, imm8.
; The size of the instruction differs between the users of this template, so
; it is supplied via parameter 4 and handed on to the jump table helper.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param    3       Whether the instruction has a 256-bit variant (1) or not (0).
; @param    4       The number of bytes taken up by a single instance of the instruction.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu      xmm1, [A1]
        movzx       A2, A2_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        movdqu      [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stubs for the 128-bit form, one per immediate value.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          xmm0, xmm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu     ymm1, [A1]
        movzx       A2, A2_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        vmovdqu     [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stubs for the 256-bit form, one per immediate value.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq,   1, 1, 7
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq,   1, 1, 7
6224
6225
;;
; Source operand layouts used by the pcmp{i,e}str{i,m} workers below.
; Need to move this as well somewhere better?
;
struc IEMPCMPISTRXSRC
    .uSrc1      resd 4                          ; first 128-bit source operand
    .uSrc2      resd 4                          ; second 128-bit source operand
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1      resd 4                          ; first 128-bit source operand
    .uSrc2      resd 4                          ; second 128-bit source operand
    .u64Rax     resq 1                          ; value loaded into xAX before the instruction
    .u64Rdx     resq 1                          ; value loaded into xDX before the instruction
endstruc
6240
;;
; The pcmpistri/vcmpistri instruction.
;
; The instruction leaves its result in ECX, which is then handed back as the
; return value of the worker.
;
; @param    1       The instruction name
;
; @return   R0_32   The new ECX value.
; @param    A0      Pointer to the EFLAGS register.
; @param    A1      Pointer to the first operand (input).
; @param    A2      Pointer to the second operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_V_CMPISTRI 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm0, [A1]
        movdqu      xmm1, [A2]
        movzx       A3, A3_8                    ; must clear top bits
        mov         T2, A0                      ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8

        IEM_SAVE_FLAGS_OLD T2, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov         R0_32, ecx                  ; the index produced by the instruction

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; One stub per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_V_CMPISTRI pcmpistri
IEMIMPL_MEDIA_V_CMPISTRI vpcmpistri
6283
6284
6285;;
6286; The pcmpestri instruction.
6287;
6288; This is a plain procedure (not a template), so there are no macro parameters.
6289;
6290; @param A0 Pointer to the ECX register to store the result to (output).
6291; @param A1 Pointer to the EFLAGS register.
6292; @param A2 Pointer to the structure containing the source operands (input, IEMPCMPESTRXSRC layout).
6293; @param A3 The 8-bit immediate
6294;
6295BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
6296 PROLOGUE_4_ARGS
6297 IEMIMPL_SSE_PROLOGUE
6298
6299 movzx A3, A3_8 ; must clear top bits
6300 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6301 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6302 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
; Get the stub address into T1 first; the call itself is done by hand further
; down because xAX and xDX have to be set up in between.
6303 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6304 push xDX ; xDX can be A1 or A2 depending on the calling convention
6305 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6306 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx] ; loaded last, as A2 may be xDX itself (see the push above)
6307 IBT_NOTRACK
6308 call T1
6309
6310 pop xDX ; get back the argument register saved above
6311 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6312 mov [T2], ecx ; store the result via the A0 copy made before the call
6313
6314 IEMIMPL_SSE_EPILOGUE
6315 EPILOGUE_4_ARGS
; One stub per immediate value.  No int3 filler here, the REX.W prefix byte
; takes its place.
6316 %assign bImm 0
6317 %rep 256
6318.imm %+ bImm:
6319 IBT_ENDBRxx_WITHOUT_NOTRACK
6320 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6321 pcmpestri xmm0, xmm1, bImm
6322 ret
6323 %assign bImm bImm + 1
6324 %endrep
6325.immEnd:
6326ENDPROC iemAImpl_pcmpestri_u128
6327
6328
6329;;
6330; The vpcmpestri instruction.
6331;
6332; This is a plain procedure (not a template), so there are no macro parameters.
6333;
6334; @param A0 Pointer to the ECX register to store the result to (output).
6335; @param A1 Pointer to the EFLAGS register.
6336; @param A2 Pointer to the structure containing the source operands (input, IEMPCMPESTRXSRC layout).
6337; @param A3 The 8-bit immediate
6338;
6339BEGINPROC_FASTCALL iemAImpl_vpcmpestri_u128, 16
6340 PROLOGUE_4_ARGS
6341 IEMIMPL_SSE_PROLOGUE
6342
6343 movzx A3, A3_8 ; must clear top bits
6344 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6345 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6346 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
; Get the stub address into T1 first; the call itself is done by hand further
; down because xAX and xDX have to be set up in between.
6347 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6348 push xDX ; xDX can be A1 or A2 depending on the calling convention
6349 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6350 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx] ; loaded last, as A2 may be xDX itself (see the push above)
6351 IBT_NOTRACK
6352 call T1
6353
6354 pop xDX ; get back the argument register saved above
6355 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6356 mov [T2], ecx ; store the result via the A0 copy made before the call
6357
6358 IEMIMPL_SSE_EPILOGUE
6359 EPILOGUE_4_ARGS
; One stub per immediate value.  The instruction is emitted as raw bytes with
; the immediate as the last one.
6360 %assign bImm 0
6361 %rep 256
6362.imm %+ bImm:
6363 IBT_ENDBRxx_WITHOUT_NOTRACK
6364 db 0xc4, 0xe3, 0xf9, 0x61, 0xc1, bImm ; vpcmpestri xmm0, xmm1, bImm with VEX.W set
6365 ret
6366 int3
6367 %assign bImm bImm + 1
6368 %endrep
6369.immEnd:
6370ENDPROC iemAImpl_vpcmpestri_u128
6371
6372
;;
; The pcmpistrm/vpcmpistrm instruction template.
;
; The instruction leaves its result in XMM0, which is why the two sources are
; placed in XMM1 and XMM2.
;
; @param    1       The instruction name
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_V_CMPISTRM 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu      xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        movzx       A3, A3_8                    ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8

        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu      [A0], xmm0                  ; the mask produced by the instruction

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; One stub per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1          xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_V_CMPISTRM pcmpistrm
IEMIMPL_MEDIA_V_CMPISTRM vpcmpistrm
6413
6414
6415;;
6416; The pcmpestrm instruction.
6417;
6418; @param A0 Pointer to the XMM0 register to store the result to (output).
6419; @param A1 Pointer to the EFLAGS register.
6420; @param A2 Pointer to the structure containing the source operands (input, IEMPCMPESTRXSRC layout).
6421; @param A3 The 8-bit immediate
6422;
6423BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
6424 PROLOGUE_4_ARGS
6425 IEMIMPL_SSE_PROLOGUE
6426
6427 movzx A3, A3_8 ; must clear top bits
6428 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6429 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
; Get the stub address into T1 first; the call itself is done by hand further
; down because xAX and xDX have to be set up in between.
6430 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6431 push xDX ; xDX can be A1 or A2 depending on the calling convention
6432 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6433 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx] ; loaded last, as A2 may be xDX itself (see the push above)
6434 IBT_NOTRACK
6435 call T1
6436
6437 pop xDX ; get back the argument register saved above
6438 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6439 movdqu [A0], xmm0 ; the stubs use xmm1/xmm2 as sources, xmm0 is what gets returned
6440
6441 IEMIMPL_SSE_EPILOGUE
6442 EPILOGUE_4_ARGS
; One stub per immediate value.  No int3 filler here, the REX.W prefix byte
; takes its place.
6443 %assign bImm 0
6444 %rep 256
6445.imm %+ bImm:
6446 IBT_ENDBRxx_WITHOUT_NOTRACK
6447 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6448 pcmpestrm xmm1, xmm2, bImm
6449 ret
6450 %assign bImm bImm + 1
6451 %endrep
6452.immEnd:
6453ENDPROC iemAImpl_pcmpestrm_u128
6454
6455
6456;;
6457; The vpcmpestrm instruction.
6458;
6459; @param A0 Pointer to the XMM0 register to store the result to (output).
6460; @param A1 Pointer to the EFLAGS register.
6461; @param A2 Pointer to the structure containing the source operands (input, IEMPCMPESTRXSRC layout).
6462; @param A3 The 8-bit immediate
6463;
6464BEGINPROC_FASTCALL iemAImpl_vpcmpestrm_u128, 16
6465 PROLOGUE_4_ARGS
6466 IEMIMPL_SSE_PROLOGUE
6467
6468 movzx A3, A3_8 ; must clear top bits
6469 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6470 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
; Get the stub address into T1 first; the call itself is done by hand further
; down because xAX and xDX have to be set up in between.
6471 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6472 push xDX ; xDX can be A1 or A2 depending on the calling convention
6473 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6474 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx] ; loaded last, as A2 may be xDX itself (see the push above)
6475 IBT_NOTRACK
6476 call T1
6477
6478 pop xDX ; get back the argument register saved above
6479 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6480 movdqu [A0], xmm0 ; the stubs use xmm1/xmm2 as sources, xmm0 is what gets returned
6481
6482 IEMIMPL_SSE_EPILOGUE
6483 EPILOGUE_4_ARGS
; One stub per immediate value.  The instruction is emitted as raw bytes with
; the immediate as the last one.
6484 %assign bImm 0
6485 %rep 256
6486.imm %+ bImm:
6487 IBT_ENDBRxx_WITHOUT_NOTRACK
6488 db 0xc4, 0xe3, 0xf9, 0x60, 0xca, bImm ; vpcmpestrm xmm1, xmm2, bImm with VEX.W set
6489 ret
6490 int3
6491 %assign bImm bImm + 1
6492 %endrep
6493.immEnd:
6494ENDPROC iemAImpl_vpcmpestrm_u128
6495
6496
;;
; movmskp{s,d} SSE instruction template
;
; Emits the SSE worker plus a 128-bit and a 256-bit AVX worker.  Only the low
; byte of the extracted mask is written to the output.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm0, [A1]
        %1          T0, xmm0
        mov         byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu      xmm0, [A1]
        %2          T0, xmm0
        mov         byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC             iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu     ymm0, [A1]
        %2          T0, ymm0
        mov         byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC             iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6546
6547
;;
; Template for [v]cvttss2si/[v]cvtss2si instructions.
;
; Emits one worker producing a 32-bit integer (_i32_r32) and one producing a
; 64-bit integer (_i64_r32).  Parameter 2 picks the IEMIMPL_xxx_PROLOGUE and
; IEMIMPL_xxx_EPILOGUE pair to use.
;
; @param    1       Instruction name.
; @param    2       AVX or SSE
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
%macro IEMIMPL_MEDIA_V_CVTXSS2SI 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1          T0_32, [A2]
        mov         dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _i32_r32


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1          T0, [A2]
        mov         qword [A1], T0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _i64_r32
%endmacro

IEMIMPL_MEDIA_V_CVTXSS2SI cvttss2si,  SSE
IEMIMPL_MEDIA_V_CVTXSS2SI vcvttss2si, AVX
IEMIMPL_MEDIA_V_CVTXSS2SI cvtss2si,   SSE
IEMIMPL_MEDIA_V_CVTXSS2SI vcvtss2si,  AVX
6592
6593
;;
; Template for [v]cvttsd2si/[v]cvtsd2si instructions.
;
; Emits one worker producing a 32-bit integer (_i32_r64) and one producing a
; 64-bit integer (_i64_r64).  Parameter 2 picks the IEMIMPL_xxx_PROLOGUE and
; IEMIMPL_xxx_EPILOGUE pair to use.
;
; @param    1       Instruction name.
; @param    2       AVX or SSE
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
%macro IEMIMPL_MEDIA_V_CVTXSD2SI 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1          T0_32, [A2]
        mov         dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _i32_r64


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1          T0, [A2]
        mov         qword [A1], T0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _i64_r64
%endmacro

IEMIMPL_MEDIA_V_CVTXSD2SI cvttsd2si,  SSE
IEMIMPL_MEDIA_V_CVTXSD2SI vcvttsd2si, AVX
IEMIMPL_MEDIA_V_CVTXSD2SI cvtsd2si,   SSE
IEMIMPL_MEDIA_V_CVTXSD2SI vcvtsd2si,  AVX
6638
6639
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts the 32-bit integer at A2 and stores the resulting single precision
; value (the low dword of xmm0) at A1.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2ss    xmm0, dword [A2]
        movd        dword [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2ss_r32_i32
6660
6661
;;
; vcvtsi2ss instruction - 32-bit variant.
;
; Loads the full 128-bit first source from [A2], replaces its low dword with
; the converted 32-bit signed integer at [A3] and writes all 128 bits to [A1].
;
; This worker takes four arguments (it reads A3), so it uses the four-argument
; prologue/epilogue pair.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu      xmm0, [A2]
        vcvtsi2ss   xmm0, xmm0, dword [A3]
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2ss_u128_i32
6684
6685
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts the 64-bit signed integer at [A2] to single precision and stores
; the 32-bit result at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2ss    xmm0, qword [A2]
        movd        [A1], xmm0                  ; Only the low dword is the result.

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2ss_r32_i64
6706
6707
;;
; vcvtsi2ss instruction - 64-bit variant.
;
; Loads the full 128-bit first source from [A2], replaces its low dword with
; the converted 64-bit signed integer at [A3] and writes all 128 bits to [A1].
;
; This worker takes four arguments (it reads A3), so it uses the four-argument
; prologue/epilogue pair.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu      xmm0, [A2]
        vcvtsi2ss   xmm0, xmm0, qword [A3]
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2ss_u128_i64
6730
6731
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts the 32-bit signed integer at [A2] to double precision and stores
; the 64-bit result at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2sd    xmm0, dword [A2]
        movq        qword [A1], xmm0            ; Only the low qword is the result.

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2sd_r64_i32
6752
6753
;;
; vcvtsi2sd instruction - 32-bit variant.
;
; Loads the full 128-bit first source from [A2], replaces its low qword with
; the converted 32-bit signed integer at [A3] and writes all 128 bits to [A1].
;
; This worker takes four arguments (it reads A3), so it uses the four-argument
; prologue/epilogue pair.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu      xmm0, [A2]
        vcvtsi2sd   xmm0, xmm0, dword [A3]
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2sd_u128_i32
6776
6777
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts the 64-bit signed integer at [A2] to double precision and stores
; the 64-bit result at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2sd    xmm0, qword [A2]
        movq        qword [A1], xmm0            ; Only the low qword is the result.

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2sd_r64_i64
6798
6799
;;
; vcvtsi2sd instruction - 64-bit variant.
;
; Loads the full 128-bit first source from [A2], replaces its low qword with
; the converted 64-bit signed integer at [A3] and writes all 128 bits to [A1].
;
; This worker takes four arguments (it reads A3), so it uses the four-argument
; prologue/epilogue pair.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu      xmm0, [A2]
        vcvtsi2sd   xmm0, xmm0, qword [A3]
        movdqu      [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2sd_u128_i64
6822
6823
;;
; UCOMISS (SSE)
;
; Compares the two single-precision values passed by value in A2_32 and A3_32
; and records the resulting ZF, PF and CF in the EFLAGS value at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd        xmm1, A3_32                 ; Second source.
        movd        xmm0, A2_32                 ; First source.
        ucomiss     xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ucomiss_u128
6847
;;
; VUCOMISS (AVX)
;
; VEX-encoded counterpart of iemAImpl_ucomiss_u128 with the same interface.
;
; The four-argument prologue must be paired with the four-argument epilogue;
; the AVX prologue/epilogue pair is used as for the other VEX-encoded workers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd        xmm0, A2_32
        movd        xmm1, A3_32
        vucomiss    xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vucomiss_u128
6862
6863
;;
; UCOMISD (SSE)
;
; Compares the two double-precision values passed by value in A2 and A3 and
; records the resulting ZF, PF and CF in the EFLAGS value at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq        xmm1, A3                    ; Second source.
        movq        xmm0, A2                    ; First source.
        ucomisd     xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ucomisd_u128
6887
;;
; VUCOMISD (AVX)
;
; VEX-encoded counterpart of iemAImpl_ucomisd_u128 with the same interface.
; Uses the AVX prologue/epilogue pair like the other VEX-encoded workers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq        xmm0, A2
        movq        xmm1, A3
        vucomisd    xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vucomisd_u128
6902
;;
; COMISS (SSE)
;
; Compares the two single-precision values passed by value in A2_32 and A3_32
; and records the resulting ZF, PF and CF in the EFLAGS value at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd        xmm1, A3_32                 ; Second source.
        movd        xmm0, A2_32                 ; First source.
        comiss      xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_comiss_u128
6926
;;
; VCOMISS (AVX)
;
; VEX-encoded counterpart of iemAImpl_comiss_u128 with the same interface.
; Uses the AVX prologue/epilogue pair like the other VEX-encoded workers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd        xmm0, A2_32
        movd        xmm1, A3_32
        vcomiss     xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcomiss_u128
6941
6942
;;
; COMISD (SSE)
;
; Compares the two double-precision values passed by value in A2 and A3 and
; records the resulting ZF, PF and CF in the EFLAGS value at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq        xmm1, A3                    ; Second source.
        movq        xmm0, A2                    ; First source.
        comisd      xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_comisd_u128
6966
;;
; VCOMISD (AVX)
;
; VEX-encoded counterpart of iemAImpl_comisd_u128 with the same interface.
; Uses the AVX prologue/epilogue pair like the other VEX-encoded workers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq        xmm0, A2
        movq        xmm1, A3
        vcomisd     xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcomisd_u128
6981
6982
;;
; Source operand pairs handed to the two-input media workers below via A2.
;
; IEMMEDIAF2XMMSRC holds two 128-bit values, IEMMEDIAF2YMMSRC two 256-bit ones.
;
; @todo Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1      resb 16                         ; First source,  offset 0.
    .uSrc2      resb 16                         ; Second source, offset 16.
endstruc


struc IEMMEDIAF2YMMSRC
    .uSrc1      resb 32                         ; First source,  offset 0.
    .uSrc2      resb 32                         ; Second source, offset 32.
endstruc
6996
6997
;;
; SSE/AVX instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
;    vxxx xmm1, xmm2, xmm3, imm8.
; and we need to load and save the MXCSR register.
;
; The immediate has to be encoded in the instruction, so every worker is
; followed by 256 small stubs (.imm0 thru .imm255), each consisting of the
; instruction with one immediate value plus a ret.  The worker dispatches into
; that table via IEMIMPL_CALL_JUMP_TABLE_TARGET, passing the stub size.
;
; The VEX-encoded workers use the AVX prologue/epilogue pair like the other
; VEX-encoded workers.
;
; @param    1       The instruction name.
; @param    2       Flag whether this instruction has a 256-bit AVX variant (1) or not (0).
; @param    3       Number of bytes for the encoding of the SSE variant + ret instruction (AVX is fixed to 6).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC,
;                   or IEMMEDIAF2YMMSRC for the 256-bit variant (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, %3
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128


BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        vmovdqu ymm0, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpps, 1, 5
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmppd, 1, 6
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpss, 0, 6
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpsd, 0, 6
7098
7099
;;
; SSE/AVX instructions with 2 full sized operands and an 8-bit immediate of the form
;    xxx xmm1, xmm2, imm8.
;    vxxx xmm1, xmm2, imm8
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; Every worker is followed by 256 stubs (.imm0 thru .imm255), one per
; immediate value.  Each stub is the 6 byte instruction, a ret and an int3
; filler, which gives the stub size of 8 passed to
; IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; The VEX-encoded workers use the AVX prologue/epilogue pair like the other
; VEX-encoded workers.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the second media register size operand (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        movdqu  xmm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        movdqu  xmm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        vmovdqu ymm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundpd
7194
7195
;;
; SSE/AVX instructions with 3 full sized operands and an 8-bit immediate of the form
;    xxx xmm1, xmm2, imm8.
;    vxxx xmm1, xmm2, xmm3, imm8
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; Every worker is followed by 256 stubs (.imm0 thru .imm255), one per
; immediate value.  Each stub is the 6 byte instruction, a ret and an int3
; filler, which gives the stub size of 8 passed to
; IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; The SSE worker passes uSrc1 in xmm0 as the destination operand.  The
; VEX-encoded workers load the sources into xmm1/xmm2 (ymm1/ymm2) and produce
; the result in xmm0 (ymm0).  They use the AVX prologue/epilogue pair like
; the other VEX-encoded workers.
;
; @param    1       The instruction name.
; @param    2       Flag whether to emit a 256-bit AVX variant (1) or not (0).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC/IEMMEDIAF2YMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128


BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u128


 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                        ; must clear top bits
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        vmovdqu ymm2, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm1, ymm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundss, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundsd, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dpps,    1
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dppd,    0
7300
7301
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; The 128-bit source is read from [A2] and the 64-bit MMX result is written
; to [A1].
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]                      ; Unaligned load of the 128-bit source.
        %1      mm0, xmm0
        movq    qword [A1], mm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
7332
;;
; SSE instructions of the form
;    xxx xmm, mm/m64.
; and we need to load and save the MXCSR register.
;
; The current destination value is loaded from [A1] first, so whatever part
; of the register the instruction leaves alone is written back unchanged.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    mm0, A2                         ; Source value, passed by value.
        movdqu  xmm0, [A1]                      ; Current destination value.
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
7364
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; The 64-bit source arrives by value in A2 and is moved into the low half of
; xmm0; the 64-bit MMX result is written to [A1].
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm0, A2
        %1      mm0, xmm0
        movq    qword [A1], mm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
7395
;;
; All forms of RDRAND and RDSEED
;
; Executes the instruction on the host, stores whatever it left in the
; register to [A0] and records CF in the EFLAGS value at [A1].
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register the instruction writes to (ax, eax or rax).
; @param    3       The operand width in bits; forms the _u<bits> name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2                        ; mov does not modify EFLAGS, CF stays intact for the save below.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF

        EPILOGUE_2_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
7420
7421
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The immediate has to be encoded in the instruction, so the worker is
; followed by 256 stubs (.imm0 thru .imm255), each being the instruction with
; one immediate value plus a ret.  The stub for A2 is reached via
; IEMIMPL_CALL_JUMP_TABLE_TARGET, which is given the stub size of 6.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]                      ; Second source.
        movdqu  xmm0, [A0]                      ; Destination / first source.
        movzx   A2, A2_8                        ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_sha1rnds4_u128
7453
7454
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the constants
; at [A2] are loaded into xmm0 and the two explicit operands into xmm1/xmm2.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu      xmm2, [A1]                  ; Second source.
        movdqu      xmm1, [A0]                  ; Destination / first source.
        movdqu      xmm0, [A2]                  ; Implicit operand.
        sha256rnds2 xmm1, xmm2
        movdqu      [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_sha256rnds2_u128
7477
7478
;;
; 32-bit forms of ADCX and ADOX
;
; The selected flag is taken from the incoming EFLAGS, the instruction adds
; the destination at [A1] (plus that flag) to the source in A2_32, and the
; sum is stored back to [A1].
;
; @param    1       The instruction name.
; @param    2       The EFLAGS bit to load before and save after the instruction.
;
; @returns  Updated EFLAGS.
; @param    A0      Incoming EFLAGS value (input).
; @param    A1      Pointer to the destination operand (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A0_32, %2, 0
        %1      A2_32, dword [A1]
        mov     dword [A1], A2_32               ; mov does not modify EFLAGS.
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0

        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32
%endmacro
7499
;;
; 64-bit forms of ADCX and ADOX
;
; The selected flag is taken from the incoming EFLAGS, the instruction adds
; the destination at [A1] (plus that flag) to the source in A2, and the sum
; is stored back to [A1].
;
; @param    1       The instruction name.
; @param    2       The EFLAGS bit to load before and save after the instruction.
;
; @returns  Updated EFLAGS.
; @param    A0      Incoming EFLAGS value (input).
; @param    A1      Pointer to the destination operand (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A0_32, %2, 0
        %1      A2, qword [A1]
        mov     qword [A1], A2                  ; mov does not modify EFLAGS.
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0

        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64
%endmacro

; ADCX works on the carry flag, ADOX on the overflow flag.
IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
7526
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette