VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 105409

Last change on this file since 105409 was 105355, checked in by vboxsync, 7 months ago

VMM/IEM: Implement vcvtdq2pd instruction emulation, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 220.8 KB
Line 
1; $Id: IEMAllAImpl.asm 105355 2024-07-16 12:52:31Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
; This is handy for generating absolutely correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
;;
; RET XX / RET wrapper for fastcall.
;
; Only the 32-bit Windows fastcall convention requires the callee to pop the
; stack arguments ('ret %1'); on all other targets a plain 'ret' is emitted
; (on AMD64 these arguments travel in registers anyway).
;
; @param 1      Number of argument bytes to pop on x86/Windows.
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1                      ; fastcall: callee removes stack arguments.
 %else
        ret                             ; other 32-bit hosts: no callee stack cleanup here.
 %endif
%else
        ret                             ; AMD64: arguments are in registers.
%endif
%endmacro
60
;;
; NAME for fastcall functions.
;
; On 32-bit Windows this expands to the decorated fastcall symbol name,
; <prefix><name>@<cbArgs>; everywhere else the prefix and argument byte
; count are ignored and the regular NAME() mangling is used.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
; escaping (or whatever the dollar is good for here). Thus the ugly
; prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
75
;;
; BEGINPROC for fastcall functions.
;
; Declares a hidden global function symbol using the fastcall name decoration
; and emits an ENDBR instruction where IBT branch protection is enabled.
;
; @param 1 The function name (C).
; @param 2 The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
86
87
;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: up to four arguments arrive in registers (A0..A3 below), so the
 ; prologues have nothing to do and the epilogues are plain returns.  The
 ; _EX variants take the x86 stack-argument byte count so shared code can
 ; pass it unconditionally; it is ignored here.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1            ; Takes the byte count like its siblings (was 0 params; x86 variant requires 1).
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 argument registers: rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 argument registers: rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Temporary/scratch registers (all volatile in both 64-bit conventions).
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

 ;
 ; Return value, same as T0 but to make it more obvious
 ; that this is a return value.
 ;
 %define R0         rax
 %define R0_32      eax
 %define R0_16      ax
 %define R0_8       al

%else
 ; x86: fastcall style - first two arguments in ecx/edx (A0/A1), the rest
 ; are fetched from the stack into callee-saved registers, which therefore
 ; must be preserved in the prologues and restored in the epilogues.
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; preserve T1 (edi) for the caller.
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1                      ; callee removes %1 bytes of stack arguments.
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; preserve T1 (edi).
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; preserve A2 (ebx).
        mov     ebx, [esp + 4 + 4]      ; 3rd argument (above return address + saved ebx).
        push    edi                     ; preserve T1 (edi).
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx                     ; preserve A2 (ebx).
        push    edi                     ; preserve T1 (edi).
        push    esi                     ; preserve A3 (esi).
        mov     ebx, [esp + 12 + 4 + 0] ; 3rd argument (above 3 saved regs + return address).
        mov     esi, [esp + 12 + 4 + 4] ; 4th argument.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si
 ; No A3_8: esi has no 8-bit alias.

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
 ; No T1_8: edi has no 8-bit alias.
%endif
287
288
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Strategy: when only one or two flags need loading, synthesize them cheaply
; (bt / shl+add / sahf) instead of the slow pushf-merge-popf sequence.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                           ; store current flags
        mov     T0_32, %1               ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      %1, X86_EFL_CF_BIT
   %else
        ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
        mov     eax, %1
    %if (%3 | %4) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
    %else ; OF not needed; so al -> ah and load ah into eflags.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        pushf                           ; store current flags
        mov     T0_32, %1               ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
  %endif
 %endif
%endmacro
346
;;
; Load the relevant flags from [%1].
;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: always loads the requested
; flags, using the same cheap bt / shl+add / sahf tricks where possible.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                           ; store current flags
        mov     T0_32, %1               ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      %1, X86_EFL_CF_BIT
  %else
        mov     eax, %1                 ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
   %else ; OF not needed; so al -> ah and load ah into eflags.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        pushf                           ; store current flags
        mov     T0_32, %1               ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 %endif
%endmacro
400
;;
; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
;
; The CPU status flags produced by the emulated instruction are extracted
; (setcc / lahf / pushf depending on which flags are needed) and combined
; with the unmodified guest flags; the result is left in eax for return.
;
; @remarks Clobbers T0, T1, %1, stack.
; @param 1 The parameter (A0..A3) holding the OLD eflags value. Clobbered.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_RETVAL 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, %1               ; flags  (NOTE(review): this T1 copy looks unused below - leftover from the pointer-based variant? confirm)
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                    ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf                            ; All wanted flags live in AH (low 8 bits of EFLAGS).
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
   %elif 1                              ; The locked functions are generally faster on 10980XE with this approach
        lahf                            ; while there seems only to be a tiny advantage in most other tests.
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
        jno     .of_is_clear            ; NOTE(review): a %%-local label would be safer if this can expand twice per proc - confirm.
        or      eax, X86_EFL_OF
.of_is_clear:
   %else
        pushf                           ; this is a bit slow
        pop     T0
   %endif
        and     %1, ~(%2 | %3 | %4)     ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
  %endif
        or      T0_32, %1               ; combine the flags. ASSUMES T0 = eax!
        ;mov     %1, T0_32              ; save the flags.
 %endif
%endmacro
447
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
;; @todo make it stuff the result into EAX?
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the emulated instruction.
        mov     T0_32, %1               ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     %1, T0_32               ; save the result.
 %endif
%endmacro
472
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6), storing the result into EAX (T0).
;
; @note %4 & %6 must not be RAX, EAX, or AX! So, don't use with full MUL/IMUL.
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS, %1.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
        pushf
        pop     T0                      ; T0 = host EFLAGS after the emulated instruction.
        and     %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
        and     T0_32, (%2)             ; select the modified flags.
        or      T0_32, %1               ; combine the flags.

        ; First calculate SF as it is the same register as %6 (only %6 is always full width).
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T0_32, X86_EFL_SF
%%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T1, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T1 + %6]         ; OR in the precalculated PF value.
 %else
        or      T0_8, [NAME(g_afParity) + %6]
 %endif

        ;mov     %1, T0_32              ; save the result.
        ; ASSUMES T0 = eax!
%endmacro
511
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, %1               ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     %1, T0_32               ; Save the result.
 %endif
%endmacro
532
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, %1               ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Index by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]         ; OR in the precalculated PF value.
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     %1, T0_32               ; Save the result.
%endmacro
557
558
559;;;; OLD EFLAGS macros.
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Pointer-based (old) variant of IEM_MAYBE_LOAD_FLAGS: %1 points to the
; eflags in memory rather than holding the value.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      dword [%1], X86_EFL_CF_BIT
   %else
        ; Use ADD to set OF and SAHF for the rest. ASSUMES T0_32 is eax!
        mov     eax, [%1]
    %if (%3 | %4) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
    %else ; OF not needed; so al -> ah and load ah into eflags.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
  %endif
 %endif
%endmacro
622
;;
; Load the relevant flags from [%1].
;
; Pointer-based (old) variant of IEM_LOAD_FLAGS.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS_OLD 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS) ; mask out the modified, undefined and status flags
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Use bt to load bit into CF
        bt      dword [%1], X86_EFL_CF_BIT
  %else
        mov     eax, [%1]               ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Use ADD to set OF.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; Use ADD to set OF.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        ; Use SAHF to set the other status flags.
        sahf
   %else ; OF not needed; so al -> ah and load ah into eflags.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 %endif
%endmacro
676
;;
; Update the flags.
;
; Pointer-based (old) variant: merges the status flags produced by the host
; CPU into the guest eflags at [%1].
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_OLD 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, [%1]             ; flags
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS) ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS) ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                    ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf                            ; All wanted flags live in AH (low 8 bits of EFLAGS).
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
   %elif 1                              ; The locked functions are generally faster on 10980XE with this approach
        lahf                            ; while there seems only to be a tiny advantage in most other tests.
        movzx   eax, ah                 ; ASSUMES T0_32 is eax!
        jno     .of_is_clear
        or      eax, X86_EFL_OF
.of_is_clear:
   %else
        pushf                           ; this is a bit slow
        pop     T0
   %endif
        and     T1_32, ~(%2 | %3 | %4)  ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
  %endif
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
723
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the emulated instruction.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
747
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host EFLAGS (T0/xAX holds the instruction result).
 %else
        push    T0                      ; no T2 on x86: save T0/xAX and borrow it for the EFLAGS.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the saved result register.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
%%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]         ; OR in the precalculated PF value.
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
801
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS_OLD 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
822
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Index by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]         ; OR in the precalculated PF value.
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
847
848
849
;;
; Loads register with offset of imm8 instruction -- used by all of the instruction
; implementations which lay out jump tables of 256x immediate byte variants.
; Also checks that the instruction size matches the offsets in the table.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
; @note Implicitly uses local symbols .imm0, .imm1, and .immEnd
; (implementation artifacts of each instruction jump table).
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`,
; using lea chains for stride factors that x86 addressing cannot scale by
; directly (only 1, 2, 4 and 8 are encodable).
;
%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
        lea     %1, [.imm0 xWrtRIP]
 %if %3 == 5
        lea     T0, [%2 + %2*4]         ; *5
        lea     %1, [%1 + T0]           ; *5 + .imm0
 %elif %3 == 6
        lea     T0, [%2 + %2*2]         ; *3
        lea     %1, [%1 + T0*2]         ; *6 + .imm0
 %elif %3 == 7
        lea     T0, [%2 + %2*2]         ; *3
        lea     T0, [T0 + %2*4]         ; *7
        lea     %1, [%1 + T0]           ; *7 + .imm0
 %elif %3 == 8
        lea     %1, [%1 + %2*8]         ; *8 + .imm0
 %elif %3 == 9
        lea     T0, [%2 + %2*8]         ; *9
        lea     %1, [%1 + T0]           ; *9 + .imm0
 %elif %3 == 10
        lea     T0, [%2 + %2*4]         ; *5
        lea     %1, [%1 + T0*2]         ; *10 + .imm0
 %elif %3 == 11
        lea     T0, [%2 + %2*4]         ; *5
        lea     T0, [%2 + T0*2]         ; *11
        lea     %1, [%1 + T0]           ; *11 + .imm0
 %elif %3 == 12
        lea     T0, [%2 + %2*2]         ; *3
        lea     %1, [%1 + T0*4]         ; *12 + .imm0
 %else
  %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
 %endif
        ; check size: 'warning: value does not fit in 8 bit field' if bad
        ; (the times count is zero - emitting nothing - when .imm1 - .imm0 == %3)
        times (.imm1 - .imm0 + %3) %% %3 db 999 * \
              (.imm1 - .imm0 + %3)
        ; check alignment: 'warning: value does not fit in 8 bit field' if bad
        ; (zero bytes emitted when the table is exactly 256 entries of %3 bytes)
        times ((.immEnd - .imm0) - 256 * %3) db 999 * \
              ((.immEnd - .imm0) - 256 * %3)
%endmacro
900
;;
; Wrapper around IEMIMPL_JUMP_TABLE_TARGET_INT that widens the per-entry
; stride by 4 bytes when IBT branch protection without NOTRACK is enabled
; (presumably to cover a 4-byte ENDBR per table entry - confirm).
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
%macro IEMIMPL_JUMP_TABLE_TARGET 3
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
 %else
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
 %endif
%endmacro
908
909
;;
; Calls the given imm8 instruction -- used by all of the instruction
; implementations which lay out jump tables of 256x immediate byte variants.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]` +
; `IBT_NOTRACK, call %1`.
;
%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
        IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3
        IBT_NOTRACK
        call    %1
%endmacro
926
927
928;*********************************************************************************************************************************
929;* External Symbols *
930;*********************************************************************************************************************************
931extern NAME(g_afParity)
932
933
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take the incoming eflags value in A0_32, a pointer to the
; destination memory operand in A1 and the source register operand in A2; the
; updated eflags value is calculated into eax (see IEM_SAVE_FLAGS_RETVAL).
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
; @param 5 The flags that must be loaded (ADC, SBB).
; @param 6 The flags that will be zeroed by the operation.
;
%macro IEMIMPL_BIN_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        %1      byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        lock %1 byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS        A0_32, %3, %4, %5
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL       A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
1024
;
; Binary operator instantiations.
; instr, lock, modified-flags, undefined flags, must-be-loaded flags, zeroed flags
;
IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP or, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP xor, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP and, 1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1035
1036
1037;;
1038; Macro for implementing a binary operator, VEX variant with separate input/output.
1039;
1040; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1041; where the 64-bit accesses requires hand coding.
1042;
1043; All the functions takes a pointer to the destination memory operand in A0,
1044; the first source register operand in A1, the second source register operand
1045; in A2 and a pointer to eflags in A3.
1046;
1047; @param 1 The instruction mnemonic.
1048; @param 2 The modified flags.
1049; @param 3 The undefined flags.
1050; @param 4 The zeroed flags.
1051;
1052%macro IEMIMPL_VEX_BIN_OP 4
1053BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1054 PROLOGUE_4_ARGS
1055 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
1056 %1 T0_32, A1_32, A2_32
1057 mov [A0], T0_32
1058 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1059 EPILOGUE_4_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u32
1061
1062 %ifdef RT_ARCH_AMD64
1063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1064 PROLOGUE_4_ARGS
1065 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0
1066 %1 T0, A1, A2
1067 mov [A0], T0
1068 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1069 EPILOGUE_4_ARGS
1070ENDPROC iemAImpl_ %+ %1 %+ _u64
1071 %endif ; RT_ARCH_AMD64
1072%endmacro
1073
; Instantiations:  instr, modified-flags, undefined-flags, zeroed-flags
IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
1078
1079;;
1080; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
1081;
1082; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1083; where the 64-bit accesses requires hand coding.
1084;
1085; All the functions takes a pointer to the destination memory operand in A1,
1086; the source register operand in A2 and incoming EFLAGS in A0. Updated EFLAGS
1087; are returned in EAX.
1088;
1089; @param 1 The instruction mnemonic.
1090; @param 2 The modified flags.
1091; @param 3 The undefined flags.
1092; @param 4 The zeroed flags.
1093;
1094%macro IEMIMPL_VEX_BIN_OP_2 4
1095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1096 PROLOGUE_4_ARGS
1097 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
1098 mov T0_32, [A1]
1099 %1 T0_32, A2_32
1100 mov [A1], T0_32
1101 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1102 EPILOGUE_4_ARGS
1103ENDPROC iemAImpl_ %+ %1 %+ _u32
1104
1105 %ifdef RT_ARCH_AMD64
1106BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1107 PROLOGUE_4_ARGS
1108 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1109 mov T0, [A1]
1110 %1 T0, A2
1111 mov [A1], T0
1112 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1113 EPILOGUE_4_ARGS
1114ENDPROC iemAImpl_ %+ %1 %+ _u64
1115 %endif ; RT_ARCH_AMD64
1116%endmacro
1117
; Instantiations:  instr, modified-flags, undefined-flags, zeroed-flags
IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1122
1123
1124;;
1125; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
1126;
1127; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
1128; where the 64-bit accesses requires hand coding.
1129;
1130; All the functions takes a pointer to the destination memory operand in A0,
1131; the first source register operand in A1, the second source register operand
1132; in A2 and a pointer to eflags in A3.
1133;
1134; @param 1 The instruction mnemonic.
1135; @param 2 Fallback instruction if applicable.
1136; @param 3 Whether to emit fallback or not.
1137;
1138%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
1139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1140 PROLOGUE_3_ARGS
1141 %1 T0_32, A1_32, A2_32
1142 mov [A0], T0_32
1143 EPILOGUE_3_ARGS
1144ENDPROC iemAImpl_ %+ %1 %+ _u32
1145
1146 %if %3
1147BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
1148 PROLOGUE_3_ARGS
1149 %ifdef ASM_CALL64_GCC
1150 mov cl, A2_8
1151 %2 A1_32, cl
1152 mov [A0], A1_32
1153 %else
1154 xchg A2, A0
1155 %2 A1_32, cl
1156 mov [A2], A1_32
1157 %endif
1158 EPILOGUE_3_ARGS
1159ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
1160 %endif
1161
1162 %ifdef RT_ARCH_AMD64
1163BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1164 PROLOGUE_3_ARGS
1165 %1 T0, A1, A2
1166 mov [A0], T0
1167 EPILOGUE_3_ARGS
1168ENDPROC iemAImpl_ %+ %1 %+ _u64
1169
1170 %if %3
1171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
1172 PROLOGUE_3_ARGS
1173 %ifdef ASM_CALL64_GCC
1174 mov cl, A2_8
1175 %2 A1, cl
1176 mov [A0], A1_32
1177 %else
1178 xchg A2, A0
1179 %2 A1, cl
1180 mov [A2], A1_32
1181 %endif
1182 mov [A0], A1
1183 EPILOGUE_3_ARGS
1184ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
1185 %endif
1186 %endif ; RT_ARCH_AMD64
1187%endmacro
1188
; Instantiations:  instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0   ; no asm fallback emitted (%3 = 0)
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0   ; no asm fallback emitted (%3 = 0)
1195
1196
1197;
1198; RORX uses a immediate byte for the shift count, so we only do
1199; fallback implementation of that one.
1200;
1201BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
1202 PROLOGUE_3_ARGS
1203 %ifdef ASM_CALL64_GCC
1204 mov cl, A2_8
1205 ror A1_32, cl
1206 mov [A0], A1_32
1207 %else
1208 xchg A2, A0
1209 ror A1_32, cl
1210 mov [A2], A1_32
1211 %endif
1212 EPILOGUE_3_ARGS
1213ENDPROC iemAImpl_rorx_u32
1214
 %ifdef RT_ARCH_AMD64
; 64-bit RORX fallback; same scheme as the 32-bit variant above.
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; Rotate count into cl.
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0                  ; A0 is rcx here; count lands in cl, dst pointer in A2.
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
1230
1231
1232;
1233; MULX
1234;
1235BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
1236 PROLOGUE_4_ARGS
1237%ifdef ASM_CALL64_GCC
1238 ; A2_32 is EDX - prefect
1239 mulx T0_32, T1_32, A3_32
1240 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
1241 mov [A0], T0_32
1242%else
1243 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
1244 xchg A1, A2
1245 mulx T0_32, T1_32, A3_32
1246 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
1247 mov [A0], T0_32
1248%endif
1249 EPILOGUE_4_ARGS
1250ENDPROC iemAImpl_mulx_u32
1251
1252
; Fallback for CPUs without BMI2: use the legacy MUL (EDX:EAX = EAX * src).
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
1271
%ifdef RT_ARCH_AMD64
; 64-bit MULX; mirrors the 32-bit variants above with RDX as implicit source.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect, it is mulx's implicit source already.
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


; Fallback for CPUs without BMI2: legacy MUL (RDX:RAX = RAX * src).
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
1311
1312
1313;;
1314; Macro for implementing a bit operator.
1315;
1316; This will generate code for the 16, 32 and 64 bit accesses with locked
1317; variants, except on 32-bit system where the 64-bit accesses requires hand
1318; coding.
1319;
1320; All the functions takes a pointer to the destination memory operand in A1,
1321; the source register operand in A2 and incoming eflags in A0.
1322;
1323; @param 1 The instruction mnemonic.
1324; @param 2 Non-zero if there should be a locked version.
1325; @param 3 The modified flags.
1326; @param 4 The undefined flags.
1327;
1328%macro IEMIMPL_BIT_OP 4
1329BEGINCODE
1330BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1331 PROLOGUE_3_ARGS
1332 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1333 %1 word [A1], A2_16
1334 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1335 EPILOGUE_3_ARGS
1336ENDPROC iemAImpl_ %+ %1 %+ _u16
1337
1338BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1339 PROLOGUE_3_ARGS
1340 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1341 %1 dword [A1], A2_32
1342 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1343 EPILOGUE_3_ARGS
1344ENDPROC iemAImpl_ %+ %1 %+ _u32
1345
1346 %ifdef RT_ARCH_AMD64
1347BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1348 PROLOGUE_3_ARGS
1349 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1350 %1 qword [A1], A2
1351 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1352 EPILOGUE_3_ARGS_EX 8
1353ENDPROC iemAImpl_ %+ %1 %+ _u64
1354 %endif ; RT_ARCH_AMD64
1355
1356 %if %2 != 0 ; locked versions requested?
1357
1358BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
1359 PROLOGUE_3_ARGS
1360 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1361 lock %1 word [A1], A2_16
1362 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1363 EPILOGUE_3_ARGS
1364ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1365
1366BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1367 PROLOGUE_3_ARGS
1368 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1369 lock %1 dword [A1], A2_32
1370 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1371 EPILOGUE_3_ARGS
1372ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1373
1374 %ifdef RT_ARCH_AMD64
1375BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1376 PROLOGUE_3_ARGS
1377 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1378 lock %1 qword [A1], A2
1379 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1380 EPILOGUE_3_ARGS_EX 8
1381ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1382 %endif ; RT_ARCH_AMD64
1383 %endif ; locked
1384%endmacro
1385
; Undefined flags are passed thru here by the intel and amd CPUs we have.
; Instantiations:  instr, lock, modified-eflags, undefined-eflags
IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1392
1393;;
1394; Macro for implementing a bit search operator.
1395;
1396; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1397; system where the 64-bit accesses requires hand coding.
1398;
1399; All the functions takes a pointer to the destination memory operand in A1,
1400; the source register operand in A2 and the incoming eflags in A0.
1401;
1402; In the ZF case the destination register is 'undefined', however it seems that
1403; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
1404; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1405; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1406; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1407;
1408; Intel: Clear all and calculate PF in addition to ZF.
1409; AMD: Passthru all flags other than ZF.
1410;
1411; @param 1 The instruction mnemonic.
1412; @param 2 The modified flags.
1413; @param 3 The undefined flags.
1414; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1415;
1416%macro IEMIMPL_BIT_OP2 4
1417BEGINCODE
1418; 16-bit
1419
1420BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1421 PROLOGUE_3_ARGS
1422 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1423 %1 T0_16, A2_16
1424%if %4 != 0
1425 jz .unchanged_dst
1426%endif
1427 mov [A1], T0_16
1428.unchanged_dst:
1429 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1430 EPILOGUE_3_ARGS
1431ENDPROC iemAImpl_ %+ %1 %+ _u16
1432
1433;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
1434;bad; PROLOGUE_3_ARGS
1435;bad; %1 T1_16, A1_16
1436;bad; jz .unchanged_dst
1437;bad; mov [A0], T1_16
1438;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1439;bad; EPILOGUE_3_ARGS
1440;bad;.unchanged_dst:
1441;bad;%if %4 != 0
1442;bad; mov [A0], T1_16
1443;bad;%endif
1444;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1445;bad; EPILOGUE_3_ARGS
1446;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
1447;bad;
1448;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
1449;bad; PROLOGUE_3_ARGS
1450;bad; %1 T0_16, A1_16
1451;bad;%if %4 != 0
1452;bad; jz .unchanged_dst
1453;bad;%endif
1454;bad; mov [A0], T0_16
1455;bad;.unchanged_dst:
1456;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1457;bad; EPILOGUE_3_ARGS
1458;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1459
1460; 32-bit
1461
1462BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1463 PROLOGUE_3_ARGS
1464 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1465 %1 T0_32, A2_32
1466%if %4 != 0
1467 jz .unchanged_dst
1468%endif
1469 mov [A1], T0_32
1470.unchanged_dst:
1471 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1472 EPILOGUE_3_ARGS
1473ENDPROC iemAImpl_ %+ %1 %+ _u32
1474
1475;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1476;bad; PROLOGUE_3_ARGS
1477;bad; %1 T1_32, A1_32
1478;bad;%if %4 != 0
1479;bad; jz .unchanged_dst
1480;bad;%endif
1481;bad; mov [A0], T1_32
1482;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1483;bad; EPILOGUE_3_ARGS
1484;bad;.unchanged_dst:
1485;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1486;bad; EPILOGUE_3_ARGS
1487;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1488;bad;
1489;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1490;bad; PROLOGUE_3_ARGS
1491;bad; %1 T0_32, A1_32
1492;bad;%if %4 != 0
1493;bad; jz .unchanged_dst
1494;bad;%endif
1495;bad; mov [A0], T0_32
1496;bad;.unchanged_dst:
1497;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1498;bad; EPILOGUE_3_ARGS
1499;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1500
1501
1502 %ifdef RT_ARCH_AMD64
1503; 64-bit
1504
1505BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1506 PROLOGUE_3_ARGS
1507 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1508 %1 T0, A2
1509%if %4 != 0
1510 jz .unchanged_dst
1511%endif
1512 mov [A1], T0
1513.unchanged_dst:
1514 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1515 EPILOGUE_3_ARGS_EX 8
1516ENDPROC iemAImpl_ %+ %1 %+ _u64
1517
1518;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1519;bad; PROLOGUE_3_ARGS
1520;bad; %1 T1, A1
1521;bad;%if %4 != 0
1522;bad; jz .unchanged_dst
1523;bad;%endif
1524;bad; mov [A0], T1
1525;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1526;bad; EPILOGUE_3_ARGS
1527;bad;.unchanged_dst:
1528;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1529;bad; EPILOGUE_3_ARGS
1530;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1531;bad;
1532;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1533;bad; PROLOGUE_3_ARGS
1534;bad; %1 T0, A1
1535;bad;%if %4 != 0
1536;bad; jz .unchanged_dst
1537;bad;%endif
1538;bad; mov [A0], T0
1539;bad;.unchanged_dst:
1540;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1541;bad; EPILOGUE_3_ARGS_EX 8
1542;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1543
1544 %endif ; RT_ARCH_AMD64
1545%endmacro
1546
; Instantiations:  instr, modified-flags, undefined-flags, dst-unwritten-on-ZF
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1551
1552
1553;;
1554; Macro for implementing POPCNT.
1555;
1556; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1557; system where the 64-bit accesses requires hand coding.
1558;
1559; All the functions takes a pointer to the destination memory operand in A1,
1560; the source register operand in A2 and eflags in A0.
1561;
1562; ASSUMES Intel and AMD set EFLAGS the same way.
1563;
1564; ASSUMES the instruction does not support memory destination.
1565;
1566; @param 1 The instruction mnemonic.
1567; @param 2 The modified flags.
1568; @param 3 The undefined flags.
1569; @param 4 The zeroed flags.
1570;
1571%macro IEMIMPL_BIT_OP3 4
1572BEGINCODE
1573BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1574 PROLOGUE_3_ARGS
1575 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1576 %1 T0_16, A2_16
1577 mov [A1], T0_16
1578 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1579 EPILOGUE_3_ARGS
1580ENDPROC iemAImpl_ %+ %1 %+ _u16
1581
1582BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1583 PROLOGUE_3_ARGS
1584 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1585 %1 T0_32, A2_32
1586 mov [A1], T0_32
1587 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1588 EPILOGUE_3_ARGS
1589ENDPROC iemAImpl_ %+ %1 %+ _u32
1590
1591 %ifdef RT_ARCH_AMD64
1592BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1593 PROLOGUE_3_ARGS
1594 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1595 %1 T0, A2
1596 mov [A1], T0
1597 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1598 EPILOGUE_3_ARGS_EX 8
1599ENDPROC iemAImpl_ %+ %1 %+ _u64
1600 %endif ; RT_ARCH_AMD64
1601%endmacro
1602IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1603
1604
1605;
1606; IMUL is also a similar but yet different case (no lock, no mem dst).
1607; The rDX:rAX variant of imul is handled together with mul further down.
1608;
1609BEGINCODE
1610; @param 1 EFLAGS that are modified.
1611; @param 2 Undefined EFLAGS.
1612; @param 3 Function suffix.
1613; @param 4 EFLAGS variation: 0 for native, 1 for intel,
1614; 2 for AMD (set AF, clear PF, ZF and SF).
1615%macro IEMIMPL_IMUL_TWO 4
1616BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1617 PROLOGUE_3_ARGS
1618 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1619 imul A2_16, word [A1]
1620 mov [A1], A2_16
1621 %if %4 != 1
1622 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1623 %else
1624 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
1625 %endif
1626 EPILOGUE_3_ARGS
1627ENDPROC iemAImpl_imul_two_u16 %+ %3
1628
1629BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1630 PROLOGUE_3_ARGS
1631 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1632 imul A2_32, dword [A1]
1633 mov [A1], A2_32
1634 %if %4 != 1
1635 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1636 %else
1637 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
1638 %endif
1639 EPILOGUE_3_ARGS
1640ENDPROC iemAImpl_imul_two_u32 %+ %3
1641
1642 %ifdef RT_ARCH_AMD64
1643BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1644 PROLOGUE_3_ARGS
1645 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1646 imul A2, qword [A1]
1647 mov [A1], A2
1648 %if %4 != 1
1649 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1650 %else
1651 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
1652 %endif
1653 EPILOGUE_3_ARGS_EX 8
1654ENDPROC iemAImpl_imul_two_u64 %+ %3
1655 %endif ; RT_ARCH_AMD64
1656%endmacro
1657; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
1658; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
1659; clear AF and ZF and calculates SF and PF as per the lower half of the result.
1660IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1661IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1662IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1663
1664
1665;
1666; XCHG for memory operands. This implies locking. No flag changes.
1667;
1668; Each function takes two arguments, first the pointer to the memory,
1669; then the pointer to the register. They all return void.
1670;
1671BEGINCODE
1672BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1673 PROLOGUE_2_ARGS
1674 mov T0_8, [A1]
1675 xchg [A0], T0_8
1676 mov [A1], T0_8
1677 EPILOGUE_2_ARGS
1678ENDPROC iemAImpl_xchg_u8_locked
1679
1680BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1681 PROLOGUE_2_ARGS
1682 mov T0_16, [A1]
1683 xchg [A0], T0_16
1684 mov [A1], T0_16
1685 EPILOGUE_2_ARGS
1686ENDPROC iemAImpl_xchg_u16_locked
1687
1688BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1689 PROLOGUE_2_ARGS
1690 mov T0_32, [A1]
1691 xchg [A0], T0_32
1692 mov [A1], T0_32
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_xchg_u32_locked
1695
1696%ifdef RT_ARCH_AMD64
1697BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1698 PROLOGUE_2_ARGS
1699 mov T0, [A1]
1700 xchg [A0], T0
1701 mov [A1], T0
1702 EPILOGUE_2_ARGS
1703ENDPROC iemAImpl_xchg_u64_locked
1704%endif
1705
; Unlocked variants for fDisregardLock mode: plain load/store swap of the
; two operands, no atomic xchg and thus no implicit LOCK.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; T1 = old memory value.
        mov     T0_8, [A1]              ; T0 = old register value.
        mov     [A1], T1_8              ; Register gets old memory value.
        mov     [A0], T0_8              ; Memory gets old register value.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]
        mov     T0_16, [A1]
        mov     [A1], T1_16
        mov     [A0], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]
        mov     T0_32, [A1]
        mov     [A1], T1_32
        mov     [A0], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]
        mov     T0, [A1]
        mov     [A1], T1
        mov     [A0], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1745
1746
1747;
1748; XADD for memory operands.
1749;
1750; Each function takes three arguments, first the pointer to the
1751; memory/register, then the pointer to the register, and finally a pointer to
1752; eflags. They all return void.
1753;
1754BEGINCODE
1755BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1756 PROLOGUE_3_ARGS
1757 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1758 mov T0_8, [A1]
1759 xadd [A0], T0_8
1760 mov [A1], T0_8
1761 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1762 EPILOGUE_3_ARGS
1763ENDPROC iemAImpl_xadd_u8
1764
1765BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1766 PROLOGUE_3_ARGS
1767 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1768 mov T0_16, [A1]
1769 xadd [A0], T0_16
1770 mov [A1], T0_16
1771 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1772 EPILOGUE_3_ARGS
1773ENDPROC iemAImpl_xadd_u16
1774
1775BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1778 mov T0_32, [A1]
1779 xadd [A0], T0_32
1780 mov [A1], T0_32
1781 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1782 EPILOGUE_3_ARGS
1783ENDPROC iemAImpl_xadd_u32
1784
1785%ifdef RT_ARCH_AMD64
1786BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1787 PROLOGUE_3_ARGS
1788 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1789 mov T0, [A1]
1790 xadd [A0], T0
1791 mov [A1], T0
1792 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1793 EPILOGUE_3_ARGS
1794ENDPROC iemAImpl_xadd_u64
1795%endif ; RT_ARCH_AMD64
1796
; Locked XADD variants - same as above but with the LOCK prefix on the xadd.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8            ; Atomic: *pMem += T0; T0 = old *pMem.
        mov     [A1], T0_8
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD        A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS_OLD              A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1838
1839
1840;
1841; CMPXCHG8B.
1842;
1843; These are tricky register wise, so the code is duplicated for each calling
1844; convention.
1845;
1846; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1847;
1848; C-proto:
1849; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1850; uint32_t *pEFlags));
1851;
1852; Note! Identical to iemAImpl_cmpxchg16b.
1853;
1854BEGINCODE
1855BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1856%ifdef RT_ARCH_AMD64
1857 %ifdef ASM_CALL64_MSC
1858 push rbx
1859
1860 mov r11, rdx ; pu64EaxEdx (is also T1)
1861 mov r10, rcx ; pu64Dst
1862
1863 mov ebx, [r8]
1864 mov ecx, [r8 + 4]
1865 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1866 mov eax, [r11]
1867 mov edx, [r11 + 4]
1868
1869 cmpxchg8b [r10]
1870
1871 mov [r11], eax
1872 mov [r11 + 4], edx
1873 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1874
1875 pop rbx
1876 ret
1877 %else
1878 push rbx
1879
1880 mov r10, rcx ; pEFlags
1881 mov r11, rdx ; pu64EbxEcx (is also T1)
1882
1883 mov ebx, [r11]
1884 mov ecx, [r11 + 4]
1885 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1886 mov eax, [rsi]
1887 mov edx, [rsi + 4]
1888
1889 cmpxchg8b [rdi]
1890
1891 mov [rsi], eax
1892 mov [rsi + 4], edx
1893 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1894
1895 pop rbx
1896 ret
1897
1898 %endif
1899%else
1900 push esi
1901 push edi
1902 push ebx
1903 push ebp
1904
1905 mov edi, ecx ; pu64Dst
1906 mov esi, edx ; pu64EaxEdx
1907 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1908 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1909
1910 mov ebx, [ecx]
1911 mov ecx, [ecx + 4]
1912 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1913 mov eax, [esi]
1914 mov edx, [esi + 4]
1915
1916 cmpxchg8b [edi]
1917
1918 mov [esi], eax
1919 mov [esi + 4], edx
1920 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1921
1922 pop ebp
1923 pop ebx
1924 pop edi
1925 pop esi
1926 ret 8
1927%endif
1928ENDPROC iemAImpl_cmpxchg8b
1929
; Locked variant of iemAImpl_cmpxchg8b above; identical apart from the LOCK prefix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; rbx is callee-saved and cmpxchg8b needs ebx.

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; Load the ebx:ecx compare-exchange value.
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD        r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; Load the eax:edx expected value.
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; Return the actual eax:edx back to the caller.
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS_OLD              r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; rbx is callee-saved and cmpxchg8b needs ebx.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; Load the ebx:ecx compare-exchange value.
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD        r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; Load the eax:edx expected value.
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; Return the actual eax:edx back to the caller.
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS_OLD              r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit: all of esi/edi/ebx/ebp are callee-saved and needed below.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; Load ebx:ecx (ecx last as it holds the pointer).
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD        ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; Load the eax:edx expected value.
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; Return the actual eax:edx back to the caller.
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD              ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
2004
2005%ifdef RT_ARCH_AMD64
2006
2007;
2008; CMPXCHG16B.
2009;
2010; These are tricky register wise, so the code is duplicated for each calling
2011; convention.
2012;
2013; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2014;
2015; C-proto:
2016; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
2017; uint32_t *pEFlags));
2018;
2019; Note! Identical to iemAImpl_cmpxchg8b.
2020;
2021BEGINCODE
;;
; Emulates CMPXCHG16B (without a LOCK prefix; the locked variant is generated
; separately below).  Compares *pu128RaxRdx with *pu128Dst; on match stores
; *pu128RbxCx to *pu128Dst, otherwise loads *pu128Dst into *pu128RaxRdx.
; Only ZF is merged back into *pEFlags.
;
; NOTE(review): cmpxchg16b #GPs on a misaligned memory operand; pu128Dst is
; presumably 16-byte aligned by the caller - TODO confirm.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        push    rbx                     ; rbx is callee-saved but cmpxchg16b needs it.

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst - moved out of rcx since cmpxchg16b needs rcx.

        mov     rbx, [r8]               ; rcx:rbx = replacement value.
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = expected value.
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax              ; Write back the (possibly updated) accumulator.
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; GCC/SysV: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        push    rbx                     ; rbx is callee-saved but cmpxchg16b needs it.

        mov     r10, rcx                ; pEFlags - moved out of rcx since cmpxchg16b needs rcx.
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value.
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = expected value.
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax              ; Write back the (possibly updated) accumulator.
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
2066
;;
; Emulates CMPXCHG16B with a LOCK prefix (atomic).  Identical to
; iemAImpl_cmpxchg16b above except for the LOCK prefix on the instruction.
; Only ZF is merged back into *pEFlags.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; MSC: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        push    rbx                     ; rbx is callee-saved but cmpxchg16b needs it.

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst - moved out of rcx since cmpxchg16b needs rcx.

        mov     rbx, [r8]               ; rcx:rbx = replacement value.
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = expected value.
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; Write back the (possibly updated) accumulator.
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; GCC/SysV: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        push    rbx                     ; rbx is callee-saved but cmpxchg16b needs it.

        mov     r10, rcx                ; pEFlags - moved out of rcx since cmpxchg16b needs rcx.
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value.
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = expected value.
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; Write back the (possibly updated) accumulator.
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
2111
2112%endif ; RT_ARCH_AMD64
2113
2114
2115;
2116; CMPXCHG.
2117;
2118; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2119;
2120; C-proto:
2121; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
2122;
2123BEGINCODE
;;
; Generates the cmpxchg workers for 8/16/32/64-bit operands.
;
; @param 1      Instruction prefix ('lock' or empty).
; @param 2      Function name suffix ('_locked' or empty).
;
; CMPXCHG sets all status flags as if a CMP of the accumulator and the
; destination had been performed, plus ZF indicating success.
;
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = expected value.
        %1 cmpxchg [A0], A2_8           ; Match: *puXDst = uReg; mismatch: al = *puXDst.
        mov     [A1], al                ; Write back the (possibly updated) accumulator.
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; ax = expected value.
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; eax = expected value.
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; rax = expected value.
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value.
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value.
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only sets ZF, while cmpxchg must set all status flags as if
        ; the accumulator had been CMPed with the destination, so synthesize the
        ; CF, PF, AF, SF and OF here.
        ; Note! This was 'jz .cmpxchg8b_not_equal', which sent the mismatch case
        ; through the equal-compare shortcut below and thus saved ZF=1 (success)
        ; for a failed cmpxchg.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; Equal: just set the flags of an equal compare.
.store:
        mov     [esi], eax              ; Write back the (possibly updated) accumulator.
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ;; @todo FIXME - this approximates the 64-bit compare with two 32-bit
        ;; ones; ZF/CF are exact, but SF/OF/AF/PF come from whichever dword
        ;; compare executed last.
        cmp     [esi + 4], edx          ; Compare the high dwords first ([esi] still holds the original accumulator),
        jne     .store
        cmp     [esi], eax              ; then the low dwords when the high ones were equal.
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
2213
2214
2215
2216;;
2217; Macro for implementing a unary operator.
2218;
2219; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2220; variants, except on 32-bit system where the 64-bit accesses requires hand
2221; coding.
2222;
2223; All the functions takes a pointer to the destination memory operand in A0,
2224; the source register operand in A1 and a pointer to eflags in A2.
2225;
2226; @param 1 The instruction mnemonic.
2227; @param 2 The modified flags.
2228; @param 3 The undefined flags.
2229;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
;; Plain and _locked unary workers: A0 = pointer to operand, A1 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      byte [A0]               ; Apply the operator directly to memory.
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 byte [A0]               ; Atomic variant.
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64  ; 64-bit variants only on 64-bit hosts (see macro header note).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %1      qword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Note: 'not' modifies no flags, hence the empty masks.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
2304
2305
2306;
2307; BSWAP. No flag changes.
2308;
2309; Each function takes one argument, pointer to the value to bswap
2310; (input/output). They all return void.
2311;
;; Byte-swaps the 16-bit value pointed to by A0 (in place, no flags changed).
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db      66h                     ; Operand-size prefix: together with the next line this
        bswap   T0_32                   ; encodes the officially-undefined 16-bit BSWAP form,
                                        ; presumably to replicate real CPU behaviour - TODO confirm.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
2320
;; Byte-swaps the 32-bit value pointed to by A0 (in place, no flags changed).
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
2328
;; Byte-swaps the 64-bit value pointed to by A0 (in place, no flags changed).
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: bswap each dword and exchange the halves.
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        mov     T1, [A0 + 4]
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; Swapped low half goes to the high dword...
        mov     [A0], T1                ; ...and vice versa.
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
2347
2348
2349;;
2350; Macro for implementing a shift operation.
2351;
2352; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2353; 32-bit system where the 64-bit accesses requires hand coding.
2354;
2355; All the functions takes a pointer to the destination memory operand in A0,
2356; the shift count in A1 and a pointer to eflags in A2.
2357;
2358; @param 1 The instruction mnemonic.
2359; @param 2 The modified flags.
2360; @param 3 The undefined flags.
2361; @param 4 Force load flags.
2362;
2363; Makes ASSUMPTIONS about A0, A1 and A2 assignments. Specifically, that with
2364; GCC/64 we're free to use RCX/CL as it isn't used for any arguments. While
2365; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
2366; so we have to switch it around with the shift count parameter registers.
2367;
2368; @note the _intel and _amd variants are implemented in C.
2369;
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        ; GCC/SysV: rcx/cl is not used for any argument, so it is free for the count.
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      byte [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        ; MSC/32-bit fastcall: ecx carries fEFlagsIn (A0), so swap it with the count (A2) first.
        xchg    A2, A0
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      byte [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
.zero_shift:                            ; NOTE(review): no jump to this label is visible - appears vestigial.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      word [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; See the u8 variant for why.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      word [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      dword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; See the u8 variant for why.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      dword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      qword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; See the u8 variant for why.
        IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
        %1      qword [A1], cl
        IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2451
2452
2453;;
2454; Macro for implementing a double precision shift operation.
2455;
2456; This will generate code for the 16, 32 and 64 bit accesses, except on
2457; 32-bit system where the 64-bit accesses requires hand coding.
2458;
2459; The functions takes the destination operand (r/m) in A0, the source (reg) in
2460; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2461;
2462; @param 1 The instruction mnemonic.
2463; @param 2 The modified flags.
2464; @param 3 The undefined flags.
2465; @param 4 The force loaded flags.
2466;
2467; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2468;
2469; @note the _intel and _amd variants are implemented in C.
2470;
%macro IEMIMPL_SHIFT_DBL_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        ; GCC/SysV: the count (A2) is not in cl, so temporarily swap it in and back out.
        xchg    A3, A2
        %1      [A0], A1_16, cl
        xchg    A3, A2
 %else
        ; MSC/32-bit: A0 occupies ecx; swap the destination pointer out of it (count stays in cl).
        xchg    A0, A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; See the u16 variant for why.
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        ;IEM_LOAD_FLAGS_OLD A3, %4, %3
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; See the u16 variant for why.
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
; we have to force load all modified and undefined.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2530
2531
2532;;
2533; Macro for implementing a multiplication operations.
2534;
2535; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2536; 32-bit system where the 64-bit accesses requires hand coding.
2537;
2538; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2539; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2540; pointer to eflags in A3.
2541;
2542; The functions all return 0 so the caller can be used for div/idiv as well as
2543; for the mul/imul implementation.
2544;
2545; @param 1 The instruction mnemonic.
2546; @param 2 The modified flags.
2547; @param 3 The undefined flags.
2548; @param 4 Name suffix.
2549; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2550;
2551; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2552;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; Full 16-bit product/result lands in AX for the 8-bit form.
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
 %endif
        xor     eax, eax                ; Return 0 (shared return convention with the div workers).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx                ; High half of the result.
 %else
        mov     T1, A1                  ; Save A1 before %1 clobbers edx (A1 presumably aliases rdx here - see PROLOGUE_4_ARGS).
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; See the u16 variant for why.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; See the u16 variant for why.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2649
2650
2651BEGINCODE
2652;;
2653; Worker function for negating a 32-bit number in T1:T0
2654; @uses None (T0,T1)
2655BEGINPROC iemAImpl_negate_T0_T1_u32
2656 push 0
2657 push 0
2658 xchg T0_32, [xSP]
2659 xchg T1_32, [xSP + xCB]
2660 sub T0_32, [xSP]
2661 sbb T1_32, [xSP + xCB]
2662 add xSP, xCB*2
2663 ret
2664ENDPROC iemAImpl_negate_T0_T1_u32
2665
2666%ifdef RT_ARCH_AMD64
2667;;
2668; Worker function for negating a 64-bit number in T1:T0
2669; @uses None (T0,T1)
2670BEGINPROC iemAImpl_negate_T0_T1_u64
2671 push 0
2672 push 0
2673 xchg T0, [xSP]
2674 xchg T1, [xSP + xCB]
2675 sub T0, [xSP]
2676 sbb T1, [xSP + xCB]
2677 add xSP, xCB*2
2678 ret
2679ENDPROC iemAImpl_negate_T0_T1_u64
2680%endif
2681
2682
2683;;
2684; Macro for implementing a division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2691; pointer to eflags in A3.
2692;
2693; The functions all return 0 on success and -1 if a divide error should be
2694; raised by the caller.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Unsigned: overflow iff high(dividend) >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
        mov     ax, [A0]                ; AX = 16-bit dividend for the 8-bit form.
        %1      A1_8
        mov     [A0], ax                ; AL = quotient, AH = remainder.
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_16, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Unsigned: overflow iff high(dividend) >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor must survive while dx:ax are set up.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax                ; Quotient.
        mov     [A1], dx                ; Remainder.
 %else
        mov     T1, A1                  ; Save A1 before dx is clobbered (see the mul workers).
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Unsigned: overflow iff high(dividend) >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test    A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the (possibly negated-back) divisor pointer value.
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]               ; NOTE(review): redundant - both branches below reload eax.
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax               ; Quotient.
        mov     [A1], edx               ; Remainder.
 %else
        mov     T1, A1
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Undo the push done before the signed range check.
 %endif
.div_zero:
        mov     eax, -1                 ; Return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2                  ; div by zero check.
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Unsigned: overflow iff high(dividend) >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test    A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]               ; NOTE(review): redundant - both branches below reload rax.
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax               ; Quotient.
        mov     [A1], rdx               ; Remainder.
 %else
        mov     T1, A1
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
 %endif
        xor     eax, eax                ; Return 0 = success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Undo the push done before the signed range check.
 %endif
.div_zero:
        mov     eax, -1                 ; Return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
3039
3040
3041;;
3042; Macro for implementing memory fence operation.
3043;
3044; No return value, no operands or anything.
3045;
3046; @param 1 The instruction.
3047;
;; Emits iemAImpl_<fence> - a trivial wrapper that just executes the fence instruction.
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; The fence instruction itself.
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
3059
3060;;
3061; Alternative for non-SSE2 host.
3062;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Make a scratch stack slot holding xAX.
        xchg    xAX, [xSP]              ; XCHG with memory has an implicit LOCK -> acts as a full fence;
                                        ; xAX swaps with its own pushed copy, so it is unchanged.
        add     xSP, xCB                ; Drop the scratch slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
3069
3070
3071;;
3072; Initialize the FPU for the actual instruction being emulated, this means
3073; loading parts of the guest's control word and status word.
3074;
3075; @uses 24 bytes of stack. T0, T1
3076; @param 1 Expression giving the address of the FXSTATE of the guest.
3077;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; Capture the host FPU environment (X86FSTENV32P layout).

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; Only take the guest's mask/PC/RC bits.
        mov     [xSP + X86FSTENV32P.FCW], T0_16
                                        ; NOTE(review): this macro uses T0/T1 while the _AND_FTW_0
                                        ; variant below uses T0_32/T1_32 - same registers, presumably.

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK      ; Guest condition-code bits...
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK    ; ...merged with the host TOP field.
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
3096
3097
3098;;
3099; Initialize the FPU for the actual instruction being emulated, this means
3100; loading parts of the guest's control word, status word, and update the
3101; tag word for the top register if it's empty.
3102;
3103; ASSUMES actual TOP=7
3104;
3105; @uses 24 bytes of stack. T0, T1
3106; @param 1 Expression giving the address of the FXSTATE of the guest.
3107;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        ; Dump the current (host) FPU environment so it can be patched with
        ; guest FCW/FSW/FTW bits below and then reloaded.
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; Keep the actual TOP bits, merge in the guest's condition code bits.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        ; T1 = guest TOP; the FXSAVE-format FTW has one bit per register, so
        ; bit TOP tells whether the guest's ST0 is valid or empty.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the patched environment
%endmacro
3135
3136
3137;;
3138; Need to move this as well somewhere better?
3139;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 80-bit (10 byte) extended precision result.
    .FSW        resw 1                  ; Output FPU status word.
endstruc
3144
3145
3146;;
3147; Need to move this as well somewhere better?
3148;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit extended precision result.
    .FSW        resw 1                  ; Output FPU status word.
    .r80Result2 resw 5                  ; Second 80-bit extended precision result.
endstruc
3154
3155
3156;
3157;---------------------- 16-bit signed integer operations ----------------------
3158;
3159
3160
3161;;
; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
3167;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fild    word [A2]               ; ST0 = extended precision value of the i16 at A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
3184
3185
3186;;
3187; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
3188;
3189; @param A0 FPU context (fxsave).
3190; @param A1 Where to return the output FSW.
3191; @param A2 Where to store the 16-bit signed integer value.
3192; @param A3 Pointer to the 80-bit value.
3193;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding mode!) + sanitized FSW
        fistp   word [A2]               ; store as i16, rounding per guest FCW.RC

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
3209
3210
3211;;
3212; Store a 80-bit floating point value (register) as a 16-bit signed integer
3213; (memory) with truncation.
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 16-bit signed integer value.
3218; @param A3 Pointer to the 80-bit value.
3219;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fisttp  word [A2]               ; store as i16 with truncation (ignores FCW.RC)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
3235
3236
3237;;
3238; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3239;
3240; @param 1 The instruction
3241;
3242; @param A0 FPU context (fxsave).
3243; @param A1 Pointer to a IEMFPURESULT for the output.
3244; @param A2 Pointer to the 80-bit value.
3245; @param A3 Pointer to the 16-bit value.
3246;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      word [A3]               ; ST0 = ST0 <op> i16 at A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
3273
3274
3275;;
3276; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3277; only returning FSW.
3278;
3279; @param 1 The instruction
3280;
3281; @param A0 FPU context (fxsave).
3282; @param A1 Where to store the output FSW.
3283; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
3285;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      word [A3]               ; compare ST0 against the i16 at A3; result is in FSW only

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
3305
3306
3307
3308;
3309;---------------------- 32-bit signed integer operations ----------------------
3310;
3311
3312
3313;;
; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
3319;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fild    dword [A2]              ; ST0 = extended precision value of the i32 at A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
3336
3337
3338;;
3339; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
3340;
3341; @param A0 FPU context (fxsave).
3342; @param A1 Where to return the output FSW.
3343; @param A2 Where to store the 32-bit signed integer value.
3344; @param A3 Pointer to the 80-bit value.
3345;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding mode!) + sanitized FSW
        fistp   dword [A2]              ; store as i32, rounding per guest FCW.RC

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
3361
3362
3363;;
3364; Store a 80-bit floating point value (register) as a 32-bit signed integer
3365; (memory) with truncation.
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Where to return the output FSW.
3369; @param A2 Where to store the 32-bit signed integer value.
3370; @param A3 Pointer to the 80-bit value.
3371;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fisttp  dword [A2]              ; store as i32 with truncation (ignores FCW.RC)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
3387
3388
3389;;
3390; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3391;
3392; @param 1 The instruction
3393;
3394; @param A0 FPU context (fxsave).
3395; @param A1 Pointer to a IEMFPURESULT for the output.
3396; @param A2 Pointer to the 80-bit value.
3397; @param A3 Pointer to the 32-bit value.
3398;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      dword [A3]              ; ST0 = ST0 <op> i32 at A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
3425
3426
3427;;
3428; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3429; only returning FSW.
3430;
3431; @param 1 The instruction
3432;
3433; @param A0 FPU context (fxsave).
3434; @param A1 Where to store the output FSW.
3435; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
3437;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      dword [A3]              ; compare ST0 against the i32 at A3; result is in FSW only

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
3457
3458
3459
3460;
3461;---------------------- 64-bit signed integer operations ----------------------
3462;
3463
3464
3465;;
; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
3471;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fild    qword [A2]              ; ST0 = extended precision value of the i64 at A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
3488
3489
3490;;
3491; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3492;
3493; @param A0 FPU context (fxsave).
3494; @param A1 Where to return the output FSW.
3495; @param A2 Where to store the 64-bit signed integer value.
3496; @param A3 Pointer to the 80-bit value.
3497;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding mode!) + sanitized FSW
        fistp   qword [A2]              ; store as i64, rounding per guest FCW.RC

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3513
3514
3515;;
3516; Store a 80-bit floating point value (register) as a 64-bit signed integer
3517; (memory) with truncation.
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Where to return the output FSW.
3521; @param A2 Where to store the 64-bit signed integer value.
3522; @param A3 Pointer to the 80-bit value.
3523;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fisttp  qword [A2]              ; store as i64 with truncation (ignores FCW.RC)

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3539
3540
3541
3542;
3543;---------------------- 32-bit floating point operations ----------------------
3544;
3545
3546;;
3547; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3548;
3549; @param A0 FPU context (fxsave).
3550; @param A1 Pointer to a IEMFPURESULT for the output.
3551; @param A2 Pointer to the 32-bit floating point value to convert.
3552;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fld     dword [A2]              ; ST0 = extended precision value of the r32 at A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3569
3570
3571;;
3572; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3573;
3574; @param A0 FPU context (fxsave).
3575; @param A1 Where to return the output FSW.
3576; @param A2 Where to store the 32-bit value.
3577; @param A3 Pointer to the 80-bit value.
3578;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding/precision!) + sanitized FSW
        fst     dword [A2]              ; store as r32, rounding per guest FCW

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3594
3595
3596;;
3597; FPU instruction working on one 80-bit and one 32-bit floating point value.
3598;
3599; @param 1 The instruction
3600;
3601; @param A0 FPU context (fxsave).
3602; @param A1 Pointer to a IEMFPURESULT for the output.
3603; @param A2 Pointer to the 80-bit value.
3604; @param A3 Pointer to the 32-bit value.
3605;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      dword [A3]              ; ST0 = ST0 <op> r32 at A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3632
3633
3634;;
3635; FPU instruction working on one 80-bit and one 32-bit floating point value,
3636; only returning FSW.
3637;
3638; @param 1 The instruction
3639;
3640; @param A0 FPU context (fxsave).
3641; @param A1 Where to store the output FSW.
3642; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
3644;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      dword [A3]              ; compare ST0 against the r32 at A3; result is in FSW only

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3664
3665
3666
3667;
3668;---------------------- 64-bit floating point operations ----------------------
3669;
3670
3671;;
3672; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3673;
3674; @param A0 FPU context (fxsave).
3675; @param A1 Pointer to a IEMFPURESULT for the output.
3676; @param A2 Pointer to the 64-bit floating point value to convert.
3677;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fld     qword [A2]              ; ST0 = extended precision value of the r64 at A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3694
3695
3696;;
3697; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3698;
3699; @param A0 FPU context (fxsave).
3700; @param A1 Where to return the output FSW.
3701; @param A2 Where to store the 64-bit value.
3702; @param A3 Pointer to the 80-bit value.
3703;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding/precision!) + sanitized FSW
        fst     qword [A2]              ; store as r64, rounding per guest FCW

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3719
3720
3721;;
3722; FPU instruction working on one 80-bit and one 64-bit floating point value.
3723;
3724; @param 1 The instruction
3725;
3726; @param A0 FPU context (fxsave).
3727; @param A1 Pointer to a IEMFPURESULT for the output.
3728; @param A2 Pointer to the 80-bit value.
3729; @param A3 Pointer to the 64-bit value.
3730;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      qword [A3]              ; ST0 = ST0 <op> r64 at A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3757
3758;;
3759; FPU instruction working on one 80-bit and one 64-bit floating point value,
3760; only returning FSW.
3761;
3762; @param 1 The instruction
3763;
3764; @param A0 FPU context (fxsave).
3765; @param A1 Where to store the output FSW.
3766; @param A2 Pointer to the 80-bit value.
3767; @param A3 Pointer to the 64-bit value.
3768;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      qword [A3]              ; compare ST0 against the r64 at A3; result is in FSW only

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3788
3789
3790
3791;
3792;---------------------- 80-bit floating point operations ----------------------
3793;
3794
3795;;
3796; Loads a 80-bit floating point register value from memory.
3797;
3798; @param A0 FPU context (fxsave).
3799; @param A1 Pointer to a IEMFPURESULT for the output.
3800; @param A2 Pointer to the 80-bit floating point value to load.
3801;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fld     tword [A2]              ; ST0 = the 80-bit value at A2 (no conversion needed)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3818
3819
3820;;
3821; Store a 80-bit floating point register to memory
3822;
3823; @param A0 FPU context (fxsave).
3824; @param A1 Where to return the output FSW.
3825; @param A2 Where to store the 80-bit value.
3826; @param A3 Pointer to the 80-bit register value.
3827;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fstp    tword [A2]              ; store the full 80 bits back to memory

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3843
3844
3845;;
3846; Loads an 80-bit floating point register value in BCD format from memory.
3847;
3848; @param A0 FPU context (fxsave).
3849; @param A1 Pointer to a IEMFPURESULT for the output.
3850; @param A2 Pointer to the 80-bit BCD value to load.
3851;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fbld    tword [A2]              ; ST0 = value of the packed BCD number at A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3868
3869
3870;;
3871; Store a 80-bit floating point register to memory as BCD
3872;
3873; @param A0 FPU context (fxsave).
3874; @param A1 Where to return the output FSW.
3875; @param A2 Where to store the 80-bit BCD value.
3876; @param A3 Pointer to the 80-bit register value.
3877;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST0 = the 80-bit register value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        fbstp   tword [A2]              ; store as packed BCD and pop

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3893
3894
3895;;
3896; FPU instruction working on two 80-bit floating point values.
3897;
; @param 1 The instruction
; @param 2 The instruction operands, e.g. {st0, st1}, or {} when implicit.
3899;
3900; @param A0 FPU context (fxsave).
3901; @param A1 Pointer to a IEMFPURESULT for the output.
3902; @param A2 Pointer to the first 80-bit value (ST0)
3903; @param A3 Pointer to the second 80-bit value (STn).
3904;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST1 = second operand
        fld     tword [A2]              ; ST0 = first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      %2                      ; perform the operation with the given operands (may be empty)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3935
3936
3937;;
3938; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3939; storing the result in ST1 and popping the stack.
3940;
3941; @param 1 The instruction
3942;
3943; @param A0 FPU context (fxsave).
3944; @param A1 Pointer to a IEMFPURESULT for the output.
3945; @param A2 Pointer to the first 80-bit value (ST1).
3946; @param A3 Pointer to the second 80-bit value (ST0).
3947;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST1 = first operand (after the next load)
        fld     tword [A3]              ; ST0 = second operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1                              ; operates on ST1/ST0 and pops, leaving the result in ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3972
3973
3974;;
3975; FPU instruction working on two 80-bit floating point values, only
3976; returning FSW.
3977;
3978; @param 1 The instruction
3979;
3980; @param A0 FPU context (fxsave).
3981; @param A1 Pointer to a uint16_t for the resulting FSW.
3982; @param A2 Pointer to the first 80-bit value.
3983; @param A3 Pointer to the second 80-bit value.
3984;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST1 = second operand (after the next load)
        fld     tword [A2]              ; ST0 = first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      st0, st1                ; compare; result is in the FSW condition codes only

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
4006
4007
4008;;
4009; FPU instruction working on two 80-bit floating point values,
4010; returning FSW and EFLAGS (eax).
4011;
4012; @param 1 The instruction
4013;
4014; @returns EFLAGS in EAX.
4015; @param A0 FPU context (fxsave).
4016; @param A1 Pointer to a uint16_t for the resulting FSW.
4017; @param A2 Pointer to the first 80-bit value.
4018; @param A3 Pointer to the second 80-bit value.
4019;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A3]              ; ST1 = second operand (after the next load)
        fld     tword [A2]              ; ST0 = first operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1      st1                     ; fcomi/fucomi: compares ST0 with ST1, sets EFLAGS

        fnstsw  word [A1]               ; return the resulting status word
        pushf                           ; grab the EFLAGS set by the comparison ...
        pop     xAX                     ; ... and return them in xAX

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4043
4044
4045;;
4046; FPU instruction working on one 80-bit floating point value.
4047;
4048; @param 1 The instruction
4049;
4050; @param A0 FPU context (fxsave).
4051; @param A1 Pointer to a IEMFPURESULT for the output.
4052; @param A2 Pointer to the 80-bit value.
4053;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1                              ; unary operation on ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
4081
4082
4083;;
4084; FPU instruction working on one 80-bit floating point value, only
4085; returning FSW.
4086;
4087; @param 1 The instruction
4088; @param 2 Non-zero to also restore FTW.
4089;
4090; @param A0 FPU context (fxsave).
4091; @param A1 Pointer to a uint16_t for the resulting FSW.
4092; @param A2 Pointer to the 80-bit value.
4093;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the operand
%if %2 != 0
        ; fxam needs the guest's tag for ST0 too so it can report empty registers.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; examine/test ST0; result is in FSW only

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4118
4119
4120
4121;;
4122; FPU instruction loading a 80-bit floating point constant.
4123;
4124; @param 1 The instruction
4125;
4126; @param A0 FPU context (fxsave).
4127; @param A1 Pointer to a IEMFPURESULT for the output.
4128;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW (rounding/precision!) + sanitized FSW
        %1                              ; push the constant onto the stack (ST0)

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the store below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
4155
4156
4157;;
4158; FPU instruction working on one 80-bit floating point value, outputing two.
4159;
4160; @param 1 The instruction
4161;
4162; @param A0 FPU context (fxsave).
4163; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4164; @param A2 Pointer to the 80-bit value.
4165;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv done in the macro below

        fninit                          ; put the FPU into a known default state
        fld     tword [A2]              ; ST0 = the operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + sanitized FSW
        %1                              ; produces two results on the register stack

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW] ; capture the resulting status word
        fnclex                          ; clear pending exceptions so the stores below cannot fault
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack is the second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the FPU in a clean state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
4191
4192
4193
4194
4195;---------------------- SSE and MMX Operations ----------------------
4196
4197;; @todo what do we need to do for MMX?
; Currently empty placeholders so the media helpers below have well-defined
; hook points should host-state save/restore turn out to be needed.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
4214
4215
4216;;
4217; Media instruction working on two full sized registers.
4218;
4219; @param 1 The instruction
4220; @param 2 Whether there is an MMX variant (1) or not (0).
4221;
4222; @param A0 FPU context (fxsave).
4223; @param A1 Pointer to the first media register size operand (input/output).
4224; @param A2 Pointer to the second media register size operand (input).
4225;
4226; @todo r=aeichner Currently unused, can probably be removed.
4227;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; first operand (input/output)
        movq    mm1, [A2]               ; second operand (input)
        %1      mm0, mm1
        movq    [A1], mm0               ; write back the result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; first operand (input/output); unaligned load - no alignment assumption
        movdqu  xmm1, [A2]              ; second operand (input)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write back the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
4257
4258;;
4259; Media instruction working on two full sized registers, but no FXSAVE state argument.
4260;
4261; @param 1 The instruction
4262; @param 2 Whether there is an MMX variant (1) or not (0).
4263;
4264; @param A0 Pointer to the first media register size operand (input/output).
4265; @param A1 Pointer to the second media register size operand (input).
4266;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; first operand (input/output)
        movq    mm1, [A1]               ; second operand (input)
        %1      mm0, mm1
        movq    [A0], mm0               ; write back the result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; first operand (input/output); unaligned load - no alignment assumption
        movdqu  xmm1, [A1]              ; second operand (input)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write back the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
4296
; Instantiations: second parameter is 1 when an MMX (_u64) variant exists in
; addition to the SSE (_u128) one, 0 for SSE-only instructions.
IEMIMPL_MEDIA_OPT_F2 pshufb, 1
IEMIMPL_MEDIA_OPT_F2 pand, 1
IEMIMPL_MEDIA_OPT_F2 pandn, 1
IEMIMPL_MEDIA_OPT_F2 por, 1
IEMIMPL_MEDIA_OPT_F2 pxor, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
IEMIMPL_MEDIA_OPT_F2 paddb, 1
IEMIMPL_MEDIA_OPT_F2 paddw, 1
IEMIMPL_MEDIA_OPT_F2 paddd, 1
IEMIMPL_MEDIA_OPT_F2 paddq, 1
IEMIMPL_MEDIA_OPT_F2 paddsb, 1
IEMIMPL_MEDIA_OPT_F2 paddsw, 1
IEMIMPL_MEDIA_OPT_F2 paddusb, 1
IEMIMPL_MEDIA_OPT_F2 paddusw, 1
IEMIMPL_MEDIA_OPT_F2 psubb, 1
IEMIMPL_MEDIA_OPT_F2 psubw, 1
IEMIMPL_MEDIA_OPT_F2 psubd, 1
IEMIMPL_MEDIA_OPT_F2 psubq, 1
IEMIMPL_MEDIA_OPT_F2 psubsb, 1
IEMIMPL_MEDIA_OPT_F2 psubsw, 1
IEMIMPL_MEDIA_OPT_F2 psubusb, 1
IEMIMPL_MEDIA_OPT_F2 psubusw, 1
IEMIMPL_MEDIA_OPT_F2 pmullw, 1
IEMIMPL_MEDIA_OPT_F2 pmulld, 0
IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
IEMIMPL_MEDIA_OPT_F2 pminub, 1
IEMIMPL_MEDIA_OPT_F2 pminuw, 0
IEMIMPL_MEDIA_OPT_F2 pminud, 0
IEMIMPL_MEDIA_OPT_F2 pminsb, 0
IEMIMPL_MEDIA_OPT_F2 pminsw, 1
IEMIMPL_MEDIA_OPT_F2 pminsd, 0
IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
IEMIMPL_MEDIA_OPT_F2 pabsb, 1
IEMIMPL_MEDIA_OPT_F2 pabsw, 1
IEMIMPL_MEDIA_OPT_F2 pabsd, 1
IEMIMPL_MEDIA_OPT_F2 psignb, 1
IEMIMPL_MEDIA_OPT_F2 psignw, 1
IEMIMPL_MEDIA_OPT_F2 psignd, 1
IEMIMPL_MEDIA_OPT_F2 phaddw, 1
IEMIMPL_MEDIA_OPT_F2 phaddd, 1
IEMIMPL_MEDIA_OPT_F2 phsubw, 1
IEMIMPL_MEDIA_OPT_F2 phsubd, 1
IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4389
4390
4391;;
4392; Media instruction working on one full sized and one half sized register (lower half).
4393;
4394; @param 1 The instruction
4395; @param 2 1 if MMX is included, 0 if not.
4396;
4397; @param A0 Pointer to the first full sized media register operand (input/output).
4398; @param A1 Pointer to the second half sized media register operand (input).
4399;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit MMX variant: A0 = in/out register image, A1 = second operand image.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; load destination operand
        movq    mm1, [A1]               ; load source operand
        %1      mm0, mm1                ; perform the interleave/op
        movq    [A0], mm0               ; write result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE variant: same contract, using unaligned loads/stores since the
; register images need not be 16-byte aligned.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4434
4435
4436;;
; Media instruction working on two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
4439;
4440; @param 1 The instruction
4441;
4442; @param A0 Pointer to the destination register (full sized, output only).
4443; @param A1 Pointer to the first full sized media source register operand, where we
4444; will only use the lower half as input - but we'll be loading it in full.
4445; @param A2 Pointer to the second full sized media source register operand, where we
4446; will only use the lower half as input - but we'll be loading it in full.
4447;
%macro IEMIMPL_MEDIA_F1L1L1 1
; 128-bit variant: A0 = destination image, A1/A2 = source images (low halves used).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE; must undo the
                                        ; prologue here like all the other helpers in this file do.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant: identical shape, operating on the full YMM images.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4480
4481
4482;;
4483; Media instruction working on one full sized and one half sized register (high half).
4484;
4485; @param 1 The instruction
4486; @param 2 1 if MMX is included, 0 if not.
4487;
4488; @param A0 Pointer to the first full sized media register operand (input/output).
4489; @param A1 Pointer to the second full sized media register operand, where we
4490; will only use the upper half as input - but we'll load it in full.
4491;
; High-half unpack helpers have the exact same register plumbing as the
; low-half ones, so this simply forwards to IEMIMPL_MEDIA_F1L1.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; NOTE(review): these invoke IEMIMPL_MEDIA_F1L1 directly rather than the
; IEMIMPL_MEDIA_F1H1 alias defined just above; the expansion is identical.
IEMIMPL_MEDIA_F1L1 punpckhbw, 1
IEMIMPL_MEDIA_F1L1 punpckhwd, 1
IEMIMPL_MEDIA_F1L1 punpckhdq, 1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4500
4501
4502;;
; Media instruction working on two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
4505;
4506; @param 1 The instruction
4507;
4508; @param A0 Pointer to the destination register (full sized, output only).
4509; @param A1 Pointer to the first full sized media source register operand, where we
4510; will only use the upper half as input - but we'll be loading it in full.
4511; @param A2 Pointer to the second full sized media source register operand, where we
4512; will only use the upper half as input - but we'll be loading it in full.
4513;
; High-half AVX unpack helpers share the plumbing of the low-half template;
; only the instruction mnemonic differs, so forward to IEMIMPL_MEDIA_F1L1L1.
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4522
4523
4524;
4525; Shufflers with evil 8-bit immediates.
4526;
4527
;;
; PSHUFW with an 8-bit immediate, dispatched via a 256-entry jump table
; (one stub per immediate value, each 2^5 bytes apart).
;
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source operand (input).
; @param A2     The 8-bit immediate (only the low byte is used).
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! Fixed: was the no-op 'movq mm0, mm0'; the SSE
                                        ; variants init the destination from the source the same way.
                                        ; Harmless either way since pshufw fully writes mm0 below.
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
    %assign bImm bImm + 1
%endrep
.immEnd:
ENDPROC iemAImpl_pshufw_u64
4550
4551
;;
; SSE shuffles with an 8-bit immediate, dispatched via a 256-entry jump
; table (one stub per immediate, each 2^6 bytes apart).
;
; @param 1      The instruction (pshufhw/pshuflw/pshufd).
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source operand (input).
; @param A2     The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia! (pshufhw/pshuflw only modify half of xmm0)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
    %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4581
4582
;;
; 256-bit AVX shuffles with an 8-bit immediate, jump-table dispatched
; (one stub per immediate, each 2^6 bytes apart).
;
; @param 1      The instruction (vpshufhw/vpshuflw/vpshufd).
; @param A0     Pointer to the destination (output).
; @param A1     Pointer to the source operand (input).
; @param A2     The 8-bit immediate (only the low byte is used).
;
; NOTE(review): uses IEMIMPL_SSE_PROLOGUE/EPILOGUE although the body is AVX;
; presumably equivalent here - confirm against the macro definitions.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia! (vpshufhw/vpshuflw only modify half of ymm0)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        vmovdqu [A0], ymm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
    %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4611
4612
4613;
4614; Shifts with evil 8-bit immediates.
4615;
4616
;;
; MMX shifts with an 8-bit immediate shift count, jump-table dispatched
; (one stub per immediate, each 2^5 bytes apart).
;
; @param 1      The shift instruction.
; @param A0     Pointer to the operand to shift (input/output).
; @param A1     The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
    %assign bImm bImm + 1
%endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4649
4650
;;
; SSE shifts with an 8-bit immediate shift count, jump-table dispatched
; (one stub per immediate, each 2^6 bytes apart).
;
; @param 1      The shift instruction.
; @param A0     Pointer to the operand to shift (input/output).
; @param A1     The 8-bit immediate (only the low byte is used).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
    %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4685
4686
4687;
4688; Move byte mask.
4689;
4690
;;
; PMOVMSKB on an MMX register image.
; @param A0     Pointer to the (register sized) destination for the mask (output).
; @param A1     Pointer to the 64-bit source operand (input).
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits wide on x86; zero the upper dword explicitly
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4704
;;
; PMOVMSKB on an XMM register image.
; @param A0     Pointer to the (register sized) destination for the mask (output).
; @param A1     Pointer to the 128-bit source operand (input).
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits wide on x86; zero the upper dword explicitly
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4718
;;
; VPMOVMSKB on a YMM register image.
; @param A0     Pointer to the (register sized) destination for the mask (output).
; @param A1     Pointer to the 256-bit source operand (input).
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits wide on x86; zero the upper dword explicitly
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4732
4733
4734;;
4735; Media instruction working on two full sized source registers and one destination (AVX).
4736;
4737; @param 1 The instruction
4738;
4739; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4740; @param A1 Pointer to the destination media register size operand (output).
4741; @param A2 Pointer to the first source media register size operand (input).
4742; @param A3 Pointer to the second source media register size operand (input).
4743;
4744; @todo r=aeichner Not used right now
4745;
%macro IEMIMPL_MEDIA_F3 1
; 128-bit variant: A0 = XSAVE area (unused), A1 = dest, A2/A3 = sources.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE; must undo the
                                        ; prologue here like the other helpers in this file do.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant: identical shape, operating on the full YMM images.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4773
4774;;
4775; Media instruction working on two full sized source registers and one destination (AVX),
4776; but no XSAVE state pointer argument.
4777;
4778; @param 1 The instruction
4779; @param 2 Flag whether to add a 256-bit variant (1) or not (0).
4780;
4781; @param A0 Pointer to the destination media register size operand (output).
4782; @param A1 Pointer to the first source media register size operand (input).
4783; @param A2 Pointer to the second source media register size operand (input).
4784;
%macro IEMIMPL_MEDIA_OPT_F3 2
; 128-bit variant: A0 = destination image, A1/A2 = source images.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE; must undo the
                                        ; prologue here like the other helpers in this file do.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
; Optional 256-bit (AVX2) variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
4814
; Instantiations of the three-operand AVX media helper template.
; Second parameter: 1 = also emit the 256-bit (AVX2) variant, 0 = 128-bit only.
IEMIMPL_MEDIA_OPT_F3 vpshufb, 1
IEMIMPL_MEDIA_OPT_F3 vpand, 1
IEMIMPL_MEDIA_OPT_F3 vpminub, 1
IEMIMPL_MEDIA_OPT_F3 vpminuw, 1
IEMIMPL_MEDIA_OPT_F3 vpminud, 1
IEMIMPL_MEDIA_OPT_F3 vpminsb, 1
IEMIMPL_MEDIA_OPT_F3 vpminsw, 1
IEMIMPL_MEDIA_OPT_F3 vpminsd, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxub, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxuw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxud, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsb, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaxsd, 1
IEMIMPL_MEDIA_OPT_F3 vpandn, 1
IEMIMPL_MEDIA_OPT_F3 vpor, 1
IEMIMPL_MEDIA_OPT_F3 vpxor, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqb, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqw, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqd, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqq, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtb, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtw, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtd, 1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtq, 1
IEMIMPL_MEDIA_OPT_F3 vpaddb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddd, 1
IEMIMPL_MEDIA_OPT_F3 vpaddq, 1
IEMIMPL_MEDIA_OPT_F3 vpsubb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubw, 1
IEMIMPL_MEDIA_OPT_F3 vpsubd, 1
IEMIMPL_MEDIA_OPT_F3 vpsubq, 1
IEMIMPL_MEDIA_OPT_F3 vpacksswb, 1
IEMIMPL_MEDIA_OPT_F3 vpackssdw, 1
IEMIMPL_MEDIA_OPT_F3 vpackuswb, 1
IEMIMPL_MEDIA_OPT_F3 vpackusdw, 1
IEMIMPL_MEDIA_OPT_F3 vpmullw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulld, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhuw, 1
IEMIMPL_MEDIA_OPT_F3 vpavgb, 1
IEMIMPL_MEDIA_OPT_F3 vpavgw, 1
IEMIMPL_MEDIA_OPT_F3 vpsignb, 1
IEMIMPL_MEDIA_OPT_F3 vpsignw, 1
IEMIMPL_MEDIA_OPT_F3 vpsignd, 1
IEMIMPL_MEDIA_OPT_F3 vphaddw, 1
IEMIMPL_MEDIA_OPT_F3 vphaddd, 1
IEMIMPL_MEDIA_OPT_F3 vphsubw, 1
IEMIMPL_MEDIA_OPT_F3 vphsubd, 1
IEMIMPL_MEDIA_OPT_F3 vphaddsw, 1
IEMIMPL_MEDIA_OPT_F3 vphsubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw, 1
IEMIMPL_MEDIA_OPT_F3 vpsadbw, 1
IEMIMPL_MEDIA_OPT_F3 vpmuldq, 1
IEMIMPL_MEDIA_OPT_F3 vpmuludq, 1
IEMIMPL_MEDIA_OPT_F3 vunpcklps, 1
IEMIMPL_MEDIA_OPT_F3 vunpcklpd, 1
IEMIMPL_MEDIA_OPT_F3 vunpckhps, 1
IEMIMPL_MEDIA_OPT_F3 vunpckhpd, 1
IEMIMPL_MEDIA_OPT_F3 vpsubsb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubsw, 1
IEMIMPL_MEDIA_OPT_F3 vpsubusb, 1
IEMIMPL_MEDIA_OPT_F3 vpsubusw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddusb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddusw, 1
IEMIMPL_MEDIA_OPT_F3 vpaddsb, 1
IEMIMPL_MEDIA_OPT_F3 vpaddsw, 1
IEMIMPL_MEDIA_OPT_F3 vpermilps, 1
IEMIMPL_MEDIA_OPT_F3 vpermilpd, 1
IEMIMPL_MEDIA_OPT_F3 vpmaddwd, 1
IEMIMPL_MEDIA_OPT_F3 vpsrlvd, 1
IEMIMPL_MEDIA_OPT_F3 vpsrlvq, 1
IEMIMPL_MEDIA_OPT_F3 vpsravd, 1
IEMIMPL_MEDIA_OPT_F3 vpsllvd, 1
IEMIMPL_MEDIA_OPT_F3 vpsllvq, 1

; VEX AES helpers only exist in 128-bit form (no VAES 256-bit here).
IEMIMPL_MEDIA_OPT_F3 vaesenc, 0
IEMIMPL_MEDIA_OPT_F3 vaesenclast, 0
IEMIMPL_MEDIA_OPT_F3 vaesdec, 0
IEMIMPL_MEDIA_OPT_F3 vaesdeclast, 0
4897
4898
4899;;
4900; VAESIMC instruction.
4901;
4902; @param A0 Pointer to the first media register size operand (output).
4903; @param A1 Pointer to the second media register size operand (input).
4904;
BEGINPROC_FASTCALL iemAImpl_vaesimc_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE            ; NOTE(review): SSE prologue around an AVX instruction;
                                        ; presumably equivalent here - confirm against the macro defs.

        movdqu  xmm0, [A0]              ; NOTE(review): dead load - vaesimc below fully overwrites xmm0
        movdqu  xmm1, [A1]
        vaesimc xmm0, xmm1              ; xmm0 = InvMixColumns(xmm1)
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vaesimc_u128
4917
4918
4919;;
4920; VAESKEYGENASSIST instruction.
4921;
4922; @param A0 Pointer to the first media register size operand (output).
4923; @param A1 Pointer to the second media register size operand (input).
4924; @param A2 8-bit immediate for the round constant.
4925;
; Jump-table dispatched on the 8-bit round constant (stubs 2^8 bytes apart).
BEGINPROC_FASTCALL iemAImpl_vaeskeygenassist_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; NOTE(review): dead load - the stub fully overwrites xmm0
        movdqu  xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vaeskeygenassist xmm0, xmm1, bImm
        ret
        int3                            ; paranoia: should never fall through the ret
    %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vaeskeygenassist_u128
4949
4950
4951;;
4952; VPERMQ instruction.
4953;
4954; @param A0 Pointer to the first media register size operand (output).
4955; @param A1 Pointer to the second media register size operand (input).
; @param A2 8-bit immediate selecting the source qword for each destination qword.
4957;
; Jump-table dispatched on the 8-bit permute control (stubs 2^8 bytes apart).
BEGINPROC_FASTCALL iemAImpl_vpermq_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], ymm0              ; ymm0 was fully written by the stub

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermq  ymm0, ymm1, bImm
        ret
        int3                            ; paranoia: should never fall through the ret
    %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermq_u256
4980
4981
4982;;
4983; VPERMPD instruction.
4984;
4985; @param A0 Pointer to the first media register size operand (output).
4986; @param A1 Pointer to the second media register size operand (input).
; @param A2 8-bit immediate selecting the source qword for each destination qword.
4988;
; Jump-table dispatched on the 8-bit permute control (stubs 2^8 bytes apart).
BEGINPROC_FASTCALL iemAImpl_vpermpd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], ymm0              ; ymm0 was fully written by the stub

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermpd ymm0, ymm1, bImm
        ret
        int3                            ; paranoia: should never fall through the ret
    %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermpd_u256
5011
5012
5013;;
5014; VPERMPS instruction.
5015;
5016; @param A0 Pointer to the first media register size operand (output).
5017; @param A1 Pointer to the second media register size operand (input).
5018; @param A2 Pointer to the third media register size operand (input).
5019;
BEGINPROC_FASTCALL iemAImpl_vpermps_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; index/control operand
        vmovdqu ymm1, [A2]              ; source operand
        vpermps ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermps_u256
5032
5033
5034;;
5035; VPERMD instruction.
5036;
5037; @param A0 Pointer to the first media register size operand (output).
5038; @param A1 Pointer to the second media register size operand (input).
5039; @param A2 Pointer to the third media register size operand (input).
5040;
BEGINPROC_FASTCALL iemAImpl_vpermd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; index/control operand
        vmovdqu ymm1, [A2]              ; source operand
        vpermd  ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermd_u256
5053
5054
5055;;
5056; Media instruction working on one full sized source register, one full sized destination
5057; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
5058; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
5059; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
5060; of either 16, 32, or 64, it acts like the max shift size)
5061;
5062; @param 1 The instruction
5063;
5064; @param A0 Pointer to the destination media register size operand (output).
5065; @param A1 Pointer to the first source media register size operand (input).
5066; @param A2 Pointer to the second source media register size operand (input).
5067;
%macro IEMIMPL_SHIFT_OPT_F3 1
; 128-bit variant: A0 = destination, A1 = value to shift, A2 = 128-bit shift count operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE; must undo the
                                        ; prologue here like the other helpers in this file do.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant: note the shift count stays an XMM operand per the ISA.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu xmm1, [A2]
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
5104
5105
5106;;
5107; Media instruction working on one full sized source registers and one destination (AVX),
5108; but no XSAVE state pointer argument.
5109;
5110; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
5112;
5113; @param A0 Pointer to the destination media register size operand (output).
5114; @param A1 Pointer to the source media register size operand (input).
5115;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
; 128-bit variant: A0 = destination image, A1 = source image.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE; must undo the
                                        ; prologue here like the other helpers in this file do.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
; Optional 256-bit (AVX2) variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
5148
5149
5150;
5151; The SSE 4.2 crc32
5152;
; @param A0 Pointer to the 32-bit destination.
; @param A1 The source operand, sized according to the suffix.
5155;
; CRC32 accumulate an 8-bit value: *A0 = crc32(*A0, A1_8).
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load running CRC
        crc32   T0_32, A1_8
        mov     [A0], T0_32             ; store updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
5165
; CRC32 accumulate a 16-bit value: *A0 = crc32(*A0, A1_16).
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load running CRC
        crc32   T0_32, A1_16
        mov     [A0], T0_32             ; store updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
5175
; CRC32 accumulate a 32-bit value: *A0 = crc32(*A0, A1_32).
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load running CRC
        crc32   T0_32, A1_32
        mov     [A0], T0_32             ; store updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
5185
%ifdef RT_ARCH_AMD64
; CRC32 accumulate a 64-bit value (64-bit hosts only); the CRC result is
; always 32 bits wide, hence only T0_32 is stored back.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load running CRC (zero-extends into T0)
        crc32   T0, A1
        mov     [A0], T0_32             ; store updated 32-bit CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
5197
5198
5199;
5200; PTEST (SSE 4.1)
5201;
5202; @param A0 Pointer to the first source operand (aka readonly destination).
5203; @param A1 Pointer to the second source operand.
5204; @param A2 Pointer to the EFLAGS register.
5205;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        ; Keep ZF/CF from ptest, force OF/AF/PF/SF to zero in the saved EFLAGS.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
5218
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE            ; NOTE(review): SSE prologue around an AVX instruction;
                                        ; presumably equivalent here - confirm against the macro defs.

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        ; Keep ZF/CF from vptest, force OF/AF/PF/SF to zero in the saved EFLAGS.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
5231
5232
5233;; Template for the vtestp{s,d} instructions
5234;
5235; @param 1 The instruction
5236;
5237; @param A0 Pointer to the first source operand (aka readonly destination).
5238; @param A1 Pointer to the second source operand.
5239; @param A2 Pointer to the EFLAGS register.
5240;
%macro IEMIMPL_VTESTP_S_D 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A0]
        vmovdqu xmm1, [A1]
        %1      xmm0, xmm1
        ; Keep ZF/CF from the test, force OF/AF/PF/SF to zero in the saved EFLAGS.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        %1      ymm0, ymm1
        ; Keep ZF/CF from the test, force OF/AF/PF/SF to zero in the saved EFLAGS.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_VTESTP_S_D vtestps
IEMIMPL_VTESTP_S_D vtestpd
5271
5272
5273;;
5274; Template for the [v]pmov{s,z}x* instructions
5275;
5276; @param 1 The instruction
5277;
5278; @param A0 Pointer to the destination media register size operand (output).
5279; @param A1 The source operand value (input).
5280;
%macro IEMIMPL_V_PMOV_SZ_X 1
; SSE variant: A0 = destination image, A1 = source value (passed by value).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_SSE_PROLOGUE; must undo the
                                        ; prologue here like the other helpers in this file do.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; VEX.128 variant.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

; VEX.256 variant: A1 is a pointer here (128-bit source), not a value.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
5332
5333
5334;;
5335; Initialize the SSE MXCSR register using the guest value partially to
5336; account for rounding mode, load the value from the given register.
5337;
5338; @uses 4 bytes of stack to save the original value, T0.
5339; @param 1 Expression giving the register holding the guest's MXCSR.
5340;
%macro SSE_AVX_LD_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR at [xSP]; SSE_AVX_ST_MXCSR restores it
                                        ; and pops this slot - the stack stays 4 bytes down until then.
        mov     T0_32, %1
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/RC/DAZ from the guest
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never traps
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged control value
        add     xSP, 4
%endmacro
5353
5354
5355;;
5356; Restores the SSE MXCSR register with the original value.
5357;
5358; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5359; @param 1 Expression giving the register to return the new guest's MXCSR value.
5360; @param 2 Expression giving the register holding original guest's MXCSR value.
5361;
5362; @note Restores the stack pointer.
5363;
%macro SSE_AVX_ST_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; read the MXCSR produced by the emulated operation
        mov     %1, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        and     %1, X86_MXCSR_XCPT_FLAGS
        or      %1, %2

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_AVX_LD_MXCSR ...
        add     xSP, 4                  ; ... and pop its stack slot (balances LD_MXCSR's first sub)
%endmacro
5376
5377
5378;;
5379; Floating point instruction working on two full sized registers.
5380;
5381; @param 1 The instruction
5382; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
5383;
5384; @returns R0_32 The new MXCSR value of the guest.
5385; @param A0 The guest's MXCSR register value to use.
5386; @param A1 Where to return the result.
5387; @param A2 Pointer to the first media register size operand (input/output).
5388; @param A3 Pointer to the second media register size operand (input).
5389;
%macro IEMIMPL_FP_F2 2
; SSE variant: A0 = guest MXCSR, A1 = result ptr, A2 = first operand ptr, A3 = second operand ptr.
; @todo r=review: arg-byte count is 12 but PROLOGUE_4_ARGS implies 16 (cf. IEMIMPL_FP_F2_R32);
;       left unchanged as it affects decorated symbol names.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_SSE_PROLOGUE; must undo the
                                        ; prologue here like the other helpers in this file do.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
; Three-operand AVX forms (vaddps xmm0, xmm0, xmm1 etc.).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
; Two-operand AVX forms (vsqrtps xmm0, xmm1 etc.).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; Fixed copy&paste: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
5468
; Packed FP binary operations; second parameter 3 = three-operand AVX forms.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.  Second parameter 2 = two-operand AVX forms.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
5502
5503
5504;;
5505; Floating point instruction working on a full sized register and a single precision operand.
5506;
5507; @param 1 The instruction
5508;
5509; @return R0_32 The new MXCSR value of the guest.
5510; @param A0 The guest's MXCSR register value to use.
5511; @param A1 Where to return the result.
5512; @param A2 Pointer to the first media register size operand (input/output).
5513; @param A3 Pointer to the second single precision floating point value (input).
5514;
5515%macro IEMIMPL_FP_F2_R32 1
5516BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
5517 PROLOGUE_4_ARGS
5518 IEMIMPL_SSE_PROLOGUE
5519 SSE_AVX_LD_MXCSR A0_32
5520
5521 movdqu xmm0, [A2]
5522 movd xmm1, [A3]
5523 %1 xmm0, xmm1
5524 movdqu [A1], xmm0
5525
5526 SSE_AVX_ST_MXCSR R0_32, A0_32
5527 IEMIMPL_SSE_EPILOGUE
5528 EPILOGUE_4_ARGS
5529ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
5530
5531BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
5532 PROLOGUE_4_ARGS
5533 IEMIMPL_AVX_PROLOGUE
5534 SSE_AVX_LD_MXCSR A0_32
5535
5536 vmovdqu xmm0, [A2]
5537 vmovd xmm1, [A3]
5538 v %+ %1 xmm0, xmm0, xmm1
5539 vmovdqu [A1], xmm0
5540
5541 SSE_AVX_ST_MXCSR R0_32, A0_32
5542 IEMIMPL_AVX_PROLOGUE
5543 EPILOGUE_4_ARGS
5544ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
5545%endmacro
5546
5547IEMIMPL_FP_F2_R32 addss
5548IEMIMPL_FP_F2_R32 mulss
5549IEMIMPL_FP_F2_R32 subss
5550IEMIMPL_FP_F2_R32 minss
5551IEMIMPL_FP_F2_R32 divss
5552IEMIMPL_FP_F2_R32 maxss
5553IEMIMPL_FP_F2_R32 cvtss2sd
5554IEMIMPL_FP_F2_R32 sqrtss
5555IEMIMPL_FP_F2_R32 rsqrtss
5556IEMIMPL_FP_F2_R32 rcpss
5557
5558
5559;;
5560; Floating point instruction working on a full sized register and a double precision operand.
5561;
5562; @param 1 The instruction
5563;
5564; @return R0_32 The new MXCSR value of the guest.
5565; @param A0 The guest's MXCSR register value to use.
5566; @param A1 Where to return the result.
5567; @param A2 Pointer to the first media register size operand (input/output).
5568; @param A3 Pointer to the second double precision floating point value (input).
5569;
5570%macro IEMIMPL_FP_F2_R64 1
5571BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
5572 PROLOGUE_4_ARGS
5573 IEMIMPL_SSE_PROLOGUE
5574 SSE_AVX_LD_MXCSR A0_32
5575
5576 movdqu xmm0, [A2]
5577 movq xmm1, [A3]
5578 %1 xmm0, xmm1
5579 movdqu [A1], xmm0
5580
5581 SSE_AVX_ST_MXCSR R0_32, A0_32
5582 IEMIMPL_SSE_EPILOGUE
5583 EPILOGUE_4_ARGS
5584ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
5585
5586BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
5587 PROLOGUE_4_ARGS
5588 IEMIMPL_AVX_PROLOGUE
5589 SSE_AVX_LD_MXCSR A0_32
5590
5591 vmovdqu xmm0, [A2]
5592 vmovq xmm1, [A3]
5593 v %+ %1 xmm0, xmm0, xmm1
5594 vmovdqu [A1], xmm0
5595
5596 SSE_AVX_ST_MXCSR R0_32, A0_32
5597 IEMIMPL_AVX_EPILOGUE
5598 EPILOGUE_4_ARGS
5599ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
5600%endmacro
5601
5602IEMIMPL_FP_F2_R64 addsd
5603IEMIMPL_FP_F2_R64 mulsd
5604IEMIMPL_FP_F2_R64 subsd
5605IEMIMPL_FP_F2_R64 minsd
5606IEMIMPL_FP_F2_R64 divsd
5607IEMIMPL_FP_F2_R64 maxsd
5608IEMIMPL_FP_F2_R64 cvtsd2ss
5609IEMIMPL_FP_F2_R64 sqrtsd
5610
5611
5612;;
5613; Macro for the cvtpd2ps/cvtps2pd instructions.
5614;
5615; 1 The instruction name.
5616; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
5617;
5618; @return R0_32 The new MXCSR value of the guest.
5619; @param A0_32 The guest's MXCSR register value to use.
5620; @param A1 Where to return the result.
5621; @param A2 Pointer to the first media register size operand (input/output).
5622; @param A3 Pointer to the second media register size operand (input).
5623;
5624%macro IEMIMPL_CVT_F2 2
5625BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5626 PROLOGUE_4_ARGS
5627 IEMIMPL_SSE_PROLOGUE
5628 SSE_AVX_LD_MXCSR A0_32
5629
5630 movdqu xmm0, [A2]
5631 movdqu xmm1, [A3]
5632 %1 xmm0, xmm1
5633 movdqu [A1], xmm0
5634
5635 SSE_AVX_ST_MXCSR R0_32, A0_32
5636 IEMIMPL_SSE_EPILOGUE
5637 EPILOGUE_4_ARGS
5638ENDPROC iemAImpl_ %+ %1 %+ _u128
5639
5640BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u128, 16
5641 PROLOGUE_4_ARGS
5642 IEMIMPL_AVX_PROLOGUE
5643 SSE_AVX_LD_MXCSR A0_32
5644
5645 vmovdqu xmm1, [A2]
5646 v %+ %1 xmm0, xmm1
5647 vmovdqu [A1], xmm0
5648
5649 SSE_AVX_ST_MXCSR R0_32, A0_32
5650 IEMIMPL_AVX_EPILOGUE
5651 EPILOGUE_4_ARGS
5652ENDPROC iemAImpl_v %+ %1 %+ _u128_u128
5653
5654BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u256, 16
5655 PROLOGUE_4_ARGS
5656 IEMIMPL_AVX_PROLOGUE
5657 SSE_AVX_LD_MXCSR A0_32
5658
5659 vmovdqu xmm1, [A2]
5660 %if %2 == 0
5661 v %+ %1 xmm0, xmm1
5662 %else
5663 v %+ %1 ymm0, xmm1
5664 %endif
5665 vmovdqu [A1], ymm0
5666
5667 SSE_AVX_ST_MXCSR R0_32, A0_32
5668 IEMIMPL_AVX_EPILOGUE
5669 EPILOGUE_4_ARGS
5670ENDPROC iemAImpl_v %+ %1 %+ _u128_u256
5671%endmacro
5672
5673IEMIMPL_CVT_F2 cvtpd2ps, 0
5674IEMIMPL_CVT_F2 cvttpd2dq, 0
5675IEMIMPL_CVT_F2 cvtpd2dq, 0
5676
5677;IEMIMPL_CVT_F2 cvtps2pd, 1 - inefficient.
5678
;;
; cvtps2pd instruction - 128-bit SSE variant (memory source form).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtps2pd_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtps2pd xmm0, [A2]             ; converts the two low packed singles to two doubles
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtps2pd_u128
5691
5692
5693;;
5694; vcvtps2pd instruction - 128-bit variant.
5695;
5696; @return R0_32 The new MXCSR value of the guest.
5697; @param A0_32 The guest's MXCSR register value to use.
5698; @param A1 Pointer to the result operand (output).
5699; @param A2 Pointer to the second operand (input).
5700;
5701BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u128_u64, 16
5702 PROLOGUE_3_ARGS
5703 IEMIMPL_AVX_PROLOGUE
5704 SSE_AVX_LD_MXCSR A0_32
5705
5706 vcvtps2pd xmm0, qword [A2]
5707 movdqu [A1], xmm0
5708
5709 SSE_AVX_ST_MXCSR R0_32, A0_32
5710 IEMIMPL_AVX_EPILOGUE
5711 EPILOGUE_3_ARGS
5712ENDPROC iemAImpl_vcvtps2pd_u128_u64
5713
5714
5715;;
5716; vcvtps2pd instruction - 256-bit variant.
5717;
5718; @return R0_32 The new MXCSR value of the guest.
5719; @param A0_32 The guest's MXCSR register value to use.
5720; @param A1 Pointer to the result operand (output).
5721; @param A2 Pointer to the second operand (input).
5722;
5723BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u256_u128, 16
5724 PROLOGUE_3_ARGS
5725 IEMIMPL_AVX_PROLOGUE
5726 SSE_AVX_LD_MXCSR A0_32
5727
5728 movdqu xmm0, [A2]
5729 vcvtps2pd ymm0, xmm1
5730 vmovdqu [A1], ymm0
5731
5732 SSE_AVX_ST_MXCSR R0_32, A0_32
5733 IEMIMPL_AVX_EPILOGUE
5734 EPILOGUE_3_ARGS
5735ENDPROC iemAImpl_vcvtps2pd_u256_u128
5736
5737
5738;;
5739; vcvtdq2pd instruction - 128-bit variant.
5740;
5741; @return R0_32 The new MXCSR value of the guest.
5742; @param A0_32 The guest's MXCSR register value to use.
5743; @param A1 Pointer to the result operand (output).
5744; @param A2 Pointer to the second operand (input).
5745;
5746BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u128_u64, 16
5747 PROLOGUE_3_ARGS
5748 IEMIMPL_AVX_PROLOGUE
5749 SSE_AVX_LD_MXCSR A0_32
5750
5751 vcvtdq2pd xmm0, qword [A2]
5752 movdqu [A1], xmm0
5753
5754 SSE_AVX_ST_MXCSR R0_32, A0_32
5755 IEMIMPL_AVX_EPILOGUE
5756 EPILOGUE_3_ARGS
5757ENDPROC iemAImpl_vcvtdq2pd_u128_u64
5758
5759
5760;;
5761; vcvtdq2pd instruction - 256-bit variant.
5762;
5763; @return R0_32 The new MXCSR value of the guest.
5764; @param A0_32 The guest's MXCSR register value to use.
5765; @param A1 Pointer to the result operand (output).
5766; @param A2 Pointer to the second operand (input).
5767;
5768BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u256_u128, 16
5769 PROLOGUE_3_ARGS
5770 IEMIMPL_AVX_PROLOGUE
5771 SSE_AVX_LD_MXCSR A0_32
5772
5773 movdqu xmm0, [A2]
5774 vcvtdq2pd ymm0, xmm1
5775 vmovdqu [A1], ymm0
5776
5777 SSE_AVX_ST_MXCSR R0_32, A0_32
5778 IEMIMPL_AVX_EPILOGUE
5779 EPILOGUE_3_ARGS
5780ENDPROC iemAImpl_vcvtdq2pd_u256_u128
5781
5782
5783;;
5784; shufps instructions with 8-bit immediates.
5785;
5786; @param A0 Pointer to the destination media register size operand (input/output).
5787; @param A1 Pointer to the first source media register size operand (input).
5788; @param A2 The 8-bit immediate
5789;
5790BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
5791 PROLOGUE_3_ARGS
5792 IEMIMPL_SSE_PROLOGUE
5793
5794 movzx A2, A2_8 ; must clear top bits
5795 movdqu xmm0, [A0]
5796 movdqu xmm1, [A1]
5797 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5798 movdqu [A0], xmm0
5799
5800 IEMIMPL_SSE_EPILOGUE
5801 EPILOGUE_3_ARGS
5802 %assign bImm 0
5803 %rep 256
5804.imm %+ bImm:
5805 IBT_ENDBRxx_WITHOUT_NOTRACK
5806 shufps xmm0, xmm1, bImm
5807 ret
5808 int3
5809 %assign bImm bImm + 1
5810 %endrep
5811.immEnd:
5812ENDPROC iemAImpl_shufps_u128
5813
5814
5815;;
5816; shufpd instruction with 8-bit immediates.
5817;
5818; @param A0 Pointer to the destination media register size operand (input/output).
5819; @param A1 Pointer to the first source media register size operand (input).
5820; @param A2 The 8-bit immediate
5821;
5822BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
5823 PROLOGUE_3_ARGS
5824 IEMIMPL_SSE_PROLOGUE
5825
5826 movzx A2, A2_8 ; must clear top bits
5827 movdqu xmm0, [A0]
5828 movdqu xmm1, [A1]
5829 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5830 movdqu [A0], xmm0
5831
5832 IEMIMPL_SSE_EPILOGUE
5833 EPILOGUE_3_ARGS
5834 %assign bImm 0
5835 %rep 256
5836.imm %+ bImm:
5837 IBT_ENDBRxx_WITHOUT_NOTRACK
5838 shufpd xmm0, xmm1, bImm
5839 ret
5840 %assign bImm bImm + 1
5841 %endrep
5842.immEnd:
5843ENDPROC iemAImpl_shufpd_u128
5844
5845
5846;;
5847; vshufp{s,d} instructions with 8-bit immediates.
5848;
5849; @param 1 The instruction name.
5850;
5851; @param A0 Pointer to the destination media register size operand (output).
5852; @param A1 Pointer to the first source media register size operand (input).
5853; @param A2 Pointer to the second source media register size operand (input).
5854; @param A3 The 8-bit immediate
5855;
5856%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5857BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5858 PROLOGUE_4_ARGS
5859 IEMIMPL_AVX_PROLOGUE
5860
5861 movzx A3, A3_8 ; must clear top bits
5862 movdqu xmm0, [A1]
5863 movdqu xmm1, [A2]
5864 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
5865 movdqu [A0], xmm0
5866
5867 IEMIMPL_AVX_EPILOGUE
5868 EPILOGUE_4_ARGS
5869 %assign bImm 0
5870 %rep 256
5871.imm %+ bImm:
5872 IBT_ENDBRxx_WITHOUT_NOTRACK
5873 %1 xmm0, xmm0, xmm1, bImm
5874 ret
5875 %assign bImm bImm + 1
5876 %endrep
5877.immEnd:
5878ENDPROC iemAImpl_ %+ %1 %+ _u128
5879
5880BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5881 PROLOGUE_4_ARGS
5882 IEMIMPL_AVX_PROLOGUE
5883
5884 movzx A3, A3_8 ; must clear top bits
5885 vmovdqu ymm0, [A1]
5886 vmovdqu ymm1, [A2]
5887 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
5888 vmovdqu [A0], ymm0
5889
5890 IEMIMPL_AVX_EPILOGUE
5891 EPILOGUE_4_ARGS
5892 %assign bImm 0
5893 %rep 256
5894.imm %+ bImm:
5895 IBT_ENDBRxx_WITHOUT_NOTRACK
5896 %1 ymm0, ymm0, ymm1, bImm
5897 ret
5898 %assign bImm bImm + 1
5899 %endrep
5900.immEnd:
5901ENDPROC iemAImpl_ %+ %1 %+ _u256
5902%endmacro
5903
5904IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5905IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5906
5907
5908;;
5909; One of the [p]blendv{b,ps,pd} variants
5910;
5911; @param 1 The instruction
5912;
5913; @param A0 Pointer to the first media register sized operand (input/output).
5914; @param A1 Pointer to the second media sized value (input).
5915; @param A2 Pointer to the media register sized mask value (input).
5916;
5917%macro IEMIMPL_P_BLEND 1
5918BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5919 PROLOGUE_3_ARGS
5920 IEMIMPL_SSE_PROLOGUE
5921
5922 movdqu xmm0, [A2] ; This is implicit
5923 movdqu xmm1, [A0]
5924 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5925 %1 xmm1, xmm2
5926 movdqu [A0], xmm1
5927
5928 IEMIMPL_SSE_PROLOGUE
5929 EPILOGUE_3_ARGS
5930ENDPROC iemAImpl_ %+ %1 %+ _u128
5931%endmacro
5932
5933IEMIMPL_P_BLEND pblendvb
5934IEMIMPL_P_BLEND blendvps
5935IEMIMPL_P_BLEND blendvpd
5936
5937
5938;;
5939; One of the v[p]blendv{b,ps,pd} variants
5940;
5941; @param 1 The instruction
5942;
5943; @param A0 Pointer to the first media register sized operand (output).
5944; @param A1 Pointer to the first media register sized operand (input).
5945; @param A2 Pointer to the second media register sized operand (input).
5946; @param A3 Pointer to the media register sized mask value (input).
5947%macro IEMIMPL_AVX_P_BLEND 1
5948BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5949 PROLOGUE_4_ARGS
5950 IEMIMPL_AVX_PROLOGUE
5951
5952 vmovdqu xmm0, [A1]
5953 vmovdqu xmm1, [A2]
5954 vmovdqu xmm2, [A3]
5955 %1 xmm0, xmm0, xmm1, xmm2
5956 vmovdqu [A0], xmm0
5957
5958 IEMIMPL_AVX_PROLOGUE
5959 EPILOGUE_4_ARGS
5960ENDPROC iemAImpl_ %+ %1 %+ _u128
5961
5962BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5963 PROLOGUE_4_ARGS
5964 IEMIMPL_AVX_PROLOGUE
5965
5966 vmovdqu ymm0, [A1]
5967 vmovdqu ymm1, [A2]
5968 vmovdqu ymm2, [A3]
5969 %1 ymm0, ymm0, ymm1, ymm2
5970 vmovdqu [A0], ymm0
5971
5972 IEMIMPL_AVX_PROLOGUE
5973 EPILOGUE_4_ARGS
5974ENDPROC iemAImpl_ %+ %1 %+ _u256
5975%endmacro
5976
5977IEMIMPL_AVX_P_BLEND vpblendvb
5978IEMIMPL_AVX_P_BLEND vblendvps
5979IEMIMPL_AVX_P_BLEND vblendvpd
5980
5981
5982;;
5983; palignr mm1, mm2/m64 instruction.
5984;
5985; @param A0 Pointer to the first media register sized operand (output).
5986; @param A1 The second register sized operand (input).
5987; @param A2 The 8-bit immediate.
5988BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5989 PROLOGUE_3_ARGS
5990 IEMIMPL_MMX_PROLOGUE
5991
5992 movzx A2, A2_8 ; must clear top bits
5993 movq mm0, [A0]
5994 movq mm1, A1
5995 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
5996 movq [A0], mm0
5997
5998 IEMIMPL_MMX_EPILOGUE
5999 EPILOGUE_3_ARGS
6000 %assign bImm 0
6001 %rep 256
6002.imm %+ bImm:
6003 IBT_ENDBRxx_WITHOUT_NOTRACK
6004 palignr mm0, mm1, bImm
6005 ret
6006 %assign bImm bImm + 1
6007 %endrep
6008.immEnd:
6009ENDPROC iemAImpl_palignr_u64
6010
6011
6012;;
6013; SSE instructions with 8-bit immediates of the form
6014; xxx xmm1, xmm2, imm8.
6015; where the instruction encoding takes up 6 bytes.
6016;
6017; @param 1 The instruction name.
6018;
6019; @param A0 Pointer to the first media register size operand (input/output).
6020; @param A1 Pointer to the second source media register size operand (input).
6021; @param A2 The 8-bit immediate
6022;
6023%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
6024BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6025 PROLOGUE_3_ARGS
6026 IEMIMPL_SSE_PROLOGUE
6027
6028 movzx A2, A2_8 ; must clear top bits
6029 movdqu xmm0, [A0]
6030 movdqu xmm1, [A1]
6031 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
6032 movdqu [A0], xmm0
6033
6034 IEMIMPL_SSE_EPILOGUE
6035 EPILOGUE_3_ARGS
6036 %assign bImm 0
6037 %rep 256
6038.imm %+ bImm:
6039 IBT_ENDBRxx_WITHOUT_NOTRACK
6040 %1 xmm0, xmm1, bImm
6041 ret
6042 int3
6043 %assign bImm bImm + 1
6044 %endrep
6045.immEnd:
6046ENDPROC iemAImpl_ %+ %1 %+ _u128
6047%endmacro
6048
6049IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
6050IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
6051IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
6052IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
6053IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
6054IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
6055IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
6056
6057
6058;;
6059; AVX instructions with 8-bit immediates of the form
6060; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
6061; where the instruction encoding takes up 6 bytes.
6062;
6063; @param 1 The instruction name.
6064; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
6065; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
6066;
6067; @param A0 Pointer to the destination media register size operand (output).
6068; @param A1 Pointer to the first source media register size operand (input).
6069; @param A2 Pointer to the second source media register size operand (input).
6070; @param A3 The 8-bit immediate
6071;
6072%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
6073 %if %2 == 1
6074BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6075 PROLOGUE_4_ARGS
6076 IEMIMPL_AVX_PROLOGUE
6077
6078 movzx A3, A3_8 ; must clear top bits
6079 movdqu xmm0, [A1]
6080 movdqu xmm1, [A2]
6081 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6082 movdqu [A0], xmm0
6083
6084 IEMIMPL_AVX_EPILOGUE
6085 EPILOGUE_4_ARGS
6086 %assign bImm 0
6087 %rep 256
6088.imm %+ bImm:
6089 IBT_ENDBRxx_WITHOUT_NOTRACK
6090 %1 xmm0, xmm0, xmm1, bImm
6091 ret
6092 int3
6093 %assign bImm bImm + 1
6094 %endrep
6095.immEnd:
6096ENDPROC iemAImpl_ %+ %1 %+ _u128
6097 %endif
6098
6099 %if %3 == 1
6100BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
6101 PROLOGUE_4_ARGS
6102 IEMIMPL_AVX_PROLOGUE
6103
6104 movzx A3, A3_8 ; must clear top bits
6105 vmovdqu ymm0, [A1]
6106 vmovdqu ymm1, [A2]
6107 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6108 vmovdqu [A0], ymm0
6109
6110 IEMIMPL_AVX_EPILOGUE
6111 EPILOGUE_4_ARGS
6112 %assign bImm 0
6113 %rep 256
6114.imm %+ bImm:
6115 IBT_ENDBRxx_WITHOUT_NOTRACK
6116 %1 ymm0, ymm0, ymm1, bImm
6117 ret
6118 int3
6119 %assign bImm bImm + 1
6120 %endrep
6121.immEnd:
6122ENDPROC iemAImpl_ %+ %1 %+ _u256
6123 %endif
6124%endmacro
6125
6126IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
6127IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
6128IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
6129IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
6130IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
6131IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
6132IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
6133IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
6134IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
6135
6136
6137;;
6138; AVX instructions with 8-bit immediates of the form
6139; xxx {x,y}mm1, {x,y}mm2, imm8.
6140; where the instruction encoding takes up 6 bytes.
6141;
6142; @param 1 The instruction name.
6143; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
6144; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
6145; @param 4 The number of bytes taken up by a single instance of the instruction.
6146;
6147; @param A0 Pointer to the destination media register size operand (output).
6148; @param A1 Pointer to the first source media register size operand (input).
6149; @param A2 The 8-bit immediate
6150;
6151%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
6152 %if %2 == 1
6153BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
6154 PROLOGUE_4_ARGS
6155 IEMIMPL_AVX_PROLOGUE
6156
6157 movzx A2, A2_8 ; must clear top bits
6158 movdqu xmm1, [A1]
6159 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
6160 movdqu [A0], xmm0
6161
6162 IEMIMPL_AVX_EPILOGUE
6163 EPILOGUE_4_ARGS
6164 %assign bImm 0
6165 %rep 256
6166.imm %+ bImm:
6167 IBT_ENDBRxx_WITHOUT_NOTRACK
6168 %1 xmm0, xmm1, bImm
6169 ret
6170 int3
6171 %assign bImm bImm + 1
6172 %endrep
6173.immEnd:
6174ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
6175 %endif
6176
6177 %if %3 == 1
6178BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
6179 PROLOGUE_4_ARGS
6180 IEMIMPL_AVX_PROLOGUE
6181
6182 movzx A2, A2_8 ; must clear top bits
6183 vmovdqu ymm1, [A1]
6184 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
6185 vmovdqu [A0], ymm0
6186
6187 IEMIMPL_AVX_EPILOGUE
6188 EPILOGUE_4_ARGS
6189 %assign bImm 0
6190 %rep 256
6191.imm %+ bImm:
6192 IBT_ENDBRxx_WITHOUT_NOTRACK
6193 %1 ymm0, ymm1, bImm
6194 ret
6195 int3
6196 %assign bImm bImm + 1
6197 %endrep
6198.immEnd:
6199ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
6200 %endif
6201%endmacro
6202
6203IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
6204IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
6205IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq, 1, 1, 7
6206IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq, 1, 1, 7
6207
6208
6209;;
6210; Need to move this as well somewhere better?
6211;
6212struc IEMPCMPISTRXSRC
6213 .uSrc1 resd 4
6214 .uSrc2 resd 4
6215endstruc
6216
6217struc IEMPCMPESTRXSRC
6218 .uSrc1 resd 4
6219 .uSrc2 resd 4
6220 .u64Rax resd 2
6221 .u64Rdx resd 2
6222endstruc
6223
6224;;
6225; The pcmpistri/vcmpistri instruction.
6226;
6227; @param 1 The instruction name
6228;
6229; @return R0_32 The new ECX value.
6230; @param A0 Pointer to the EFLAGS register.
6231; @param A1 Pointer to the first operand (input).
6232; @param A2 Pointer to the second operand (input).
6233; @param A3 The 8-bit immediate
6234;
6235%macro IEMIMPL_MEDIA_V_CMPISTRI 1
6236BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6237 PROLOGUE_4_ARGS
6238 IEMIMPL_SSE_PROLOGUE
6239
6240 movzx A3, A3_8 ; must clear top bits
6241 movdqu xmm0, [A1]
6242 movdqu xmm1, [A2]
6243 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6244 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6245
6246 IEM_SAVE_FLAGS_OLD T2, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6247 mov R0_32, ecx
6248
6249 IEMIMPL_SSE_EPILOGUE
6250 EPILOGUE_4_ARGS
6251 %assign bImm 0
6252 %rep 256
6253.imm %+ bImm:
6254 IBT_ENDBRxx_WITHOUT_NOTRACK
6255 %1 xmm0, xmm1, bImm
6256 ret
6257 int3
6258 %assign bImm bImm + 1
6259 %endrep
6260.immEnd:
6261ENDPROC iemAImpl_ %+ %1 %+ _u128
6262%endmacro
6263
6264IEMIMPL_MEDIA_V_CMPISTRI pcmpistri
6265IEMIMPL_MEDIA_V_CMPISTRI vpcmpistri
6266
6267
6268;;
6269; The pcmpestri instruction.
6270;
6271; @param 1 The instruction name
6272;
6273; @param A0 Pointer to the ECX register to store the result to (output).
6274; @param A1 Pointer to the EFLAGS register.
6275; @param A2 Pointer to the structure containing the source operands (input).
6276; @param A3 The 8-bit immediate
6277;
6278BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
6279 PROLOGUE_4_ARGS
6280 IEMIMPL_SSE_PROLOGUE
6281
6282 movzx A3, A3_8 ; must clear top bits
6283 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6284 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6285 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6286 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6287 push xDX ; xDX can be A1 or A2 depending on the calling convention
6288 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6289 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6290 IBT_NOTRACK
6291 call T1
6292
6293 pop xDX
6294 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6295 mov [T2], ecx
6296
6297 IEMIMPL_SSE_EPILOGUE
6298 EPILOGUE_4_ARGS
6299 %assign bImm 0
6300 %rep 256
6301.imm %+ bImm:
6302 IBT_ENDBRxx_WITHOUT_NOTRACK
6303 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6304 pcmpestri xmm0, xmm1, bImm
6305 ret
6306 %assign bImm bImm + 1
6307 %endrep
6308.immEnd:
6309ENDPROC iemAImpl_pcmpestri_u128
6310
6311
6312;;
6313; The vpcmpestri instruction.
6314;
6315; @param 1 The instruction name
6316;
6317; @param A0 Pointer to the ECX register to store the result to (output).
6318; @param A1 Pointer to the EFLAGS register.
6319; @param A2 Pointer to the structure containing the source operands (input).
6320; @param A3 The 8-bit immediate
6321;
6322BEGINPROC_FASTCALL iemAImpl_vpcmpestri_u128, 16
6323 PROLOGUE_4_ARGS
6324 IEMIMPL_SSE_PROLOGUE
6325
6326 movzx A3, A3_8 ; must clear top bits
6327 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
6328 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
6329 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
6330 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6331 push xDX ; xDX can be A1 or A2 depending on the calling convention
6332 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6333 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6334 IBT_NOTRACK
6335 call T1
6336
6337 pop xDX
6338 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6339 mov [T2], ecx
6340
6341 IEMIMPL_SSE_EPILOGUE
6342 EPILOGUE_4_ARGS
6343 %assign bImm 0
6344 %rep 256
6345.imm %+ bImm:
6346 IBT_ENDBRxx_WITHOUT_NOTRACK
6347 db 0xc4, 0xe3, 0xf9, 0x61, 0xc1, bImm ; vpcmpestri xmm0,xmm1,0x1 with VEX.W set
6348 ret
6349 int3
6350 %assign bImm bImm + 1
6351 %endrep
6352.immEnd:
6353ENDPROC iemAImpl_vpcmpestri_u128
6354
6355
6356;;
6357; The pcmpistrm/vpcmpistrm instruction template.
6358;
6359; @param 1 The instruction name
6360;
6361; @param A0 Pointer to the XMM0 register to store the result to (output).
6362; @param A1 Pointer to the EFLAGS register.
6363; @param A2 Pointer to the structure containing the source operands (input).
6364; @param A3 The 8-bit immediate
6365;
6366%macro IEMIMPL_MEDIA_V_CMPISTRM 1
6367BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6368 PROLOGUE_4_ARGS
6369 IEMIMPL_SSE_PROLOGUE
6370
6371 movzx A3, A3_8 ; must clear top bits
6372 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
6373 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
6374 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
6375
6376 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6377 movdqu [A0], xmm0
6378
6379 IEMIMPL_SSE_EPILOGUE
6380 EPILOGUE_4_ARGS
6381 %assign bImm 0
6382 %rep 256
6383.imm %+ bImm:
6384 IBT_ENDBRxx_WITHOUT_NOTRACK
6385 %1 xmm1, xmm2, bImm
6386 ret
6387 int3
6388 %assign bImm bImm + 1
6389 %endrep
6390.immEnd:
6391ENDPROC iemAImpl_ %+ %1 %+ _u128
6392%endmacro
6393
6394IEMIMPL_MEDIA_V_CMPISTRM pcmpistrm
6395IEMIMPL_MEDIA_V_CMPISTRM vpcmpistrm
6396
6397
6398;;
6399; The pcmpestrm instruction.
6400;
6401; @param A0 Pointer to the XMM0 register to store the result to (output).
6402; @param A1 Pointer to the EFLAGS register.
6403; @param A2 Pointer to the structure containing the source operands (input).
6404; @param A3 The 8-bit immediate
6405;
6406BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
6407 PROLOGUE_4_ARGS
6408 IEMIMPL_SSE_PROLOGUE
6409
6410 movzx A3, A3_8 ; must clear top bits
6411 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6412 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
6413 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6414 push xDX ; xDX can be A1 or A2 depending on the calling convention
6415 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6416 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6417 IBT_NOTRACK
6418 call T1
6419
6420 pop xDX
6421 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6422 movdqu [A0], xmm0
6423
6424 IEMIMPL_SSE_EPILOGUE
6425 EPILOGUE_4_ARGS
6426 %assign bImm 0
6427 %rep 256
6428.imm %+ bImm:
6429 IBT_ENDBRxx_WITHOUT_NOTRACK
6430 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
6431 pcmpestrm xmm1, xmm2, bImm
6432 ret
6433 %assign bImm bImm + 1
6434 %endrep
6435.immEnd:
6436ENDPROC iemAImpl_pcmpestrm_u128
6437
6438
6439;;
6440; The vpcmpestrm instruction.
6441;
6442; @param A0 Pointer to the XMM0 register to store the result to (output).
6443; @param A1 Pointer to the EFLAGS register.
6444; @param A2 Pointer to the structure containing the source operands (input).
6445; @param A3 The 8-bit immediate
6446;
6447BEGINPROC_FASTCALL iemAImpl_vpcmpestrm_u128, 16
6448 PROLOGUE_4_ARGS
6449 IEMIMPL_SSE_PROLOGUE
6450
6451 movzx A3, A3_8 ; must clear top bits
6452 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
6453 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
6454 IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
6455 push xDX ; xDX can be A1 or A2 depending on the calling convention
6456 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
6457 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
6458 IBT_NOTRACK
6459 call T1
6460
6461 pop xDX
6462 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
6463 movdqu [A0], xmm0
6464
6465 IEMIMPL_SSE_EPILOGUE
6466 EPILOGUE_4_ARGS
6467 %assign bImm 0
6468 %rep 256
6469.imm %+ bImm:
6470 IBT_ENDBRxx_WITHOUT_NOTRACK
6471 db 0xc4, 0xe3, 0xf9, 0x60, 0xca, bImm ; vpcmpestrm xmm1, xmm2, bImm with VEX.W set
6472 ret
6473 int3
6474 %assign bImm bImm + 1
6475 %endrep
6476.immEnd:
6477ENDPROC iemAImpl_vpcmpestrm_u128
6478
6479
6480;;
6481; movmskp{s,d} SSE instruction template
6482;
6483; @param 1 The SSE instruction name.
6484; @param 2 The AVX instruction name.
6485;
6486; @param A0 Pointer to the output register (output/byte sized).
6487; @param A1 Pointer to the source media register size operand (input).
6488;
6489%macro IEMIMPL_MEDIA_MOVMSK_P 2
6490BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6491 PROLOGUE_2_ARGS
6492 IEMIMPL_SSE_PROLOGUE
6493
6494 movdqu xmm0, [A1]
6495 %1 T0, xmm0
6496 mov byte [A0], T0_8
6497
6498 IEMIMPL_SSE_EPILOGUE
6499 EPILOGUE_2_ARGS
6500ENDPROC iemAImpl_ %+ %1 %+ _u128
6501
6502BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
6503 PROLOGUE_2_ARGS
6504 IEMIMPL_AVX_PROLOGUE
6505
6506 movdqu xmm0, [A1]
6507 %2 T0, xmm0
6508 mov byte [A0], T0_8
6509
6510 IEMIMPL_AVX_EPILOGUE
6511 EPILOGUE_2_ARGS
6512ENDPROC iemAImpl_ %+ %2 %+ _u128
6513
6514BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
6515 PROLOGUE_2_ARGS
6516 IEMIMPL_AVX_PROLOGUE
6517
6518 vmovdqu ymm0, [A1]
6519 %2 T0, ymm0
6520 mov byte [A0], T0_8
6521
6522 IEMIMPL_AVX_EPILOGUE
6523 EPILOGUE_2_ARGS
6524ENDPROC iemAImpl_ %+ %2 %+ _u256
6525%endmacro
6526
6527IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
6528IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6529
6530
6531;;
6532; Template for [v]cvttss2si/[v]cvtss2si instructions.
6533;
6534; @param 1 Instruction name.
6535; @param 2 AVX or SSE
6536;
6537; @return R0_32 The new MXCSR value of the guest.
6538; @param A0_32 The guest's MXCSR register value to use.
6539; @param A1 Pointer to the result operand (output).
6540; @param A2 Pointer to the second operand (input).
6541;
6542%macro IEMIMPL_MEDIA_V_CVTXSS2SI 2
6543BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r32, 16
6544 PROLOGUE_3_ARGS
6545 IEMIMPL_ %+ %2 %+ _PROLOGUE
6546 SSE_AVX_LD_MXCSR A0_32
6547
6548 %1 T0_32, [A2]
6549 mov dword [A1], T0_32
6550
6551 SSE_AVX_ST_MXCSR R0_32, A0_32
6552 IEMIMPL_ %+ %2 %+ _EPILOGUE
6553 EPILOGUE_3_ARGS
6554ENDPROC iemAImpl_ %+ %1 %+ _i32_r32
6555
6556
6557BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r32, 16
6558 PROLOGUE_3_ARGS
6559 IEMIMPL_ %+ %2 %+ _PROLOGUE
6560 SSE_AVX_LD_MXCSR A0_32
6561
6562 %1 T0, [A2]
6563 mov qword [A1], T0
6564
6565 SSE_AVX_ST_MXCSR R0_32, A0_32
6566 IEMIMPL_ %+ %2 %+ _EPILOGUE
6567 EPILOGUE_3_ARGS
6568ENDPROC iemAImpl_ %+ %1 %+ _i64_r32
6569%endmacro
6570
6571IEMIMPL_MEDIA_V_CVTXSS2SI cvttss2si, SSE
6572IEMIMPL_MEDIA_V_CVTXSS2SI vcvttss2si, AVX
6573IEMIMPL_MEDIA_V_CVTXSS2SI cvtss2si, SSE
6574IEMIMPL_MEDIA_V_CVTXSS2SI vcvtss2si, AVX
6575
6576
6577;;
6578; Template for [v]cvttsd2si/[v]cvtsd2si instructions.
6579;
6580; @param 1 Instruction name.
6581; @param 2 AVX or SSE
6582;
6583; @return R0_32 The new MXCSR value of the guest.
6584; @param A0_32 The guest's MXCSR register value to use.
6585; @param A1 Pointer to the result operand (output).
6586; @param A2 Pointer to the second operand (input).
6587;
6588%macro IEMIMPL_MEDIA_V_CVTXSD2SI 2
6589BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r64, 16
6590 PROLOGUE_3_ARGS
6591 IEMIMPL_ %+ %2 %+ _PROLOGUE
6592 SSE_AVX_LD_MXCSR A0_32
6593
6594 %1 T0_32, [A2]
6595 mov dword [A1], T0_32
6596
6597 SSE_AVX_ST_MXCSR R0_32, A0_32
6598 IEMIMPL_ %+ %2 %+ _EPILOGUE
6599 EPILOGUE_3_ARGS
6600ENDPROC iemAImpl_ %+ %1 %+ _i32_r64
6601
6602
6603BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r64, 16
6604 PROLOGUE_3_ARGS
6605 IEMIMPL_ %+ %2 %+ _PROLOGUE
6606 SSE_AVX_LD_MXCSR A0_32
6607
6608 %1 T0, [A2]
6609 mov qword [A1], T0
6610
6611 SSE_AVX_ST_MXCSR R0_32, A0_32
6612 IEMIMPL_ %+ %2 %+ _EPILOGUE
6613 EPILOGUE_3_ARGS
6614ENDPROC iemAImpl_ %+ %1 %+ _i64_r64
6615%endmacro
6616
6617IEMIMPL_MEDIA_V_CVTXSD2SI cvttsd2si, SSE
6618IEMIMPL_MEDIA_V_CVTXSD2SI vcvttsd2si, AVX
6619IEMIMPL_MEDIA_V_CVTXSD2SI cvtsd2si, SSE
6620IEMIMPL_MEDIA_V_CVTXSD2SI vcvtsd2si, AVX
6621
6622
6623;;
6624; cvtsi2ss instruction - 32-bit variant.
6625;
6626; @return R0_32 The new MXCSR value of the guest.
6627; @param A0_32 The guest's MXCSR register value to use.
6628; @param A1 Pointer to the result operand (output).
6629; @param A2 Pointer to the second operand (input).
6630;
6631BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
6632 PROLOGUE_3_ARGS
6633 IEMIMPL_SSE_PROLOGUE
6634 SSE_AVX_LD_MXCSR A0_32
6635
6636 cvtsi2ss xmm0, dword [A2]
6637 movd dword [A1], xmm0
6638
6639 SSE_AVX_ST_MXCSR R0_32, A0_32
6640 IEMIMPL_SSE_EPILOGUE
6641 EPILOGUE_3_ARGS
6642ENDPROC iemAImpl_cvtsi2ss_r32_i32
6643
6644
6645;;
6646; vcvtsi2ss instruction - 32-bit variant.
6647;
6648; @return R0_32 The new MXCSR value of the guest.
6649; @param A0_32 The guest's MXCSR register value to use.
6650; @param A1 Pointer to the result operand (output).
6651; @param A2 Pointer to the second operand (input).
6652; @param A3 Pointer to the third operand (input).
6653;
6654BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i32, 16
6655 PROLOGUE_3_ARGS
6656 IEMIMPL_AVX_PROLOGUE
6657 SSE_AVX_LD_MXCSR A0_32
6658
6659 movdqu xmm0, [A2]
6660 vcvtsi2ss xmm0, xmm0, dword [A3]
6661 movdqu [A1], xmm0
6662
6663 SSE_AVX_ST_MXCSR R0_32, A0_32
6664 IEMIMPL_AVX_EPILOGUE
6665 EPILOGUE_3_ARGS
6666ENDPROC iemAImpl_vcvtsi2ss_u128_i32
6667
6668
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts a signed 64-bit integer to a single-precision float, rounding
; per the guest MXCSR.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        cvtsi2ss xmm0, qword [A2]
        movd    dword [A1], xmm0                ; only the low 32-bit float is the result

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6689
6690
;;
; vcvtsi2ss instruction - 64-bit variant.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the second operand (input).
; @param A3 Pointer to the third operand (input).
;
; NOTE(review): A3 is referenced although only PROLOGUE_3_ARGS is emitted;
; confirm no target needs PROLOGUE_4_ARGS to fetch the 4th argument.
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2ss_u128_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movdqu  xmm0, [A2]                      ; first source; bits 32-127 pass through to the result
        vcvtsi2ss xmm0, xmm0, qword [A3]        ; low 32 bits = converted integer
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtsi2ss_u128_i64
6713
6714
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts a signed 32-bit integer to a double-precision float (exact,
; but MXCSR is still loaded for faithful flag behaviour).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        cvtsi2sd xmm0, dword [A2]
        movq    [A1], xmm0                      ; store the low 64-bit double result

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6735
6736
;;
; vcvtsi2sd instruction - 32-bit variant.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the second operand (input).
; @param A3 Pointer to the third operand (input).
;
; NOTE(review): A3 is referenced although only PROLOGUE_3_ARGS is emitted;
; confirm no target needs PROLOGUE_4_ARGS to fetch the 4th argument.
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movdqu  xmm0, [A2]                      ; first source; bits 64-127 pass through to the result
        vcvtsi2sd xmm0, xmm0, dword [A3]        ; low 64 bits = converted integer
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtsi2sd_u128_i32
6759
6760
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts a signed 64-bit integer to a double-precision float, rounding
; per the guest MXCSR.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        cvtsi2sd xmm0, qword [A2]
        movq    [A1], xmm0                      ; store the low 64-bit double result

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6781
6782
;;
; vcvtsi2sd instruction - 64-bit variant.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use.
; @param A1 Pointer to the result operand (output).
; @param A2 Pointer to the second operand (input).
; @param A3 Pointer to the third operand (input).
;
; NOTE(review): A3 is referenced although only PROLOGUE_3_ARGS is emitted;
; confirm no target needs PROLOGUE_4_ARGS to fetch the 4th argument.
;
BEGINPROC_FASTCALL iemAImpl_vcvtsi2sd_u128_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movdqu  xmm0, [A2]                      ; first source; bits 64-127 pass through to the result
        vcvtsi2sd xmm0, xmm0, qword [A3]        ; low 64 bits = converted integer
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtsi2sd_u128_i64
6805
6806
;
; UCOMISS (SSE)
;
; Unordered compare of two scalar singles; results go to ZF/PF/CF.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2_32 The first source operand.
; @param A3_32 The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        ucomiss xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6830
;
; VUCOMISS (AVX) - same contract as iemAImpl_ucomiss_u128 above.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2_32 The first source operand.
; @param A3_32 The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        vucomiss xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS                         ; fix: must pair with PROLOGUE_4_ARGS above
                                                ; (was EPILOGUE_3_ARGS, unbalancing the frame
                                                ; on targets where the macros differ)
ENDPROC iemAImpl_vucomiss_u128
6845
6846
;
; UCOMISD (SSE)
;
; Unordered compare of two scalar doubles; results go to ZF/PF/CF.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 The first source operand.
; @param A3 The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        ucomisd xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6870
;
; VUCOMISD (AVX) - same contract as iemAImpl_ucomisd_u128 above.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        vucomisd xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6885
;
; COMISS (SSE)
;
; Ordered compare of two scalar singles; results go to ZF/PF/CF.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2_32 The first source operand.
; @param A3_32 The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        comiss  xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6909
;
; VCOMISS (AVX) - same contract as iemAImpl_comiss_u128 above.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        vcomiss xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6924
6925
;
; COMISD (SSE)
;
; Ordered compare of two scalar doubles; results go to ZF/PF/CF.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 The first source operand.
; @param A3 The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        comisd  xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6949
;
; VCOMISD (AVX) - same contract as iemAImpl_comisd_u128 above.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movq    xmm0, A2
        movq    xmm1, A3
        vcomisd xmm0, xmm1
        ; Keep ZF/PF/CF from the compare, force OF/SF/AF to zero as the instruction does.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6964
6965
;;
; Need to move this as well somewhere better?
;
; Layout of the two packed 128-bit source operands passed via a single
; pointer (A2) to the two-input media helpers below.
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit source operand
    .uSrc2 resd 4                       ; second 128-bit source operand
endstruc


; Same idea for 256-bit (YMM) operands.
struc IEMMEDIAF2YMMSRC
    .uSrc1 resd 8                       ; first 256-bit source operand
    .uSrc2 resd 8                       ; second 256-bit source operand
endstruc
6979
6980
;;
; SSE/AVX instructions with 8-bit immediates of the form
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, xmm3, imm8.
; and we need to load and save the MXCSR register.
;
; The immediate cannot be passed to the instruction at runtime, so a
; 256-entry jump table with one pre-encoded instruction per imm8 value is
; generated and dispatched via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param 1 The instruction name.
; @param 2 Flag whether this instruction has a 256-bit AVX variant (1) or not (0).
; @param 3 Number of bytes for the encoding of the SSE variant + ret instruction (AVX is fixed to 6).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, %3   ; jump-table entry stride = %3 bytes
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible imm8 value
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128


BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6    ; VEX encoding + ret = fixed 6 bytes
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6    ; VEX encoding + ret = fixed 6 bytes
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpps, 1, 5   ; 0f c2 /r ib (4) + ret = 5 bytes
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmppd, 1, 6   ; 66-prefixed, 5 + ret = 6 bytes
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpss, 0, 6   ; f3-prefixed, 5 + ret = 6 bytes
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpsd, 0, 6   ; f2-prefixed, 5 + ret = 6 bytes
7081
7082
;;
; SSE/AVX instructions with 2 full sized operands and an 8-bit immediate of the form
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, imm8
; where the instruction encoding takes up 6 bytes (jump-table entries are
; padded to 8 bytes with ret + int3) and we need to load and save the MXCSR
; register.
;
; @param 1 The instruction name.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the second media register size operand (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8    ; 8-byte stride: insn (6) + ret + int3
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible imm8 value
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pad the entry to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v%1     xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm1, [A2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v%1     ymm0, ymm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundpd
7177
7178
;;
; SSE/AVX instructions with 3 full sized operands and an 8-bit immediate of the form
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, xmm3, imm8
; where the instruction encoding takes up 6 bytes (jump-table entries are
; padded to 8 bytes with ret + int3) and we need to load and save the MXCSR
; register.
;
; @param 1 The instruction name.
; @param 2 Flag whether to emit a 256-bit AVX variant (1) or not (0).
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC/IEMMEDIAF2YMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8    ; 8-byte stride: insn (6) + ret + int3
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible imm8 value
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pad the entry to the 8-byte stride
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128


BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm1, xmm2, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u128


 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        vmovdqu ymm2, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm1, ymm2, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundss, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundsd, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dpps,    1
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dppd,    0
7283
7284
;;
; SSE instructions of the form
;       xxx     mm, xmm.
; and we need to load and save the MXCSR register.
;
; Used for packed double -> packed 32-bit integer conversions targeting an
; MMX register.
;
; @param 1 The instruction name.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the first MMX register sized operand (output).
; @param A2 Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movdqu  xmm0, [A2]
        %1      mm0, xmm0
        movq    [A1], mm0                       ; store the 64-bit MMX result

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
7315
;;
; SSE instructions of the form
;       xxx     xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Used for packed 32-bit integer (MMX) -> packed float conversions into an
; XMM register whose untouched bits must be preserved.
;
; @param 1 The instruction name.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the first media register sized operand (input/output).
; @param A2 The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movdqu  xmm0, [A1]                      ; load the destination: its upper bits survive the op
        movq    mm0, A2
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
7347
;;
; SSE instructions of the form
;       xxx     mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Used for packed single -> packed 32-bit integer conversions targeting an
; MMX register, with a 64-bit immediate-style source value.
;
; @param 1 The instruction name.
;
; @return R0_32 The new MXCSR value of the guest.
; @param A0_32 The guest's MXCSR register value to use (input).
; @param A1 Pointer to the first MMX media register sized operand (output).
; @param A2 The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32                  ; activate the guest MXCSR

        movq    xmm0, A2
        %1      mm0, xmm0
        movq    [A1], mm0                       ; store the 64-bit MMX result

        SSE_AVX_ST_MXCSR R0_32, A0_32           ; return updated guest MXCSR, restore host's
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
7378
;
; All forms of RDRAND and RDSEED
;
; @param 1 The instruction name (rdrand/rdseed).
; @param 2 The destination register to use (ax/eax/rax).
; @param 3 The operand width in bits (16/32/64), used in the symbol name.
;
; @param A0 Pointer to the destination operand.
; @param A1 Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                              ; CF is set on success, cleared when no entropy was available
        mov     [A0], %2
        ; Only CF is defined by the instruction; OF/SF/ZF/AF/PF are forced to zero.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
7403
7404
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The 2-bit round-function selector is an immediate, so a 256-entry jump
; table of pre-encoded instructions is used (6-byte stride: insn + ret).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:                           ; one stub per possible imm8 value
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_sha1rnds4_u128
7436
7437
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; XMM0 is an implicit third operand of the instruction, so the caller's
; XMM0 constants are loaded into the real xmm0 first.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; the implicit XMM0 operand
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
7460
7461
;
; 32-bit forms of ADCX and ADOX
;
; @param 1 The instruction name (adcx/adox).
; @param 2 The single EFLAGS bit the instruction consumes and produces
;          (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @returns Updated EFLAGS.
; @param A0 Incoming EFLAGS value (input).
; @param A1 Pointer to the destination operand (input/output).
; @param A2 32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A0_32, %2, 0     ; load only the carry-in flag from the guest EFLAGS
        %1      A2_32, [A1]
        mov     [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0   ; merge only that flag back, return EFLAGS

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro

;
; 64-bit forms of ADCX and ADOX
;
; @param 1 The instruction name (adcx/adox).
; @param 2 The single EFLAGS bit the instruction consumes and produces.
;
; @returns Updated EFLAGS.
; @param A0 Incoming EFLAGS value (input).
; @param A1 Pointer to the destination operand (input/output).
; @param A2 64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A0_32, %2, 0     ; load only the carry-in flag from the guest EFLAGS
        %1      A2, [A1]
        mov     [A1], A2
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0   ; merge only that flag back, return EFLAGS

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
7509
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette