VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 107044

Last change on this file since 107044 was 106179, checked in by vboxsync, 8 weeks ago

VMM/IEM: Reworked the div, idiv, mul and imul assembly workers and how we raise division error exceptions. The latter is to simplify eflags management. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 221.5 KB
Line 
1; $Id: IEMAllAImpl.asm 106179 2024-09-29 01:14:19Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; This is handy for generating absolutely correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
;;
; Return from a fastcall function.
;
; Emits `ret %1` only for 32-bit x86 Windows builds; every other target gets
; a plain `ret`.
;
; @param 1 Immediate operand for RET; only used when both RT_ARCH_X86 and
;          RT_OS_WINDOWS are defined.
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
60
;;
; Symbol name for a fastcall function.
;
; Expands to NAME(a_Name) everywhere except on 32-bit x86 Windows, where the
; name is built as <a_Prefix><a_Name>@<a_cbArgs>.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
75
;;
; Opens a fastcall function: declares the (possibly decorated) symbol as a
; hidden function via GLOBALNAME_RAW and emits IBT_ENDBRxx at the entry point.
;
; @param 1 The function name (C).
; @param 2 The argument size on x86 (forwarded to NAME_FASTCALL).
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
86
87
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_n_ARGS / EPILOGUE_n_ARGS[_EX] bracket a function taking n
; arguments.  A0..A3 name the argument registers, T0..T2 the temporary
; registers and R0 the return value register (same register as T0).
; The _EX epilogues take the operand for the final RET on 32-bit x86; on
; AMD64 the operand is accepted and ignored.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: every prologue is empty and every epilogue is a plain RET.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; Takes an optional operand (0-1) so that the x86 style invocation
 ; 'EPILOGUE_1_ARGS_EX <cb>' assembles here too, like the other _EX macros.
 %macro EPILOGUE_1_ARGS_EX 0-1
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 ; Argument registers when ASM_CALL64_GCC is defined.
 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 ; Argument registers when ASM_CALL64_MSC is defined.
 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Temporary registers.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                     ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

 ;
 ; Return value, same as T0 but to make it more obvious
 ; that this is a return value.
 ;
 %define R0         rax
 %define R0_32      eax
 %define R0_16      ax
 %define R0_8       al

%else
 ;
 ; x86: A0 and A1 are ecx and edx.  The prologues save edi (used as T1) and,
 ; for 3 and 4 arguments, load A2 (ebx) and A3 (esi) from the stack after
 ; saving those registers as well.
 ;
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]          ; skip saved ebx + return address -> 1st stack argument
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0]     ; skip 3 saved registers + return address -> 1st stack argument
        mov     esi, [esp + 12 + 4 + 4]     ; ... and the 2nd stack argument
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 ; Note: no A3_8 is defined for x86.
 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 ; Note: no T1_8 is defined for x86.
 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
287
288
;;
; Loads guest status flags from the EFLAGS value %1 into the CPU EFLAGS, but
; (in the default build) only when there are undefined (%3) or must-load (%4)
; flags; otherwise nothing is emitted.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Merge via the stack: host flags with the selected bits replaced by the guest's.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, %1                                       ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop the bits we take from the guest
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only those bits of the guest value
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Only CF is needed: BT copies the bit straight into CF.
        bt      %1, X86_EFL_CF_BIT
   %else
        ; ADD produces OF, SAHF loads the remaining status flags.  ASSUMES T0_32 is eax!
        mov     eax, %1
    %if (%3 | %4) == X86_EFL_OF
        ; Only OF is needed: move it to bit 31; adding 80000000h overflows iff it is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; OF plus others: low flag byte goes to AH for SAHF, OF to bit 7 of AL for the ADD.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        sahf                                                    ; the other status flags
    %else
        ; OF not needed: get the low flag byte into AH and load it.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        ; Alternative (disabled): merge via the stack.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, %1                                       ; guest flags
        and     dword [xSP], ~(%2 | %3)                         ; drop modified + undefined bits
        and     T0_32, (%2 | %3)                                ; keep modified + undefined guest bits
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags
  %endif
 %endif
%endmacro
346
;;
; Loads guest status flags from the EFLAGS value %1 into the CPU EFLAGS
; (unconditional variant of IEM_MAYBE_LOAD_FLAGS).
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Merge via the stack: host flags with the selected bits replaced by the guest's.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, %1                                       ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop the bits we take from the guest
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only those bits of the guest value
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Only CF is needed: BT copies the bit straight into CF.
        bt      %1, X86_EFL_CF_BIT
  %else
        mov     eax, %1                                         ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Only OF is needed: move it to bit 31; adding 80000000h overflows iff it is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; OF plus others: low flag byte goes to AH for SAHF, OF to bit 7 of AL for the ADD.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        sahf                                                    ; the other status flags
   %else
        ; OF not needed: get the low flag byte into AH and load it.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        ; Alternative (disabled): merge via the stack.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, %1                                       ; guest flags
        and     dword [xSP], ~(%2 | %3)                         ; drop the bits we take from the guest
        and     T0_32, (%2 | %3)                                ; keep only those bits of the guest value
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags
 %endif
%endmacro
400
;;
; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
;
; The modified (%2) and undefined (%3) flags are taken from the CPU EFLAGS,
; the zeroed flags (%4) are cleared, and all other bits come from %1.  The
; result is left in T0 only; nothing is written back to %1's original source.
;
; The jump target is a macro-local label (%%), so the macro can be expanded
; more than once under the same global label.
;
; @remarks Clobbers T0, T1, %1, stack.
; @param 1 The parameter (A0..A3) holding the OLD eflags value. Clobbered.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_RETVAL 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, %1                                       ; flags (copy; T1 is not read again by this macro)
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS)       ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8                                            ; CF is bit 0, no shift needed; upper bits masked below
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                                            ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        ; All wanted flags live in the low byte: LAHF is enough.
        lahf
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
   %elif 1 ; The locked functions are generally faster on 10980XE with this approach
        lahf                                                    ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
        jno     %%of_is_clear                                   ; LAHF does not capture OF, add it by hand
        or      eax, X86_EFL_OF
%%of_is_clear:
   %else
        pushf                                                   ; this is a bit slow
        pop     T0
   %endif
        and     %1, ~(%2 | %3 | %4)                             ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
  %endif
        or      T0_32, %1                                       ; combine the flags. ASSUMES T0 = eax!
        ;mov     %1, T0_32                                      ; save the flags.
 %else
        mov     T0_32, %1                                       ; nothing to merge, return the flags unchanged
 %endif
%endmacro
449
;;
; Computes new EFLAGS from the CPU EFLAGS plus fixed clear and set masks and
; stores the result back into %1.  Emits nothing when all three masks are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
;; @todo make it stuff the result into EAX?
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                                  ; T1 = CPU flags
        mov     T0_32, %1                           ; T0 = incoming flags
        and     T0_32, ~(%2 | %3)                   ; drop modified + always-clear bits
        and     T1_32, (%2)                         ; keep only the modified bits from the CPU
        or      T0_32, T1_32                        ; merge
  %if (%4) != 0
        or      T0_32, %4                           ; force the always-set bits
  %endif
        mov     %1, T0_32                           ; write back
 %endif
%endmacro
474
;;
; Computes new EFLAGS from the CPU EFLAGS (modified mask %2), a clear mask (%3),
; a sign bit taken from %4 (width %5) and a parity table lookup indexed by %6.
; The result is left in EAX (T0).
;
; @note %4 & %6 must not be RAX, EAX, or AX! So, don't use with full MUL/IMUL.
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS, %1.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
        pushf
        pop     T0                                              ; T0 = CPU flags
        and     %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)        ; drop modified, always-clear, PF and SF
        and     T0_32, (%2)                                     ; keep the modified bits from the CPU
        or      T0_32, %1                                       ; merge

        ; SF first: %4 is the same register as %6 (only %6 is always full width),
        ; and %6 is masked below.
        bt      %4, %5 - 1                                      ; top bit of the result -> CF
        jnc     %%sf_done
        or      T0_32, X86_EFL_SF
%%sf_done:

        ; PF last: the low result byte indexes g_afParity.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T1, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T1 + %6]
 %else
        or      T0_8, [NAME(g_afParity) + %6]
 %endif

        ;mov     %1, T0_32                                      ; save the result.
        ; ASSUMES T0 = eax!
%endmacro
513
;;
; Returns the EFLAGS value %1 in T0 with fixed bits cleared (%2) and set (%3).
; The CPU EFLAGS are not consulted.
;
; @remarks Clobbers/returns T0.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS_RETVAL 3
        mov     T0_32, %1                           ; Load flags. ASSUMES T0 is EAX!
 %if (%2) != 0
        and     T0_32, ~(%2)                        ; strip the always-clear bits
 %endif
 %if (%3) != 0
        or      T0_32, %3                           ; force the always-set bits
 %endif
%endmacro
533
;;
; Updates the EFLAGS value in %1 using fixed clear and set masks plus a PF
; value looked up in g_afParity.
;
; @remarks Clobbers T0, %4, EFLAGS, and on AMD64 also T2.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, %1                           ; incoming flags
        and     T0_32, ~(%2 | X86_EFL_PF)           ; drop PF and the always-clear bits
 %if (%3) != 0
        or      T0_32, %3                           ; force the always-set bits
 %endif
        and     %4, 0xff                            ; table index = low byte
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     %1, T0_32                           ; write back
%endmacro
558
559
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564;;;; OLD EFLAGS macros.
565
;;
; Loads guest status flags from the EFLAGS dword at [%1] into the CPU EFLAGS,
; but (in the default build) only when there are undefined (%3) or must-load
; (%4) flags; otherwise nothing is emitted.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Merge via the stack: host flags with the selected bits replaced by the guest's.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, [%1]                                     ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop the bits we take from the guest
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only those bits of the guest value
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags

 %elif (%3 + %4) != 0
  %if 1 ; This approach seems faster on intel 10980XE
   %if (%3 | %4) == X86_EFL_CF
        ; Only CF is needed: BT copies the bit straight into CF.
        bt      dword [%1], X86_EFL_CF_BIT
   %else
        ; ADD produces OF, SAHF loads the remaining status flags.  ASSUMES T0_32 is eax!
        mov     eax, [%1]
    %if (%3 | %4) == X86_EFL_OF
        ; Only OF is needed: move it to bit 31; adding 80000000h overflows iff it is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
    %elif ((%3 | %4) & X86_EFL_OF) != 0
        ; OF plus others: low flag byte goes to AH for SAHF, OF to bit 7 of AL for the ADD.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        sahf                                                    ; the other status flags
    %else
        ; OF not needed: get the low flag byte into AH and load it.
     %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
     %else
        xchg    al, ah
     %endif
        sahf
    %endif
   %endif

  %else
        ; Alternative (disabled): merge via the stack.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, [%1]                                     ; guest flags
        and     dword [xSP], ~(%2 | %3)                         ; drop modified + undefined bits
        and     T0_32, (%2 | %3)                                ; keep modified + undefined guest bits
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags
  %endif
 %endif
%endmacro
623
;;
; Loads guest status flags from the EFLAGS dword at [%1] into the CPU EFLAGS
; (unconditional variant of IEM_MAYBE_LOAD_FLAGS_OLD).
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS_OLD 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        ; Merge via the stack: host flags with the selected bits replaced by the guest's.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, [%1]                                     ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop the bits we take from the guest
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only those bits of the guest value
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags

 %elif 1 ; This approach seems faster on intel 10980XE
  %if (%3 | %2) == X86_EFL_CF
        ; Only CF is needed: BT copies the bit straight into CF.
        bt      dword [%1], X86_EFL_CF_BIT
  %else
        mov     eax, [%1]                                       ; ASSUMES T0_32 is eax!!
   %if (%3 | %2) == X86_EFL_OF
        ; Only OF is needed: move it to bit 31; adding 80000000h overflows iff it is set.
        shl     eax, 31 - X86_EFL_OF_BIT
        add     eax, 80000000h
   %elif ((%3 | %2) & X86_EFL_OF) != 0
        ; OF plus others: low flag byte goes to AH for SAHF, OF to bit 7 of AL for the ADD.
        xchg    al, ah
        shl     al, 15 - X86_EFL_OF_BIT
        add     al, 80h
        sahf                                                    ; the other status flags
   %else
        ; OF not needed: get the low flag byte into AH and load it.
    %if 1 ; Pretty similar on 10980XE, but shl seems faster on average.
        shl     eax, 8
    %else
        xchg    al, ah
    %endif
        sahf
   %endif
  %endif ; (%3 | %2) != X86_EFL_CF

 %else
        ; Alternative (disabled): merge via the stack.
        pushf                                                   ; host flags -> [xSP]
        mov     T0_32, [%1]                                     ; guest flags
        and     dword [xSP], ~(%2 | %3)                         ; drop the bits we take from the guest
        and     T0_32, (%2 | %3)                                ; keep only those bits of the guest value
        or      [xSP], T0                                       ; combine
        popf                                                    ; activate the mixed flags
 %endif
%endmacro
677
;;
; Update the EFLAGS dword at [%1] from the CPU EFLAGS.
;
; The modified (%2) and undefined (%3) flags are taken from the CPU EFLAGS,
; the zeroed flags (%4) are cleared, and all other bits of [%1] are kept.
; Emits nothing when all three masks are zero.
;
; The jump target is a macro-local label (%%), so the macro can be expanded
; more than once under the same global label.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
%macro IEM_SAVE_FLAGS_OLD 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, [%1]                                     ; flags
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS)    ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8                                            ; CF is bit 0, no shift needed; upper bits masked below
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                                            ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        ; All wanted flags live in the low byte: LAHF is enough.
        lahf
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
   %elif 1 ; The locked functions are generally faster on 10980XE with this approach
        lahf                                                    ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
        jno     %%of_is_clear                                   ; LAHF does not capture OF, add it by hand
        or      eax, X86_EFL_OF
%%of_is_clear:
   %else
        pushf                                                   ; this is a bit slow
        pop     T0
   %endif
        and     T1_32, ~(%2 | %3 | %4)                          ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
  %endif
        or      T0_32, T1_32                                    ; combine the flags.
        mov     [%1], T0_32                                     ; save the flags.
 %endif
%endmacro
724
;;
; Computes new EFLAGS from the CPU EFLAGS plus fixed clear and set masks and
; stores the result to the dword at [%1].  Emits nothing when all three masks
; are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                                  ; T1 = CPU flags
        mov     T0_32, [%1]                         ; T0 = stored flags
        and     T0_32, ~(%2 | %3)                   ; drop modified + always-clear bits
        and     T1_32, (%2)                         ; keep only the modified bits from the CPU
        or      T0_32, T1_32                        ; merge
  %if (%4) != 0
        or      T0_32, %4                           ; force the always-set bits
  %endif
        mov     [%1], T0_32                         ; write back
 %endif
%endmacro
748
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (modified mask %2), a clear
; mask (%3), signed input (%4[%5]) and parity index (%6), and stores the result
; to the dword at [%1].
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; On x86 no T1_8 is defined (T1 is edi), so the parity byte is zero-extended
; into %6 and then ORed into T1_32 instead of using a byte-sized OR.
;
; @remarks Clobbers T1, stack, %6, EFLAGS, and on AMD64 also T2.  On x86 T0 is
;          saved and restored around its use; everywhere T0 is modified when
;          %6 is xAX.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                                              ; T2 = CPU flags
 %else
        push    T0                                              ; keep the result in xAX safe
        pushf
        pop     T0                                              ; T0 = CPU flags
 %endif
        mov     T1_32, [%1]                                     ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)     ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                                     ; select the modified flags.
        or      T1_32, T2_32                                    ; combine the flags.
 %else
        and     T0_32, (%2)                                     ; select the modified flags.
        or      T1_32, T0_32                                    ; combine the flags.
        pop     T0                                              ; restore the result
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1                                      ; top bit of the result -> CF
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
%%sf_clear:

        ; Parity last.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        movzx   %6, byte [NAME(g_afParity) + %6]                ; %6 = parity table entry (no T1_8 on x86)
        or      T1_32, %6
 %endif

        mov     [%1], T1_32                                     ; save the result.
%endmacro
802
;;
; Clears (%2) and sets (%3) fixed bits in the EFLAGS dword at [%1].  The CPU
; EFLAGS are not consulted; nothing is emitted when both masks are zero.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS_OLD 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                         ; stored flags
  %if (%2) != 0
        and     T0_32, ~(%2)                        ; strip the always-clear bits
  %endif
  %if (%3) != 0
        or      T0_32, %3                           ; force the always-set bits
  %endif
        mov     [%1], T0_32                         ; write back
 %endif
%endmacro
823
;;
; Updates the EFLAGS dword at [%1] using fixed clear and set masks plus a PF
; value looked up in g_afParity.
;
; @remarks Clobbers T0, %4, EFLAGS, and on AMD64 also T2.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
        mov     T0_32, [%1]                         ; stored flags
        and     T0_32, ~(%2 | X86_EFL_PF)           ; drop PF and the always-clear bits
 %if (%3) != 0
        or      T0_32, %3                           ; force the always-set bits
 %endif
        and     %4, 0xff                            ; table index = low byte
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                         ; write back
%endmacro
848
849
850
;;
; Loads register with offset of imm8 instruction -- used by all of the instruction
; implementations which lay out jump tables of 256x immediate byte variants.
; Also checks that the instruction size matches the offsets in the table.
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`, built
; from LEA instructions for entry sizes 5 through 12; any other size is a
; build error.
;
; @remarks Clobbers T0 for every entry size except 8.
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
; @note Implicitly uses local symbols .imm0, .imm1, and .immEnd
;       (implementation artifacts of each instruction jump table).
;
%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
        lea     %1, [.imm0 xWrtRIP]                 ; table base
 %if %3 == 5
        lea     T0, [%2 + %2*4]                     ; idx * 5
        lea     %1, [%1 + T0]                       ; .imm0 + idx * 5
 %elif %3 == 6
        lea     T0, [%2 + %2*2]                     ; idx * 3
        lea     %1, [%1 + T0*2]                     ; .imm0 + idx * 6
 %elif %3 == 7
        lea     T0, [%2 + %2*2]                     ; idx * 3
        lea     T0, [T0 + %2*4]                     ; idx * 7
        lea     %1, [%1 + T0]                       ; .imm0 + idx * 7
 %elif %3 == 8
        lea     %1, [%1 + %2*8]                     ; .imm0 + idx * 8
 %elif %3 == 9
        lea     T0, [%2 + %2*8]                     ; idx * 9
        lea     %1, [%1 + T0]                       ; .imm0 + idx * 9
 %elif %3 == 10
        lea     T0, [%2 + %2*4]                     ; idx * 5
        lea     %1, [%1 + T0*2]                     ; .imm0 + idx * 10
 %elif %3 == 11
        lea     T0, [%2 + %2*4]                     ; idx * 5
        lea     T0, [%2 + T0*2]                     ; idx * 11
        lea     %1, [%1 + T0]                       ; .imm0 + idx * 11
 %elif %3 == 12
        lea     T0, [%2 + %2*2]                     ; idx * 3
        lea     %1, [%1 + T0*4]                     ; .imm0 + idx * 12
 %else
  %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
 %endif
        ; Build-time checks.  Each 'times' count is zero when the table is laid
        ; out as expected, so no bytes are emitted; otherwise the oversized db
        ; value triggers 'warning: value does not fit in 8 bit field'.
        ; check size:
        times (.imm1 - .imm0 + %3) %% %3 db 999 * (.imm1 - .imm0 + %3)
        ; check alignment:
        times ((.immEnd - .imm0) - 256 * %3) db 999 * ((.immEnd - .imm0) - 256 * %3)
%endmacro
901
;;
; Front end for IEMIMPL_JUMP_TABLE_TARGET_INT.  Adds 4 bytes to the table entry
; size when RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK is defined
; (presumably room for an ENDBR instruction per entry -- confirm against the
; table layout).
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
%macro IEMIMPL_JUMP_TABLE_TARGET 3
 %ifndef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
 %else
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
 %endif
%endmacro
909
910
;;
; Calls the given imm8 instruction -- used by all of the instruction
; implementations which lay out jump tables of 256x immediate byte variants.
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`
; followed by `IBT_NOTRACK` and `call %1`.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
        IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3        ; %1 = address of the table entry
        IBT_NOTRACK
        call    %1
%endmacro
927
928
929;*********************************************************************************************************************************
930;* External Symbols *
931;*********************************************************************************************************************************
932extern NAME(g_afParity)
933
934
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the generated functions take the incoming EFLAGS value in A0 (32-bit),
; a pointer to the destination memory operand in A1 and the source register
; operand in A2.  The updated EFLAGS are left in EAX by IEM_SAVE_FLAGS_RETVAL.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
; @param 5 The flags that must be loaded (ADC, SBC).
; @param 6 The flags that will be zeroed by the operation.
;
%macro IEMIMPL_BIN_OP 6
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?
;
; LOCK prefixed variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
1025
; Instantiate the binary operators.  add/adc/sub/sbb/or/xor/and also get locked
; variants; cmp and test do not.  adc and sbb need CF loaded as input.
;
;              instr, lock, modified-flags,                                                                           undefined,  must-load,  zeroed flags
IEMIMPL_BIN_OP add,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),             0,          0,          0
IEMIMPL_BIN_OP adc,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),             0,          X86_EFL_CF, 0
IEMIMPL_BIN_OP sub,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),             0,          0,          0
IEMIMPL_BIN_OP sbb,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),             0,          X86_EFL_CF, 0
IEMIMPL_BIN_OP cmp,   0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF),             0,          0,          0
IEMIMPL_BIN_OP or,    1,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                                   X86_EFL_AF, 0,          (X86_EFL_OF | X86_EFL_CF)
IEMIMPL_BIN_OP xor,   1,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                                   X86_EFL_AF, 0,          (X86_EFL_OF | X86_EFL_CF)
IEMIMPL_BIN_OP and,   1,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                                   X86_EFL_AF, 0,          (X86_EFL_OF | X86_EFL_CF)
IEMIMPL_BIN_OP test,  0,    (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF),                                                   X86_EFL_AF, 0,          (X86_EFL_OF | X86_EFL_CF)
1036
1037
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; Each worker runs the instruction with T0 as destination, stores T0 through
; A0 and then hands A3 to IEM_SAVE_FLAGS_OLD (presumably updating the eflags
; value behind that pointer; the macro is defined elsewhere in this file).
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 The zeroed flags.
;
%macro IEMIMPL_VEX_BIN_OP 4
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
 PROLOGUE_4_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
 ; Three-operand form: result in T0, A1/A2 are only read.
 %1 T0_32, A1_32, A2_32
 ; The store sits between the instruction and the flag save; mov does not alter flags.
 mov [A0], T0_32
 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
 EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit operand size variant, only emitted for 64-bit hosts.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
 PROLOGUE_4_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0
 %1 T0, A1, A2
 mov [A0], T0
 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
 EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags, zeroed-flags
IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
1079
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take the incoming EFLAGS in A0, a pointer to the
; destination operand in A1 and the source register operand in A2.  Updated
; EFLAGS are returned in EAX.
;
; These are three-argument workers (12 bytes of fastcall arguments), so they
; use the three-argument prologue/epilogue pair like every other A0/A1/A2
; worker in this file.
; NOTE(review): the four-argument pair used previously is sized for an A3
; argument these workers do not have; on a 32-bit host that presumably makes
; the epilogue pop the wrong number of stack bytes - confirm against the
; EPILOGUE_4_ARGS definition.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 The zeroed flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 4
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
        mov     T0_32, [A1]
        %1      T0_32, A2_32
        mov     [A1], T0_32             ; mov does not alter the flags produced by %1
        IEM_SAVE_FLAGS_RETVAL   A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit operand size variant, only emitted for 64-bit hosts.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %2, %3, 0
        mov     T0, [A1]
        %1      T0, A2
        mov     [A1], T0
        IEM_SAVE_FLAGS_RETVAL   A0_32, %2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags zeroed-flags
IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1123
1124
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  No eflags are passed in or returned.
;
; When parameter 3 is non-zero, _fallback workers are emitted as well.  They
; use the instruction given as parameter 2 with the count in CL, so the count
; (A2) has to be moved into CL first:
;   - ASM_CALL64_GCC: CL is loaded directly, the destination pointer stays in A0.
;   - otherwise:      A0 and A2 are exchanged (the code relies on A0 being xCX
;                     there), so the destination pointer ends up in A2.
; The result must therefore be stored through A0 in the first case and through
; A2 in the second, at full operand width.
;
; @param 1 The instruction mnemonic.
; @param 2 Fallback instruction if applicable.
; @param 3 Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> CL
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; count -> xCX, destination pointer -> A2
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> CL
        %2      A1, cl
        mov     [A0], A1                ; full 64-bit result
   %else
        xchg    A2, A0                  ; count -> xCX, destination pointer -> A2
        %2      A1, cl
        mov     [A2], A1                ; full 64-bit result; A0 holds the count here, not a pointer
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
1196
1197
;
; RORX uses an immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; Arguments: A0 = pointer to the destination, A1 = value to rotate,
; A2 = rotate count.  The count has to be in CL for the ROR instruction.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
 PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
 ; Count goes straight into CL; the destination pointer stays in A0.
 mov cl, A2_8
 ror A1_32, cl
 mov [A0], A1_32
 %else
 ; Exchange A0 and A2 so the count is in CL (the code relies on A0 being
 ; xCX here); the destination pointer is in A2 afterwards.
 xchg A2, A0
 ror A1_32, cl
 mov [A2], A1_32
 %endif
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
1215
1216 %ifdef RT_ARCH_AMD64
; 64-bit RORX fallback: A0 = pointer to the destination, A1 = value to rotate,
; A2 = rotate count (moved into CL for the ROR instruction).
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
 PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
 ; Count goes straight into CL; the destination pointer stays in A0.
 mov cl, A2_8
 ror A1, cl
 mov [A0], A1
 %else
 ; Exchange A0 and A2 so the count is in CL; the destination pointer is in
 ; A2 afterwards.
 xchg A2, A0
 ror A1, cl
 mov [A2], A1
 %endif
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
1230 %endif ; RT_ARCH_AMD64
1231
1232
;
; MULX
;
; Arguments: A0 = pointer receiving the high half, A1 = pointer receiving the
; low half, A2 = first source (must be in EDX, the implicit MULX operand),
; A3 = second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
 PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
 ; A2_32 is EDX - perfect
 mulx T0_32, T1_32, A3_32
 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
 mov [A0], T0_32
%else
 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
 ; After the exchange the low-half pointer lives in A2.
 xchg A1, A2
 mulx T0_32, T1_32, A3_32
 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
 mov [A0], T0_32
%endif
 EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
1252
1253
;
; MULX fallback using the legacy MUL instruction (EDX:EAX = EAX * operand).
;
; Arguments: A0 = pointer receiving the high half, A1 = pointer receiving the
; low half, A2 = first source (uSrc1), A3 = second source (uSrc2).
;
; MUL overwrites EDX with the high half, so whichever argument register is
; xDX must hold a plain value (uSrc1) rather than a pointer when it executes.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32                   ; uSrc1 is in A1 (EDX) now; A2 holds the low-half pointer and must not be the multiplier
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
1272
1273%ifdef RT_ARCH_AMD64
; 64-bit MULX: A0 = pointer receiving the high half, A1 = pointer receiving the
; low half, A2 = first source (must be in RDX, the implicit MULX operand),
; A3 = second source.
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
 PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
 ; A2 is RDX - perfect
 mulx T0, T1, A3
 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
 mov [A0], T0
%else
 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
 ; After the exchange the low-half pointer lives in A2.
 xchg A1, A2
 mulx T0, T1, A3
 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
 mov [A0], T0
%endif
 EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64
1290
1291
;
; 64-bit MULX fallback using the legacy MUL instruction (RDX:RAX = RAX * operand).
;
; Arguments: A0 = pointer receiving the high half, A1 = pointer receiving the
; low half, A2 = first source (uSrc1), A3 = second source (uSrc2).
;
; MUL overwrites RDX with the high half, so whichever argument register is
; xDX must hold a plain value (uSrc1) rather than a pointer when it executes.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A1                      ; uSrc1 is in A1 (RDX) now; A2 holds the low-half pointer and must not be the multiplier
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback
1310
1311%endif
1312
1313
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A1,
; the source register operand in A2 and incoming eflags in A0.
;
; Every worker has the same three-step shape: IEM_MAYBE_LOAD_FLAGS on A0_32,
; the instruction on [A1] with A2 as bit offset, then IEM_SAVE_FLAGS_RETVAL.
; No flags are passed as "zeroed" (last macro argument is always 0).
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
; 16-bit, no lock prefix.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
 %1 word [A1], A2_16
 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit, no lock prefix.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
 %1 dword [A1], A2_32
 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit, no lock prefix (64-bit hosts only).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
 %1 qword [A1], A2
 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
 EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

; Same as above, but with a lock prefix on the instruction.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
 lock %1 word [A1], A2_16
 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
 lock %1 dword [A1], A2_32
 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
 lock %1 qword [A1], A2
 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
 EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; Undefined flags are passed thru here by the intel and amd CPUs we have.
; bt only reads its operand, so it is instantiated without locked variants.
; modified efl, undefined eflags
IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1393
1394;;
1395; Macro for implementing a bit search operator.
1396;
1397; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1398; system where the 64-bit accesses requires hand coding.
1399;
1400; All the functions takes a pointer to the destination memory operand in A1,
1401; the source register operand in A2 and the incoming eflags in A0.
1402;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leave it as is. The undefined EFLAGS differ between
; AMD and Intel and, according to https://www.sandpile.org/x86/flags.htm, between
; Intel microarchitectures. We only implement the 'intel' and 'amd' variations
; with the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1408;
1409; Intel: Clear all and calculate PF in addition to ZF.
1410; AMD: Passthru all flags other than ZF.
1411;
1412; @param 1 The instruction mnemonic.
1413; @param 2 The modified flags.
1414; @param 3 The undefined flags.
1415; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1416;
1417%macro IEMIMPL_BIT_OP2 4
1418BEGINCODE
1419; 16-bit
1420
; 16-bit worker: A0 = incoming eflags, A1 = pointer to the destination,
; A2 = source operand.  The result is produced in T0 and stored through A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
 %1 T0_16, A2_16
%if %4 != 0
 ; Macro parameter 4 set: skip the store when the instruction left ZF=1.
 jz .unchanged_dst
%endif
 mov [A1], T0_16
.unchanged_dst:
 ; Both paths join here; neither jz nor mov alters the flags from %1.
 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16
1433
1434;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
1435;bad; PROLOGUE_3_ARGS
1436;bad; %1 T1_16, A1_16
1437;bad; jz .unchanged_dst
1438;bad; mov [A0], T1_16
1439;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1440;bad; EPILOGUE_3_ARGS
1441;bad;.unchanged_dst:
1442;bad;%if %4 != 0
1443;bad; mov [A0], T1_16
1444;bad;%endif
1445;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1446;bad; EPILOGUE_3_ARGS
1447;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
1448;bad;
1449;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
1450;bad; PROLOGUE_3_ARGS
1451;bad; %1 T0_16, A1_16
1452;bad;%if %4 != 0
1453;bad; jz .unchanged_dst
1454;bad;%endif
1455;bad; mov [A0], T0_16
1456;bad;.unchanged_dst:
1457;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1458;bad; EPILOGUE_3_ARGS
1459;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1460
1461; 32-bit
1462
; 32-bit worker: A0 = incoming eflags, A1 = pointer to the destination,
; A2 = source operand.  The result is produced in T0 and stored through A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
 %1 T0_32, A2_32
%if %4 != 0
 ; Macro parameter 4 set: skip the store when the instruction left ZF=1.
 jz .unchanged_dst
%endif
 mov [A1], T0_32
.unchanged_dst:
 ; Both paths join here; neither jz nor mov alters the flags from %1.
 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
1475
1476;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1477;bad; PROLOGUE_3_ARGS
1478;bad; %1 T1_32, A1_32
1479;bad;%if %4 != 0
1480;bad; jz .unchanged_dst
1481;bad;%endif
1482;bad; mov [A0], T1_32
1483;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1484;bad; EPILOGUE_3_ARGS
1485;bad;.unchanged_dst:
1486;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1487;bad; EPILOGUE_3_ARGS
1488;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1489;bad;
1490;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1491;bad; PROLOGUE_3_ARGS
1492;bad; %1 T0_32, A1_32
1493;bad;%if %4 != 0
1494;bad; jz .unchanged_dst
1495;bad;%endif
1496;bad; mov [A0], T0_32
1497;bad;.unchanged_dst:
1498;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1499;bad; EPILOGUE_3_ARGS
1500;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1501
1502
1503 %ifdef RT_ARCH_AMD64
1504; 64-bit
1505
; 64-bit worker: A0 = incoming eflags, A1 = pointer to the destination,
; A2 = source operand.  The result is produced in T0 and stored through A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
 %1 T0, A2
%if %4 != 0
 ; Macro parameter 4 set: skip the store when the instruction left ZF=1.
 jz .unchanged_dst
%endif
 mov [A1], T0
.unchanged_dst:
 ; Both paths join here; neither jz nor mov alters the flags from %1.
 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
 EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
1518
1519;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1520;bad; PROLOGUE_3_ARGS
1521;bad; %1 T1, A1
1522;bad;%if %4 != 0
1523;bad; jz .unchanged_dst
1524;bad;%endif
1525;bad; mov [A0], T1
1526;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1527;bad; EPILOGUE_3_ARGS
1528;bad;.unchanged_dst:
1529;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1530;bad; EPILOGUE_3_ARGS
1531;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1532;bad;
1533;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1534;bad; PROLOGUE_3_ARGS
1535;bad; %1 T0, A1
1536;bad;%if %4 != 0
1537;bad; jz .unchanged_dst
1538;bad;%endif
1539;bad; mov [A0], T0
1540;bad;.unchanged_dst:
1541;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1542;bad; EPILOGUE_3_ARGS_EX 8
1543;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1544
1545 %endif ; RT_ARCH_AMD64
1546%endmacro
1547
1548IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1549IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1550IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1551IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1552
1553
;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A1,
; the source register operand in A2 and eflags in A0.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
; (Hence the result is computed in T0 and stored through A1 afterwards.)
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 The zeroed flags.
;
%macro IEMIMPL_BIT_OP3 4
BEGINCODE
; 16-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
 %1 T0_16, A2_16
 mov [A1], T0_16
 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
 %1 T0_32, A2_32
 mov [A1], T0_32
 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit worker (64-bit hosts only).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
 %1 T0, A2
 mov [A1], T0
 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
 EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
; Only ZF is taken from the instruction; the other five status flags are
; passed as "zeroed".
IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1604
1605
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Arguments: A0 = incoming eflags, A1 = pointer to the destination operand
; (read as multiplicand, then overwritten with the product), A2 = the other
; factor.
;
BEGINCODE
; @param 1 EFLAGS that are modified.
; @param 2 Undefined EFLAGS.
; @param 3 Function suffix.
; @param 4 EFLAGS variation: 0 for native, 1 for intel, 2 for AMD.
; Only variation 1 is special-cased below; 0 and 2 both take the
; plain IEM_SAVE_FLAGS_RETVAL path.
; NOTE(review): this used to read "2 for AMD (set AF, clear PF, ZF
; and SF)", which matches neither the code nor the AMD remark
; after the macro - confirm and drop this note.
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
 imul A2_16, word [A1]
 mov [A1], A2_16
 %if %4 != 1
 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
 %else
 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
 %endif
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
 imul A2_32, dword [A1]
 mov [A1], A2_32
 %if %4 != 1
 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
 %else
 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
 %endif
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
 imul A2, qword [A1]
 mov [A1], A2
 %if %4 != 1
 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
 %else
 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
 %endif
 EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
; clear AF and ZF and calculates SF and PF as per the lower half of the result.
; The three instantiations produce the unsuffixed (native), _intel and _amd
; workers respectively.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1664
1665
1666;
1667; XCHG for memory operands. This implies locking. No flag changes.
1668;
1669; Each function takes two arguments, first the pointer to the memory,
1670; then the pointer to the register. They all return void.
1671;
1672BEGINCODE
; Swaps the byte at [A0] (memory operand) with the byte at [A1] (register value).
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
 PROLOGUE_2_ARGS
 mov T0_8, [A1]
 xchg [A0], T0_8 ; xchg with a memory operand implies locking (see section header)
 mov [A1], T0_8 ; hand the previous memory content back
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1680
; Swaps the word at [A0] (memory operand) with the word at [A1] (register value).
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
 PROLOGUE_2_ARGS
 mov T0_16, [A1]
 xchg [A0], T0_16 ; xchg with a memory operand implies locking (see section header)
 mov [A1], T0_16 ; hand the previous memory content back
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1688
; Swaps the dword at [A0] (memory operand) with the dword at [A1] (register value).
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
 PROLOGUE_2_ARGS
 mov T0_32, [A1]
 xchg [A0], T0_32 ; xchg with a memory operand implies locking (see section header)
 mov [A1], T0_32 ; hand the previous memory content back
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1696
1697%ifdef RT_ARCH_AMD64
; Swaps the qword at [A0] (memory operand) with the qword at [A1] (register value).
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
 PROLOGUE_2_ARGS
 mov T0, [A1]
 xchg [A0], T0 ; xchg with a memory operand implies locking (see section header)
 mov [A1], T0 ; hand the previous memory content back
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
1705%endif
1706
1707; Unlocked variants for fDisregardLock mode.
1708
; Non-atomic byte swap between [A0] and [A1]: both values are loaded first,
; then stored crosswise with plain moves (no xchg, hence no implied lock).
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
 PROLOGUE_2_ARGS
 mov T0_8, [A1]
 mov T1_8, [A0]
 mov [A0], T0_8
 mov [A1], T1_8
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1717
; Non-atomic word swap between [A0] and [A1]: both values are loaded first,
; then stored crosswise with plain moves (no xchg, hence no implied lock).
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
 PROLOGUE_2_ARGS
 mov T0_16, [A1]
 mov T1_16, [A0]
 mov [A0], T0_16
 mov [A1], T1_16
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1726
; Non-atomic dword swap between [A0] and [A1]: both values are loaded first,
; then stored crosswise with plain moves (no xchg, hence no implied lock).
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
 PROLOGUE_2_ARGS
 mov T0_32, [A1]
 mov T1_32, [A0]
 mov [A0], T0_32
 mov [A1], T1_32
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1735
1736%ifdef RT_ARCH_AMD64
; Non-atomic qword swap between [A0] and [A1]: both values are loaded first,
; then stored crosswise with plain moves (no xchg, hence no implied lock).
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
 PROLOGUE_2_ARGS
 mov T0, [A1]
 mov T1, [A0]
 mov [A0], T0
 mov [A1], T1
 EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
1745%endif
1746
1747
1748;
1749; XADD for memory operands.
1750;
1751; Each function takes three arguments, first the pointer to the
1752; memory/register, then the pointer to the register, and finally a pointer to
1753; eflags. They all return void.
1754;
1755BEGINCODE
; 8-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0_8, [A1]
 xadd [A0], T0_8 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0_8 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1765
; 16-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0_16, [A1]
 xadd [A0], T0_16 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0_16 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1775
; 32-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0_32, [A1]
 xadd [A0], T0_32 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0_32 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1785
1786%ifdef RT_ARCH_AMD64
; 64-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0, [A1]
 xadd [A0], T0 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
1796%endif ; RT_ARCH_AMD64
1797
; 8-bit XADD with lock prefix: A0 = pointer to the memory operand,
; A1 = pointer to the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0_8, [A1]
 lock xadd [A0], T0_8 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0_8 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1807
; 16-bit XADD with lock prefix: A0 = pointer to the memory operand,
; A1 = pointer to the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0_16, [A1]
 lock xadd [A0], T0_16 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0_16 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1817
; 32-bit XADD with lock prefix: A0 = pointer to the memory operand,
; A1 = pointer to the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0_32, [A1]
 lock xadd [A0], T0_32 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0_32 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1827
1828%ifdef RT_ARCH_AMD64
; 64-bit XADD with lock prefix: A0 = pointer to the memory operand,
; A1 = pointer to the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
 PROLOGUE_3_ARGS
 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 mov T0, [A1]
 lock xadd [A0], T0 ; [A0] receives the sum, T0 the previous content of [A0]
 mov [A1], T0 ; previous destination value goes to the register operand
 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
 EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
1838%endif ; RT_ARCH_AMD64
1839
1840
1841;
1842; CMPXCHG8B.
1843;
1844; These are tricky register wise, so the code is duplicated for each calling
1845; convention.
1846;
1847; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1848;
1849; C-proto:
1850; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1851; uint32_t *pEFlags));
1852;
1853; Note! Identical to iemAImpl_cmpxchg16b.
1854;
1855BEGINCODE
; CMPXCHG8B without lock prefix.
;
; The instruction has fixed operands (EDX:EAX = comparand / old value out,
; ECX:EBX = replacement), which collide with the argument registers of every
; calling convention, so each convention first moves its arguments aside:
;   - ASM_CALL64_MSC:  rcx=pu64Dst, rdx=pu64EaxEdx, r8=pu64EbxEcx, r9=pEFlags
;   - other 64-bit:    rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags
;   - 32-bit:          ecx=pu64Dst, edx=pu64EaxEdx, stack: pu64EbxEcx, pEFlags
; xBX is pushed and popped around the body in all three variants.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
 push rbx

 ; Free up rcx/rdx for the instruction operands.
 mov r11, rdx ; pu64EaxEdx (is also T1)
 mov r10, rcx ; pu64Dst

 mov ebx, [r8]
 mov ecx, [r8 + 4]
 ; Flags are loaded before EAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov eax, [r11]
 mov edx, [r11 + 4]

 cmpxchg8b [r10]

 ; EDX:EAX is written back before the flag save, which clobbers eax and r11.
 mov [r11], eax
 mov [r11 + 4], edx
 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

 pop rbx
 ret
 %else
 push rbx

 ; Free up rcx/rdx for the instruction operands.
 mov r10, rcx ; pEFlags
 mov r11, rdx ; pu64EbxEcx (is also T1)

 mov ebx, [r11]
 mov ecx, [r11 + 4]
 ; Flags are loaded before EAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov eax, [rsi]
 mov edx, [rsi + 4]

 cmpxchg8b [rdi]

 mov [rsi], eax
 mov [rsi + 4], edx
 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

 pop rbx
 ret

 %endif
%else
 push esi
 push edi
 push ebx
 push ebp

 mov edi, ecx ; pu64Dst
 mov esi, edx ; pu64EaxEdx
 ; Stack arguments sit above the four pushes (16 bytes) and the return address (4 bytes).
 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
 mov ebp, [esp + 16 + 4 + 4] ; pEFlags

 mov ebx, [ecx]
 mov ecx, [ecx + 4]
 ; Flags are loaded before EAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov eax, [esi]
 mov edx, [esi + 4]

 cmpxchg8b [edi]

 mov [esi], eax
 mov [esi + 4], edx
 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

 pop ebp
 pop ebx
 pop edi
 pop esi
 ; Pops the two stack arguments (8 bytes).
 ret 8
%endif
ENDPROC iemAImpl_cmpxchg8b
1930
; CMPXCHG8B with lock prefix.  Same argument shuffling as iemAImpl_cmpxchg8b;
; the only difference in the body is the lock prefix on the instruction.
;   - ASM_CALL64_MSC:  rcx=pu64Dst, rdx=pu64EaxEdx, r8=pu64EbxEcx, r9=pEFlags
;   - other 64-bit:    rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags
;   - 32-bit:          ecx=pu64Dst, edx=pu64EaxEdx, stack: pu64EbxEcx, pEFlags
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
 push rbx

 ; Free up rcx/rdx for the instruction operands.
 mov r11, rdx ; pu64EaxEdx (is also T1)
 mov r10, rcx ; pu64Dst

 mov ebx, [r8]
 mov ecx, [r8 + 4]
 ; Flags are loaded before EAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov eax, [r11]
 mov edx, [r11 + 4]

 lock cmpxchg8b [r10]

 ; EDX:EAX is written back before the flag save, which clobbers eax and r11.
 mov [r11], eax
 mov [r11 + 4], edx
 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

 pop rbx
 ret
 %else
 push rbx

 ; Free up rcx/rdx for the instruction operands.
 mov r10, rcx ; pEFlags
 mov r11, rdx ; pu64EbxEcx (is also T1)

 mov ebx, [r11]
 mov ecx, [r11 + 4]
 ; Flags are loaded before EAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov eax, [rsi]
 mov edx, [rsi + 4]

 lock cmpxchg8b [rdi]

 mov [rsi], eax
 mov [rsi + 4], edx
 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

 pop rbx
 ret

 %endif
%else
 push esi
 push edi
 push ebx
 push ebp

 mov edi, ecx ; pu64Dst
 mov esi, edx ; pu64EaxEdx
 ; Stack arguments sit above the four pushes (16 bytes) and the return address (4 bytes).
 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
 mov ebp, [esp + 16 + 4 + 4] ; pEFlags

 mov ebx, [ecx]
 mov ecx, [ecx + 4]
 ; Flags are loaded before EAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov eax, [esi]
 mov edx, [esi + 4]

 lock cmpxchg8b [edi]

 mov [esi], eax
 mov [esi + 4], edx
 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)

 pop ebp
 pop ebx
 pop edi
 pop esi
 ; Pops the two stack arguments (8 bytes).
 ret 8
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
2005
2006%ifdef RT_ARCH_AMD64
2007
2008;
2009; CMPXCHG16B.
2010;
2011; These are tricky register wise, so the code is duplicated for each calling
2012; convention.
2013;
2014; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
2015;
2016; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
; uint32_t *pEFlags));
2019;
2020; Note! Identical to iemAImpl_cmpxchg8b.
2021;
2022BEGINCODE
; CMPXCHG16B without lock prefix (64-bit hosts only).
;
; The instruction has fixed operands (RDX:RAX = comparand / old value out,
; RCX:RBX = replacement), so the arguments are moved aside first:
;   - ASM_CALL64_MSC:  rcx=destination, rdx=RAX:RDX pair, r8=RBX:RCX pair, r9=pEFlags
;   - otherwise:       rdi=destination, rsi=RAX:RDX pair, rdx=RBX:RCX pair, rcx=pEFlags
; rbx is pushed and popped around the body.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
 push rbx

 ; Free up rcx/rdx for the instruction operands.
 mov r11, rdx ; pu64RaxRdx (is also T1)
 mov r10, rcx ; pu64Dst

 mov rbx, [r8]
 mov rcx, [r8 + 8]
 ; Flags are loaded before RAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov rax, [r11]
 mov rdx, [r11 + 8]

 cmpxchg16b [r10]

 ; RDX:RAX is written back before the flag save, which clobbers eax and r11.
 mov [r11], rax
 mov [r11 + 8], rdx
 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

 pop rbx
 ret
 %else
 push rbx

 ; Free up rcx/rdx for the instruction operands.
 mov r10, rcx ; pEFlags
 mov r11, rdx ; pu64RbxRcx (is also T1)

 mov rbx, [r11]
 mov rcx, [r11 + 8]
 ; Flags are loaded before RAX is set up because the macro clobbers it.
 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
 mov rax, [rsi]
 mov rdx, [rsi + 8]

 cmpxchg16b [rdi]

 mov [rsi], rax
 mov [rsi + 8], rdx
 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

 pop rbx
 ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
2067
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; Register use in this variant: rcx = pu128Dst, rdx = pu128RaxRdx,
        ; r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx                             ; rbx receives the low half of the replacement value.

        mov     r10, rcx                        ; r10 = pu128Dst (rcx is an instruction operand)
        mov     r11, rdx                        ; r11 = pu128RaxRdx (is also T1; rdx is an instruction operand)

        mov     rcx, [r8 + 8]                   ; rcx:rbx = replacement value
        mov     rbx, [r8]
        IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rdx, [r11 + 8]                  ; rdx:rax = comparand; loaded last as eax was just clobbered
        mov     rax, [r11]

        lock cmpxchg16b [r10]

        mov     [r11 + 8], rdx                  ; write rdx:rax back before T1 (r11) gets clobbered
        mov     [r11], rax
        IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Register use in this variant: rdi = pu128Dst, rsi = pu128RaxRdx,
        ; rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx                             ; rbx receives the low half of the replacement value.

        mov     r11, rdx                        ; r11 = pu128RbxRcx (is also T1)
        mov     r10, rcx                        ; r10 = pEFlags

        mov     rcx, [r11 + 8]                  ; rcx:rbx = replacement value
        mov     rbx, [r11]
        IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rdx, [rsi + 8]                  ; rdx:rax = comparand; loaded last as eax was just clobbered
        mov     rax, [rsi]

        lock cmpxchg16b [rdi]

        mov     [rsi + 8], rdx
        mov     [rsi], rax
        IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
2112
2113%endif ; RT_ARCH_AMD64
2114
2115
2116;
2117; CMPXCHG.
2118;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
2123;
2124BEGINCODE
;;
; Generates the 8, 16, 32 and 64-bit CMPXCHG workers.
;
; Each worker loads the accumulator from [A1], executes CMPXCHG on [A0] with
; the register operand from A2, writes the accumulator back to [A1] and saves
; the resulting status flags via the eflags pointer in A3.
;
; @param        1       Instruction prefix: 'lock' or empty.
; @param        2       Function name suffix: '_locked' or empty.
;
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS_OLD       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        ; Note! This path always uses the lock prefix, regardless of the
        ;       first macro parameter.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                        ; pu64Dst
        mov     esi, edx                        ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0]         ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4]         ; pEFlags

        mov     ebx, [ecx]                      ; ecx:ebx = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]                      ; edx:eax = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=0 means the compare failed and edx:eax now holds the destination
        ; value; only then is there anything to compare against the old
        ; accumulator value that is still sitting in [esi].
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                        ; Equal: produce the flags of comparing identical values (ZF=1, rest clear/PF=1).
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ;; @todo This only approximates a 64-bit compare: PF and AF come from the
        ;;       wrong dword when the high halves differ, and SF/OF come from the
        ;;       low dword when they are equal.
        cmp     [esi + 4], edx                  ; old accumulator (high) vs. destination (high)
        jne     .store
        cmp     [esi], eax                      ; high halves equal, so the low halves decide.
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
2214
2215
2216
2217;;
2218; Macro for implementing a unary operator.
2219;
2220; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2221; variants, except on 32-bit system where the 64-bit accesses requires hand
2222; coding.
2223;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
2226;
2227; @param 1 The instruction mnemonic.
2228; @param 2 The modified flags.
2229; @param 3 The undefined flags.
2230;
;;
; Emits a single unary operator worker function (helper for IEMIMPL_UNARY_OP).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Function name suffix (_u8, _u8_locked, ...).
; @param        5       Instruction prefix: 'lock' or empty.
; @param        6       Operand size keyword (byte, word, dword or qword).
;
%macro IEMIMPL_UNARY_OP_ONE 6
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ %4, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
        %5 %1   %6 [A0]
        IEM_SAVE_FLAGS_OLD       A1, %2, %3, 0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ %4
%endmacro

%macro IEMIMPL_UNARY_OP 3
BEGINCODE
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u8,         ,     byte
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u8_locked,  lock, byte
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u16,        ,     word
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u16_locked, lock, word
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u32,        ,     dword
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u32_locked, lock, dword
 %ifdef RT_ARCH_AMD64
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u64,        ,     qword
IEMIMPL_UNARY_OP_ONE %1, %2, %3, _u64_locked, lock, qword
 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
2305
2306
2307;
2308; BSWAP. No flag changes.
2309;
2310; Each function takes one argument, pointer to the value to bswap
2311; (input/output). They all return void.
2312;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; Load all 32 bits, just in case any of the upper bits are used.
        db      66h                     ; Operand size prefix, making the next instruction a 16-bit BSWAP.
        bswap   T0_32
        mov     [A0], T0_32             ; Store all 32 bits back.
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
2321
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; T0 = *pu32
        bswap   T0_32                   ; reverse the byte order
        mov     [A0], T0_32             ; *pu32 = T0
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
2329
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = *pu64
        bswap   T0                      ; reverse the byte order
        mov     [A0], T0                ; *pu64 = T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte swap each half and exchange their positions.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        bswap   T0
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T1
        mov     [A0], T1                ; swapped high dword becomes the new low dword
        mov     [A0 + 4], T0            ; swapped low dword becomes the new high dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
2348
2349
2350;;
2351; Macro for implementing a shift operation.
2352;
2353; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2354; 32-bit system where the 64-bit accesses requires hand coding.
2355;
; All the functions take the incoming eflags in A0, a pointer to the
; destination memory operand in A1 and the shift count in A2. The updated
; eflags are produced by IEM_SAVE_FLAGS_RETVAL.
2358;
2359; @param 1 The instruction mnemonic.
2360; @param 2 The modified flags.
2361; @param 3 The undefined flags.
2362; @param 4 Force load flags.
2363;
2364; Makes ASSUMPTIONS about A0, A1 and A2 assignments. Specifically, that with
2365; GCC/64 we're free to use RCX/CL as it isn't used for any arguments. While
2366; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
2367; so we have to switch it around with the shift count parameter registers.
2368;
2369; @note the _intel and _amd variants are implemented in C.
2370;
;;
; Emits a single shift/rotate worker function (helper for IEMIMPL_SHIFT_OP).
;
; The worker takes the incoming eflags in A0, a pointer to the destination
; operand in A1 and the shift count in A2.  The count must end up in CL:
; with GCC/64 RCX isn't used for any of the three arguments so the count is
; simply copied there, while for the other conventions the first argument
; lives in xCX and is exchanged with A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Force load flags.
; @param        5       Function name suffix (_u8, _u16, _u32 or _u64).
; @param        6       Operand size keyword (byte, word, dword or qword).
;
%macro IEMIMPL_SHIFT_OP_ONE 6
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ %5, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        IEM_MAYBE_LOAD_FLAGS     A0_32, %2, %3, %4
        mov     cl, A2_8
        %1      %6 [A1], cl
        IEM_SAVE_FLAGS_RETVAL    A0_32, %2, %3, 0
 %else
        xchg    A2, A0                  ; A0 (xCX) = shift count, A2 = incoming eflags
        IEM_MAYBE_LOAD_FLAGS     A2_32, %2, %3, %4
        %1      %6 [A1], cl
        IEM_SAVE_FLAGS_RETVAL    A2_32, %2, %3, 0
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ %5
%endmacro

%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, _u8,  byte
IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, _u16, word
IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, _u32, dword
 %ifdef RT_ARCH_AMD64
IEMIMPL_SHIFT_OP_ONE %1, %2, %3, %4, _u64, qword
 %endif ; RT_ARCH_AMD64
%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others).  Thus,
; we have to force load all modified and undefined.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2452
2453
2454;;
2455; Macro for implementing a double precision shift operation.
2456;
2457; This will generate code for the 16, 32 and 64 bit accesses, except on
2458; 32-bit system where the 64-bit accesses requires hand coding.
2459;
2460; The functions takes the destination operand (r/m) in A0, the source (reg) in
2461; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2462;
2463; @param 1 The instruction mnemonic.
2464; @param 2 The modified flags.
2465; @param 3 The undefined flags.
2466; @param 4 The force loaded flags.
2467;
2468; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2469;
2470; @note the _intel and _amd variants are implemented in C.
2471;
%macro IEMIMPL_SHIFT_DBL_OP 4
BEGINCODE
;
; In all three workers the shift count (A2) has to be moved into CL:
;   - GCC/64: A3 is temporarily exchanged with A2 around the instruction and
;     restored afterwards, as A3 is needed for saving the flags.
;   - Otherwise: A0 is exchanged with A2, so the destination pointer ends up
;     in A2 and the count in A0.
; XCHG leaves the status flags alone, so the loaded/produced flags survive.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_16, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
  %else
        xchg    A0, A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS_OLD       A3, %2, %3, 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; These instructions will NOT modify flags if the masked shift count is zero
; (the mask is 0x3f for 64-bit instructions and 0x1f for the others).  Thus,
; we have to force load all modified and undefined.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2531
2532
2533;;
2534; Macro for implementing a multiplication operations.
2535;
2536; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2537; 32-bit system where the 64-bit accesses requires hand coding.
2538;
; The 8-bit function only operates on AX, so it takes no DX pointer: it takes a
; pointer to AX in A0, the operand in A1 and the incoming eflags in A2. The
; other functions take a pointer to rAX in A0, a pointer to rDX in A1, the
; operand in A2 and the incoming eflags in A3.
2542;
2543; The functions all return eflags. Since valid eflags can't ever be zero, we can
2544; use the same macros/tests framework as div/idiv.
2545;
2546; @param 1 The instruction mnemonic.
2547; @param 2 The modified flags.
2548; @param 3 The undefined flags.
2549; @param 4 Name suffix.
2550; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2551;
2552; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2553;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
;
; 8-bit: AX = AL * operand.  A0 = pointer to AX, A1 = operand, A2 = eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS     A2_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; The full 16-bit product is stored.
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL    A2_32, %2, %3, 0
 %else
        movzx   edx, ax                 ; Result copy for the SF/PF calculation.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A2_32, %2, X86_EFL_AF | X86_EFL_ZF, dx, 8, xDX ; intel
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

;
; 16-bit: DX:AX = AX * operand.  A0 = pointer to AX, A1 = pointer to DX,
; A2 = operand, A3 = eflags.  Outside the GCC/64 convention A1 is copied to
; T1 before the instruction executes and the DX result is stored via T1.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS     A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL    A3_32, %2, %3, 0
 %else
        movzx   edx, ax                 ; Result copy for the SF/PF calculation.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A3_32, %2, X86_EFL_AF | X86_EFL_ZF, dx, 16, xDX ; intel
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

;
; 32-bit: EDX:EAX = EAX * operand.  Same argument layout as the 16-bit worker.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS     A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL    A3_32, %2, %3, 0
 %else
        mov     edx, eax                ; Result copy for the SF/PF calculation.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A3_32, %2, X86_EFL_AF | X86_EFL_ZF, edx, 32, xDX ; intel
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

;
; 64-bit: RDX:RAX = RAX * operand.  Same argument layout as the 16-bit worker.
;
 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS     A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS_RETVAL    A3_32, %2, %3, 0
  %else
        mov     T2, rax                 ; Result copy for the SF/PF calculation.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A3_32, %2, X86_EFL_AF | X86_EFL_ZF, T2, 64, T2 ; intel
  %endif
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
2649
2650
2651BEGINCODE
;;
; Worker function for negating the 64-bit number held in T1_32:T0_32
; (T1_32 = high dword, T0_32 = low dword).
;
; Uses the two's complement identity -(hi:lo) = (-(hi + (lo != 0))):(-lo),
; so no stack or scratch register is needed.
;
; @uses     None (T0,T1).  The status flags are modified.
BEGINPROC iemAImpl_negate_T0_T1_u32
        neg     T0_32                   ; low = -low; CF is set if low was non-zero.
        adc     T1_32, 0                ; high += borrow out of the low half.
        neg     T1_32                   ; high = -(high + borrow)
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2665
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit number held in T1:T0
; (T1 = high qword, T0 = low qword).
;
; Uses the two's complement identity -(hi:lo) = (-(hi + (lo != 0))):(-lo),
; so no stack or scratch register is needed.
;
; @uses     None (T0,T1).  The status flags are modified.
BEGINPROC iemAImpl_negate_T0_T1_u64
        neg     T0                      ; low = -low; CF is set if low was non-zero.
        adc     T1, 0                   ; high += borrow out of the low half.
        neg     T1                      ; high = -(high + borrow)
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2681
2682
2683;;
2684; Macro for implementing a division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and
2691; incoming eflags in A3.
2692;
; The functions return the new EFLAGS on success and zero on divide error.
; The new EFLAGS value can never be zero, given that bit 1 is always set.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
;
; All workers first check for a zero divisor and for quotient overflow so the
; host instruction never faults, returning zero (in eax) in those cases.
;
; The signed range check works on magnitudes:
;   - Operands with the same sign: the quotient is positive, so
;     |dividend| >> (width - 1) must be below |divisor|.
;   - Operands with different signs: the quotient may reach -2^(width - 1),
;     so |dividend| may be as large as 2^(width - 1) * |divisor| + (|divisor| - 1).
;

;
; 8-bit: AX / operand.  A0 = pointer to AX, A1 = operand, A2 = eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Overflows if the high byte of the dividend >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8                    ; A1 = |divisor|; the original is restored from T1 below.
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS     A2_32, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL  A2_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL    A2_32, %2, %3, 0
 %endif
.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        xor     eax, eax                ; Zero return value = divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

;
; 16-bit: DX:AX / operand.  A0 = pointer to AX, A1 = pointer to DX,
; A2 = operand, A3 = eflags.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_32, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
                                        ; Must be the 32-bit register: the signed check copies A2 to T1
                                        ; and does 32-bit compares against T1_32.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Overflows if the high word of the dividend >= divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16                   ; T1 = |divisor|; A2 still holds the original.
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS     A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; The divisor is copied to T1 before DX is loaded.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; The DX pointer is copied to T1 before DX is loaded.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL  A3_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL    A3_32, %2, %3, 0
 %endif

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        xor     eax, eax                ; Zero return value = divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

;
; 32-bit: EDX:EAX / operand.  Same argument layout as the 16-bit worker.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Overflows if the high dword of the dividend >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of registers on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test    A2_32, A2_32 - we did this 5 instructions ago (push and mov leave the flags alone).
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32            ; T1 = |dividend| >> 31
        cmp     T1_32, A2_32
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32                   ; A2 = |divisor|; the original is popped again below.
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32            ; T1 = |dividend| >> 31
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS     A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; The divisor is copied to T1 before EDX is loaded.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; The EDX pointer is copied to T1 before EDX is loaded.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL  A3_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS_RETVAL    A3_32, %2, %3, 0
 %endif

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Balance the push done by the signed range check.
 %endif
.div_zero:
        xor     eax, eax                ; Zero return value = divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

;
; 64-bit: RDX:RAX / operand.  Same argument layout as the 16-bit worker.
;
 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2                ; Overflows if the high qword of the dividend >= divisor.
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we can modify it (we're out of registers on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test    A2, A2 - we did this five instructions above (push and mov leave the flags alone).
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0                  ; T1 = |dividend| >> 63
        cmp     T1, A2
        pop     T0                      ; (pop leaves the flags of the compare intact)
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2                      ; A2 = |divisor|; the original is popped again below.
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0                  ; T1 = |dividend| >> 63
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS     A3_32, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; The divisor is copied to T1 before RDX is loaded.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; The RDX pointer is copied to T1 before RDX is loaded.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS_RETVAL  A3_32, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS_RETVAL    A3_32, %2, %3, 0
  %endif

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; Balance the push done by the signed range check.
  %endif
.div_zero:
        xor     eax, eax                ; Zero return value = divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,       , 0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1,       , 0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
3034
3035
3036;;
3037; Macro for implementing memory fence operation.
3038;
3039; No return value, no operands or anything.
3040;
3041; @param 1 The instruction.
3042;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; The fence instruction given as macro parameter.
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

; One worker per fence instruction: iemAImpl_lfence, iemAImpl_sfence and iemAImpl_mfence.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
3054
3055;;
3056; Alternative for non-SSE2 host.
3057;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Create a scratch stack slot holding xAX.
        xchg    xAX, [xSP]              ; XCHG with a memory operand is implicitly locked; xAX is left unchanged.
        add     xSP, xCB                ; Drop the scratch slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
3064
3065
3066;;
3067; Initialize the FPU for the actual instruction being emulated, this means
3068; loading parts of the guest's control word and status word.
3069;
3070; @uses 24 bytes of stack. T0, T1
3071; @param 1 Expression giving the address of the FXSTATE of the guest.
3072;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; Snapshot the current FPU environment to patch it up below.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; The condition code bits come from the guest state (%1), the TOP
        ; field from the environment just stored.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; Activate the patched environment.
%endmacro
3091
3092
3093;;
3094; Initialize the FPU for the actual instruction being emulated, this means
3095; loading parts of the guest's control word, status word, and update the
3096; tag word for the top register if it's empty.
3097;
3098; ASSUMES actual TOP=7
3099;
; The current FPU environment is stored at [xSP], patched and reloaded, so the
; caller must have reserved scratch space at the top of the stack.
;
3100; @uses 24 bytes of stack. T0, T1
; NOTE(review): the X86FSTENV32P image written by FNSTENV is 28 bytes
; according to the Intel SDM - confirm and correct the figure above.
3101; @param 1 Expression giving the address of the FXSTATE of the guest.
3102;
3103%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
3104 fnstenv [xSP]
3105
3106 ; FCW - for exception, precision and rounding control.
3107 movzx T0_32, word [%1 + X86FXSTATE.FCW]
3108 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3109 mov [xSP + X86FSTENV32P.FCW], T0_16
3110
3111 ; FSW - for undefined C0, C1, C2, and C3.
3112 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3113 and T1_32, X86_FSW_C_MASK ; Guest condition code bits only.
3114 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
3115 and T0_32, X86_FSW_TOP_MASK ; Keep only TOP from the current (host) status word.
3116 or T0_32, T1_32
3117 mov [xSP + X86FSTENV32P.FSW], T0_16
3118
3119 ; FTW - Only for ST0 (in/out).
3120 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3121 shr T1_32, X86_FSW_TOP_SHIFT
3122 and T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, i.e. the physical register index of the guest ST0.
3123 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
3124 jc %%st0_not_empty
3125 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
3126%%st0_not_empty:
3127
3128 fldenv [xSP]
3129%endmacro
3130
3131
3132;;
3133; Need to move this as well somewhere better?
3134;
; Output of the FPU workers that produce one 80-bit value plus a status word.
; NOTE(review): presumably this has to match a C structure of the same name
; declared elsewhere - keep the member order and sizes in sync.
;
3135struc IEMFPURESULT
3136 .r80Result resw 5 ; The 80-bit result (5 words = 10 bytes).
3137 .FSW resw 1 ; The FPU status word after the operation.
3138endstruc
3139
3140
3141;;
3142; Need to move this as well somewhere better?
3143;
; Output of the FPU workers that produce two 80-bit values plus a status word.
; NOTE(review): presumably this has to match a C structure of the same name
; declared elsewhere - keep the member order and sizes in sync.
;
3144struc IEMFPURESULTTWO
3145 .r80Result1 resw 5 ; The first 80-bit result (10 bytes).
3146 .FSW resw 1 ; The FPU status word after the operation.
3147 .r80Result2 resw 5 ; The second 80-bit result (10 bytes).
3148endstruc
3149
3150
3151;
3152;---------------------- 16-bit signed integer operations ----------------------
3153;
3154
3155
3156;;
3157; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
3158;
3159; @param A0 FPU context (fxsave).
3160; @param A1 Pointer to a IEMFPURESULT for the output.
3161; @param A2 Pointer to the 16-bit signed integer value to convert.
3162;
3163BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
3164 PROLOGUE_3_ARGS
3165 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3166
3167 fninit ; Start from an initialized host FPU.
3168 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3169 fild word [A2]
3170
3171 fnstsw word [A1 + IEMFPURESULT.FSW]
3172 fnclex ; Clear the exception flags before storing the result.
3173 fstp tword [A1 + IEMFPURESULT.r80Result]
3174
3175 fninit ; Reset the FPU again before returning.
3176 add xSP, 20h
3177 EPILOGUE_3_ARGS
3178ENDPROC iemAImpl_fild_r80_from_i16
3179
3180
3181;;
3182; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
3183;
3184; @param A0 FPU context (fxsave).
3185; @param A1 Where to return the output FSW.
3186; @param A2 Where to store the 16-bit signed integer value.
3187; @param A3 Pointer to the 80-bit value.
3188;
3189BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
3190 PROLOGUE_4_ARGS
3191 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3192
3193 fninit ; Start from an initialized host FPU.
3194 fld tword [A3] ; Push the input before the guest control word is loaded.
3195 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3196 fistp word [A2] ; Popping form; the FPU is reinitialized below anyway.
3197
3198 fnstsw word [A1]
3199
3200 fninit ; Reset the FPU again before returning.
3201 add xSP, 20h
3202 EPILOGUE_4_ARGS
3203ENDPROC iemAImpl_fist_r80_to_i16
3204
3205
3206;;
3207; Store an 80-bit floating point value (register) as a 16-bit signed integer
3208; (memory) with truncation.
3209;
3210; @param A0 FPU context (fxsave).
3211; @param A1 Where to return the output FSW.
3212; @param A2 Where to store the 16-bit signed integer value.
3213; @param A3 Pointer to the 80-bit value.
3214;
3215BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
3216 PROLOGUE_4_ARGS
3217 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3218
3219 fninit ; Start from an initialized host FPU.
3220 fld tword [A3] ; Push the input before the guest control word is loaded.
3221 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3222 fisttp word [A2] ; Truncating store, pops ST0.
3223
3224 fnstsw word [A1]
3225
3226 fninit ; Reset the FPU again before returning.
3227 add xSP, 20h
3228 EPILOGUE_4_ARGS
3229ENDPROC iemAImpl_fistt_r80_to_i16
3230
3231
3232;;
3233; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3234;
; Emits iemAImpl_<instruction>_r80_by_i16, which returns both the 80-bit
; result and the resulting FSW in the IEMFPURESULT structure.
;
3235; @param 1 The instruction
3236;
3237; @param A0 FPU context (fxsave).
3238; @param A1 Pointer to a IEMFPURESULT for the output.
3239; @param A2 Pointer to the 80-bit value.
3240; @param A3 Pointer to the 16-bit value.
3241;
3242%macro IEMIMPL_FPU_R80_BY_I16 1
3243BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3244 PROLOGUE_4_ARGS
3245 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3246
3247 fninit ; Start from an initialized host FPU.
3248 fld tword [A2] ; ST0 = the 80-bit operand.
3249 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3250 %1 word [A3]
3251
3252 fnstsw word [A1 + IEMFPURESULT.FSW]
3253 fnclex ; Clear the exception flags before storing the result.
3254 fstp tword [A1 + IEMFPURESULT.r80Result]
3255
3256 fninit ; Reset the FPU again before returning.
3257 add xSP, 20h
3258 EPILOGUE_4_ARGS
3259ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3260%endmacro
3261
3262IEMIMPL_FPU_R80_BY_I16 fiadd
3263IEMIMPL_FPU_R80_BY_I16 fimul
3264IEMIMPL_FPU_R80_BY_I16 fisub
3265IEMIMPL_FPU_R80_BY_I16 fisubr
3266IEMIMPL_FPU_R80_BY_I16 fidiv
3267IEMIMPL_FPU_R80_BY_I16 fidivr
3268
3269
3270;;
3271; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3272; only returning FSW.
3273;
3274; @param 1 The instruction
3275;
3276; @param A0 FPU context (fxsave).
3277; @param A1 Where to store the output FSW.
3278; @param A2 Pointer to the 80-bit value.
3279; @param A3 Pointer to the 16-bit value.
3280;
3281%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
3282BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3283 PROLOGUE_4_ARGS
3284 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3285
3286 fninit ; Start from an initialized host FPU.
3287 fld tword [A2] ; ST0 = the 80-bit operand.
3288 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3289 %1 word [A3]
3290
3291 fnstsw word [A1]
3292
3293 fninit ; Reset the FPU again before returning.
3294 add xSP, 20h
3295 EPILOGUE_4_ARGS
3296ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3297%endmacro
3298
3299IEMIMPL_FPU_R80_BY_I16_FSW ficom
3300
3301
3302
3303;
3304;---------------------- 32-bit signed integer operations ----------------------
3305;
3306
3307
3308;;
3309; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
3310;
3311; @param A0 FPU context (fxsave).
3312; @param A1 Pointer to a IEMFPURESULT for the output.
3313; @param A2 Pointer to the 32-bit signed integer value to convert.
3314;
3315BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
3316 PROLOGUE_3_ARGS
3317 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3318
3319 fninit ; Start from an initialized host FPU.
3320 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3321 fild dword [A2]
3322
3323 fnstsw word [A1 + IEMFPURESULT.FSW]
3324 fnclex ; Clear the exception flags before storing the result.
3325 fstp tword [A1 + IEMFPURESULT.r80Result]
3326
3327 fninit ; Reset the FPU again before returning.
3328 add xSP, 20h
3329 EPILOGUE_3_ARGS
3330ENDPROC iemAImpl_fild_r80_from_i32
3331
3332
3333;;
3334; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
3335;
3336; @param A0 FPU context (fxsave).
3337; @param A1 Where to return the output FSW.
3338; @param A2 Where to store the 32-bit signed integer value.
3339; @param A3 Pointer to the 80-bit value.
3340;
3341BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
3342 PROLOGUE_4_ARGS
3343 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3344
3345 fninit ; Start from an initialized host FPU.
3346 fld tword [A3] ; Push the input before the guest control word is loaded.
3347 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3348 fistp dword [A2] ; Popping form; the FPU is reinitialized below anyway.
3349
3350 fnstsw word [A1]
3351
3352 fninit ; Reset the FPU again before returning.
3353 add xSP, 20h
3354 EPILOGUE_4_ARGS
3355ENDPROC iemAImpl_fist_r80_to_i32
3356
3357
3358;;
3359; Store an 80-bit floating point value (register) as a 32-bit signed integer
3360; (memory) with truncation.
3361;
3362; @param A0 FPU context (fxsave).
3363; @param A1 Where to return the output FSW.
3364; @param A2 Where to store the 32-bit signed integer value.
3365; @param A3 Pointer to the 80-bit value.
3366;
3367BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
3368 PROLOGUE_4_ARGS
3369 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3370
3371 fninit ; Start from an initialized host FPU.
3372 fld tword [A3] ; Push the input before the guest control word is loaded.
3373 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3374 fisttp dword [A2] ; Truncating store, pops ST0.
3375
3376 fnstsw word [A1]
3377
3378 fninit ; Reset the FPU again before returning.
3379 add xSP, 20h
3380 EPILOGUE_4_ARGS
3381ENDPROC iemAImpl_fistt_r80_to_i32
3382
3383
3384;;
3385; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3386;
; Emits iemAImpl_<instruction>_r80_by_i32, which returns both the 80-bit
; result and the resulting FSW in the IEMFPURESULT structure.
;
3387; @param 1 The instruction
3388;
3389; @param A0 FPU context (fxsave).
3390; @param A1 Pointer to a IEMFPURESULT for the output.
3391; @param A2 Pointer to the 80-bit value.
3392; @param A3 Pointer to the 32-bit value.
3393;
3394%macro IEMIMPL_FPU_R80_BY_I32 1
3395BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3396 PROLOGUE_4_ARGS
3397 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3398
3399 fninit ; Start from an initialized host FPU.
3400 fld tword [A2] ; ST0 = the 80-bit operand.
3401 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3402 %1 dword [A3]
3403
3404 fnstsw word [A1 + IEMFPURESULT.FSW]
3405 fnclex ; Clear the exception flags before storing the result.
3406 fstp tword [A1 + IEMFPURESULT.r80Result]
3407
3408 fninit ; Reset the FPU again before returning.
3409 add xSP, 20h
3410 EPILOGUE_4_ARGS
3411ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3412%endmacro
3413
3414IEMIMPL_FPU_R80_BY_I32 fiadd
3415IEMIMPL_FPU_R80_BY_I32 fimul
3416IEMIMPL_FPU_R80_BY_I32 fisub
3417IEMIMPL_FPU_R80_BY_I32 fisubr
3418IEMIMPL_FPU_R80_BY_I32 fidiv
3419IEMIMPL_FPU_R80_BY_I32 fidivr
3420
3421
3422;;
3423; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3424; only returning FSW.
3425;
3426; @param 1 The instruction
3427;
3428; @param A0 FPU context (fxsave).
3429; @param A1 Where to store the output FSW.
3430; @param A2 Pointer to the 80-bit value.
3431; @param A3 Pointer to the 32-bit value.
3432;
3433%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
3434BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3435 PROLOGUE_4_ARGS
3436 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3437
3438 fninit ; Start from an initialized host FPU.
3439 fld tword [A2] ; ST0 = the 80-bit operand.
3440 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3441 %1 dword [A3]
3442
3443 fnstsw word [A1]
3444
3445 fninit ; Reset the FPU again before returning.
3446 add xSP, 20h
3447 EPILOGUE_4_ARGS
3448ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3449%endmacro
3450
3451IEMIMPL_FPU_R80_BY_I32_FSW ficom
3452
3453
3454
3455;
3456;---------------------- 64-bit signed integer operations ----------------------
3457;
3458
3459
3460;;
3461; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
3462;
3463; @param A0 FPU context (fxsave).
3464; @param A1 Pointer to a IEMFPURESULT for the output.
3465; @param A2 Pointer to the 64-bit signed integer value to convert.
3466;
3467BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
3468 PROLOGUE_3_ARGS
3469 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3470
3471 fninit ; Start from an initialized host FPU.
3472 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3473 fild qword [A2]
3474
3475 fnstsw word [A1 + IEMFPURESULT.FSW]
3476 fnclex ; Clear the exception flags before storing the result.
3477 fstp tword [A1 + IEMFPURESULT.r80Result]
3478
3479 fninit ; Reset the FPU again before returning.
3480 add xSP, 20h
3481 EPILOGUE_3_ARGS
3482ENDPROC iemAImpl_fild_r80_from_i64
3483
3484
3485;;
3486; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
3487;
3488; @param A0 FPU context (fxsave).
3489; @param A1 Where to return the output FSW.
3490; @param A2 Where to store the 64-bit signed integer value.
3491; @param A3 Pointer to the 80-bit value.
3492;
3493BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3494 PROLOGUE_4_ARGS
3495 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3496
3497 fninit ; Start from an initialized host FPU.
3498 fld tword [A3] ; Push the input before the guest control word is loaded.
3499 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3500 fistp qword [A2] ; Popping form; the FPU is reinitialized below anyway.
3501
3502 fnstsw word [A1]
3503
3504 fninit ; Reset the FPU again before returning.
3505 add xSP, 20h
3506 EPILOGUE_4_ARGS
3507ENDPROC iemAImpl_fist_r80_to_i64
3508
3509
3510;;
3511; Store an 80-bit floating point value (register) as a 64-bit signed integer
3512; (memory) with truncation.
3513;
3514; @param A0 FPU context (fxsave).
3515; @param A1 Where to return the output FSW.
3516; @param A2 Where to store the 64-bit signed integer value.
3517; @param A3 Pointer to the 80-bit value.
3518;
3519BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3520 PROLOGUE_4_ARGS
3521 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3522
3523 fninit ; Start from an initialized host FPU.
3524 fld tword [A3] ; Push the input before the guest control word is loaded.
3525 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3526 fisttp qword [A2] ; Truncating store, pops ST0.
3527
3528 fnstsw word [A1]
3529
3530 fninit ; Reset the FPU again before returning.
3531 add xSP, 20h
3532 EPILOGUE_4_ARGS
3533ENDPROC iemAImpl_fistt_r80_to_i64
3534
3535
3536
3537;
3538;---------------------- 32-bit floating point operations ----------------------
3539;
3540
3541;;
3542; Converts a 32-bit floating point value to an 80-bit one (fpu register).
3543;
3544; @param A0 FPU context (fxsave).
3545; @param A1 Pointer to a IEMFPURESULT for the output.
3546; @param A2 Pointer to the 32-bit floating point value to convert.
3547;
3548BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3549 PROLOGUE_3_ARGS
3550 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3551
3552 fninit ; Start from an initialized host FPU.
3553 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3554 fld dword [A2]
3555
3556 fnstsw word [A1 + IEMFPURESULT.FSW]
3557 fnclex ; Clear the exception flags before storing the result.
3558 fstp tword [A1 + IEMFPURESULT.r80Result]
3559
3560 fninit ; Reset the FPU again before returning.
3561 add xSP, 20h
3562 EPILOGUE_3_ARGS
3563ENDPROC iemAImpl_fld_r80_from_r32
3564
3565
3566;;
3567; Store an 80-bit floating point value (register) as a 32-bit one (memory).
3568;
3569; @param A0 FPU context (fxsave).
3570; @param A1 Where to return the output FSW.
3571; @param A2 Where to store the 32-bit value.
3572; @param A3 Pointer to the 80-bit value.
3573;
3574BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3575 PROLOGUE_4_ARGS
3576 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3577
3578 fninit ; Start from an initialized host FPU.
3579 fld tword [A3] ; Push the input before the guest control word is loaded.
3580 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3581 fst dword [A2] ; Non-popping store; the FPU is reinitialized below.
3582
3583 fnstsw word [A1]
3584
3585 fninit ; Reset the FPU again before returning.
3586 add xSP, 20h
3587 EPILOGUE_4_ARGS
3588ENDPROC iemAImpl_fst_r80_to_r32
3589
3590
3591;;
3592; FPU instruction working on one 80-bit and one 32-bit floating point value.
3593;
; Emits iemAImpl_<instruction>_r80_by_r32, which returns both the 80-bit
; result and the resulting FSW in the IEMFPURESULT structure.
;
3594; @param 1 The instruction
3595;
3596; @param A0 FPU context (fxsave).
3597; @param A1 Pointer to a IEMFPURESULT for the output.
3598; @param A2 Pointer to the 80-bit value.
3599; @param A3 Pointer to the 32-bit value.
3600;
3601%macro IEMIMPL_FPU_R80_BY_R32 1
3602BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3603 PROLOGUE_4_ARGS
3604 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3605
3606 fninit ; Start from an initialized host FPU.
3607 fld tword [A2] ; ST0 = the 80-bit operand.
3608 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3609 %1 dword [A3]
3610
3611 fnstsw word [A1 + IEMFPURESULT.FSW]
3612 fnclex ; Clear the exception flags before storing the result.
3613 fstp tword [A1 + IEMFPURESULT.r80Result]
3614
3615 fninit ; Reset the FPU again before returning.
3616 add xSP, 20h
3617 EPILOGUE_4_ARGS
3618ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3619%endmacro
3620
3621IEMIMPL_FPU_R80_BY_R32 fadd
3622IEMIMPL_FPU_R80_BY_R32 fmul
3623IEMIMPL_FPU_R80_BY_R32 fsub
3624IEMIMPL_FPU_R80_BY_R32 fsubr
3625IEMIMPL_FPU_R80_BY_R32 fdiv
3626IEMIMPL_FPU_R80_BY_R32 fdivr
3627
3628
3629;;
3630; FPU instruction working on one 80-bit and one 32-bit floating point value,
3631; only returning FSW.
3632;
3633; @param 1 The instruction
3634;
3635; @param A0 FPU context (fxsave).
3636; @param A1 Where to store the output FSW.
3637; @param A2 Pointer to the 80-bit value.
3638; @param A3 Pointer to the 32-bit value.
3639;
3640%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3641BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3642 PROLOGUE_4_ARGS
3643 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3644
3645 fninit ; Start from an initialized host FPU.
3646 fld tword [A2] ; ST0 = the 80-bit operand.
3647 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3648 %1 dword [A3]
3649
3650 fnstsw word [A1]
3651
3652 fninit ; Reset the FPU again before returning.
3653 add xSP, 20h
3654 EPILOGUE_4_ARGS
3655ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3656%endmacro
3657
3658IEMIMPL_FPU_R80_BY_R32_FSW fcom
3659
3660
3661
3662;
3663;---------------------- 64-bit floating point operations ----------------------
3664;
3665
3666;;
3667; Converts a 64-bit floating point value to an 80-bit one (fpu register).
3668;
3669; @param A0 FPU context (fxsave).
3670; @param A1 Pointer to a IEMFPURESULT for the output.
3671; @param A2 Pointer to the 64-bit floating point value to convert.
3672;
3673BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3674 PROLOGUE_3_ARGS
3675 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3676
3677 fninit ; Start from an initialized host FPU.
3678 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3679 fld qword [A2]
3680
3681 fnstsw word [A1 + IEMFPURESULT.FSW]
3682 fnclex ; Clear the exception flags before storing the result.
3683 fstp tword [A1 + IEMFPURESULT.r80Result]
3684
3685 fninit ; Reset the FPU again before returning.
3686 add xSP, 20h
3687 EPILOGUE_3_ARGS
3688ENDPROC iemAImpl_fld_r80_from_r64
3689
3690
3691;;
3692; Store an 80-bit floating point value (register) as a 64-bit one (memory).
3693;
3694; @param A0 FPU context (fxsave).
3695; @param A1 Where to return the output FSW.
3696; @param A2 Where to store the 64-bit value.
3697; @param A3 Pointer to the 80-bit value.
3698;
3699BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3700 PROLOGUE_4_ARGS
3701 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3702
3703 fninit ; Start from an initialized host FPU.
3704 fld tword [A3] ; Push the input before the guest control word is loaded.
3705 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3706 fst qword [A2] ; Non-popping store; the FPU is reinitialized below.
3707
3708 fnstsw word [A1]
3709
3710 fninit ; Reset the FPU again before returning.
3711 add xSP, 20h
3712 EPILOGUE_4_ARGS
3713ENDPROC iemAImpl_fst_r80_to_r64
3714
3715
3716;;
3717; FPU instruction working on one 80-bit and one 64-bit floating point value.
3718;
; Emits iemAImpl_<instruction>_r80_by_r64, which returns both the 80-bit
; result and the resulting FSW in the IEMFPURESULT structure.
;
3719; @param 1 The instruction
3720;
3721; @param A0 FPU context (fxsave).
3722; @param A1 Pointer to a IEMFPURESULT for the output.
3723; @param A2 Pointer to the 80-bit value.
3724; @param A3 Pointer to the 64-bit value.
3725;
3726%macro IEMIMPL_FPU_R80_BY_R64 1
3727BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3728 PROLOGUE_4_ARGS
3729 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3730
3731 fninit ; Start from an initialized host FPU.
3732 fld tword [A2] ; ST0 = the 80-bit operand.
3733 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3734 %1 qword [A3]
3735
3736 fnstsw word [A1 + IEMFPURESULT.FSW]
3737 fnclex ; Clear the exception flags before storing the result.
3738 fstp tword [A1 + IEMFPURESULT.r80Result]
3739
3740 fninit ; Reset the FPU again before returning.
3741 add xSP, 20h
3742 EPILOGUE_4_ARGS
3743ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3744%endmacro
3745
3746IEMIMPL_FPU_R80_BY_R64 fadd
3747IEMIMPL_FPU_R80_BY_R64 fmul
3748IEMIMPL_FPU_R80_BY_R64 fsub
3749IEMIMPL_FPU_R80_BY_R64 fsubr
3750IEMIMPL_FPU_R80_BY_R64 fdiv
3751IEMIMPL_FPU_R80_BY_R64 fdivr
3752
3753;;
3754; FPU instruction working on one 80-bit and one 64-bit floating point value,
3755; only returning FSW.
3756;
; Emits iemAImpl_<instruction>_r80_by_r64; no 80-bit result is stored.
;
3757; @param 1 The instruction
3758;
3759; @param A0 FPU context (fxsave).
3760; @param A1 Where to store the output FSW.
3761; @param A2 Pointer to the 80-bit value.
3762; @param A3 Pointer to the 64-bit value.
3763;
3764%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3765BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3766 PROLOGUE_4_ARGS
3767 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3768
3769 fninit ; Start from an initialized host FPU.
3770 fld tword [A2] ; ST0 = the 80-bit operand.
3771 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3772 %1 qword [A3]
3773
3774 fnstsw word [A1]
3775
3776 fninit ; Reset the FPU again before returning.
3777 add xSP, 20h
3778 EPILOGUE_4_ARGS
3779ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3780%endmacro
3781
3782IEMIMPL_FPU_R80_BY_R64_FSW fcom
3783
3784
3785
3786;
3787;---------------------- 80-bit floating point operations ----------------------
3788;
3789
3790;;
3791; Loads an 80-bit floating point register value from memory.
3792;
3793; @param A0 FPU context (fxsave).
3794; @param A1 Pointer to a IEMFPURESULT for the output.
3795; @param A2 Pointer to the 80-bit floating point value to load.
3796;
3797BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3798 PROLOGUE_3_ARGS
3799 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3800
3801 fninit ; Start from an initialized host FPU.
3802 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3803 fld tword [A2]
3804
3805 fnstsw word [A1 + IEMFPURESULT.FSW]
3806 fnclex ; Clear the exception flags before storing the result.
3807 fstp tword [A1 + IEMFPURESULT.r80Result]
3808
3809 fninit ; Reset the FPU again before returning.
3810 add xSP, 20h
3811 EPILOGUE_3_ARGS
3812ENDPROC iemAImpl_fld_r80_from_r80
3813
3814
3815;;
3816; Store an 80-bit floating point register to memory.
3817;
3818; @param A0 FPU context (fxsave).
3819; @param A1 Where to return the output FSW.
3820; @param A2 Where to store the 80-bit value.
3821; @param A3 Pointer to the 80-bit register value.
3822;
3823BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3824 PROLOGUE_4_ARGS
3825 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3826
3827 fninit ; Start from an initialized host FPU.
3828 fld tword [A3] ; Push the input before the guest control word is loaded.
3829 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3830 fstp tword [A2] ; Popping store; the FPU is reinitialized below anyway.
3831
3832 fnstsw word [A1]
3833
3834 fninit ; Reset the FPU again before returning.
3835 add xSP, 20h
3836 EPILOGUE_4_ARGS
3837ENDPROC iemAImpl_fst_r80_to_r80
3838
3839
3840;;
3841; Loads an 80-bit floating point register value in BCD format from memory.
3842;
3843; @param A0 FPU context (fxsave).
3844; @param A1 Pointer to a IEMFPURESULT for the output.
3845; @param A2 Pointer to the 80-bit BCD value to load.
3846;
3847BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3848 PROLOGUE_3_ARGS
3849 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3850
3851 fninit ; Start from an initialized host FPU.
3852 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3853 fbld tword [A2]
3854
3855 fnstsw word [A1 + IEMFPURESULT.FSW]
3856 fnclex ; Clear the exception flags before storing the result.
3857 fstp tword [A1 + IEMFPURESULT.r80Result]
3858
3859 fninit ; Reset the FPU again before returning.
3860 add xSP, 20h
3861 EPILOGUE_3_ARGS
3862ENDPROC iemAImpl_fld_r80_from_d80
3863
3864
3865;;
3866; Store an 80-bit floating point register to memory as BCD.
3867;
3868; @param A0 FPU context (fxsave).
3869; @param A1 Where to return the output FSW.
3870; @param A2 Where to store the 80-bit BCD value.
3871; @param A3 Pointer to the 80-bit register value.
3872;
3873BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3874 PROLOGUE_4_ARGS
3875 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3876
3877 fninit ; Start from an initialized host FPU.
3878 fld tword [A3] ; Push the input before the guest control word is loaded.
3879 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3880 fbstp tword [A2] ; Popping BCD store; the FPU is reinitialized below anyway.
3881
3882 fnstsw word [A1]
3883
3884 fninit ; Reset the FPU again before returning.
3885 add xSP, 20h
3886 EPILOGUE_4_ARGS
3887ENDPROC iemAImpl_fst_r80_to_d80
3888
3889
3890;;
3891; FPU instruction working on two 80-bit floating point values.
3892;
3893; @param 1 The instruction
; @param 2 The operand list to pass to the instruction ({st0, st1}), or
; empty ({}) for instructions with implicit operands.
3894;
3895; @param A0 FPU context (fxsave).
3896; @param A1 Pointer to a IEMFPURESULT for the output.
3897; @param A2 Pointer to the first 80-bit value (ST0)
3898; @param A3 Pointer to the second 80-bit value (STn).
3899;
3900%macro IEMIMPL_FPU_R80_BY_R80 2
3901BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3902 PROLOGUE_4_ARGS
3903 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3904
3905 fninit ; Start from an initialized host FPU.
3906 fld tword [A3] ; Pushed first, so it ends up in ST1.
3907 fld tword [A2] ; Pushed last, so it ends up in ST0.
3908 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3909 %1 %2
3910
3911 fnstsw word [A1 + IEMFPURESULT.FSW]
3912 fnclex ; Clear the exception flags before storing the result.
3913 fstp tword [A1 + IEMFPURESULT.r80Result]
3914
3915 fninit ; Reset the FPU again before returning.
3916 add xSP, 20h
3917 EPILOGUE_4_ARGS
3918ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3919%endmacro
3920
3921IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3922IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3923IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3924IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3925IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3926IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3927IEMIMPL_FPU_R80_BY_R80 fprem, {}
3928IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3929IEMIMPL_FPU_R80_BY_R80 fscale, {}
3930
3931
3932;;
3933; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3934; storing the result in ST1 and popping the stack.
3935;
; After the instruction has popped the stack the result is in ST0, which is
; what gets stored in the IEMFPURESULT structure.
;
3936; @param 1 The instruction
3937;
3938; @param A0 FPU context (fxsave).
3939; @param A1 Pointer to a IEMFPURESULT for the output.
3940; @param A2 Pointer to the first 80-bit value (ST1).
3941; @param A3 Pointer to the second 80-bit value (ST0).
3942;
3943%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3944BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3945 PROLOGUE_4_ARGS
3946 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3947
3948 fninit ; Start from an initialized host FPU.
3949 fld tword [A2] ; Pushed first, so it ends up in ST1.
3950 fld tword [A3] ; Pushed last, so it ends up in ST0.
3951 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3952 %1
3953
3954 fnstsw word [A1 + IEMFPURESULT.FSW]
3955 fnclex ; Clear the exception flags before storing the result.
3956 fstp tword [A1 + IEMFPURESULT.r80Result]
3957
3958 fninit ; Reset the FPU again before returning.
3959 add xSP, 20h
3960 EPILOGUE_4_ARGS
3961ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3962%endmacro
3963
3964IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3965IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3966IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3967
3968
3969;;
3970; FPU instruction working on two 80-bit floating point values, only
3971; returning FSW.
3972;
3973; @param 1 The instruction
3974;
3975; @param A0 FPU context (fxsave).
3976; @param A1 Pointer to a uint16_t for the resulting FSW.
3977; @param A2 Pointer to the first 80-bit value.
3978; @param A3 Pointer to the second 80-bit value.
3979;
3980%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3981BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3982 PROLOGUE_4_ARGS
3983 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
3984
3985 fninit ; Start from an initialized host FPU.
3986 fld tword [A3] ; Pushed first, so the second value ends up in ST1.
3987 fld tword [A2] ; Pushed last, so the first value ends up in ST0.
3988 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3989 %1 st0, st1
3990
3991 fnstsw word [A1]
3992
3993 fninit ; Reset the FPU again before returning.
3994 add xSP, 20h
3995 EPILOGUE_4_ARGS
3996ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3997%endmacro
3998
3999IEMIMPL_FPU_R80_BY_R80_FSW fcom
4000IEMIMPL_FPU_R80_BY_R80_FSW fucom
4001
4002
4003;;
4004; FPU instruction working on two 80-bit floating point values,
4005; returning FSW and EFLAGS (eax).
4006;
4007; @param 1 The instruction
4008;
4009; @returns EFLAGS in EAX.
4010; @param A0 FPU context (fxsave).
4011; @param A1 Pointer to a uint16_t for the resulting FSW.
4012; @param A2 Pointer to the first 80-bit value.
4013; @param A3 Pointer to the second 80-bit value.
4014;
4015%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
4016BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
4017 PROLOGUE_4_ARGS
4018 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
4019
4020 fninit ; Start from an initialized host FPU.
4021 fld tword [A3] ; Pushed first, so the second value ends up in ST1.
4022 fld tword [A2] ; Pushed last, so the first value ends up in ST0.
4023 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4024 %1 st1
4025
4026 fnstsw word [A1]
4027 pushf ; Capture the flags right away, before 'add xSP' below changes them.
4028 pop xAX ; Return value = the flags pushed above.
4029
4030 fninit ; Reset the FPU again before returning.
4031 add xSP, 20h
4032 EPILOGUE_4_ARGS
4033ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4034%endmacro
4035
4036IEMIMPL_FPU_R80_BY_R80_EFL fcomi
4037IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4038
4039
4040;;
4041; FPU instruction working on one 80-bit floating point value.
4042;
; Emits iemAImpl_<instruction>_r80, which returns both the 80-bit result and
; the resulting FSW in the IEMFPURESULT structure.
;
4043; @param 1 The instruction
4044;
4045; @param A0 FPU context (fxsave).
4046; @param A1 Pointer to a IEMFPURESULT for the output.
4047; @param A2 Pointer to the 80-bit value.
4048;
4049%macro IEMIMPL_FPU_R80 1
4050BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4051 PROLOGUE_3_ARGS
4052 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
4053
4054 fninit ; Start from an initialized host FPU.
4055 fld tword [A2] ; ST0 = the operand.
4056 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4057 %1
4058
4059 fnstsw word [A1 + IEMFPURESULT.FSW]
4060 fnclex ; Clear the exception flags before storing the result.
4061 fstp tword [A1 + IEMFPURESULT.r80Result]
4062
4063 fninit ; Reset the FPU again before returning.
4064 add xSP, 20h
4065 EPILOGUE_3_ARGS
4066ENDPROC iemAImpl_ %+ %1 %+ _r80
4067%endmacro
4068
4069IEMIMPL_FPU_R80 fchs
4070IEMIMPL_FPU_R80 fabs
4071IEMIMPL_FPU_R80 f2xm1
4072IEMIMPL_FPU_R80 fsqrt
4073IEMIMPL_FPU_R80 frndint
4074IEMIMPL_FPU_R80 fsin
4075IEMIMPL_FPU_R80 fcos
4076
4077
4078;;
4079; FPU instruction working on one 80-bit floating point value, only
4080; returning FSW.
4081;
4082; @param 1 The instruction
4083; @param 2 Non-zero to also restore FTW (for ST0 only, so that an empty
; guest ST0 is seen as empty by the instruction).
4084;
4085; @param A0 FPU context (fxsave).
4086; @param A1 Pointer to a uint16_t for the resulting FSW.
4087; @param A2 Pointer to the 80-bit value.
4088;
4089%macro IEMIMPL_FPU_R80_FSW 2
4090BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4091 PROLOGUE_3_ARGS
4092 sub xSP, 20h ; Scratch area for the FPU environment image used by the macros below.
4093
4094 fninit ; Start from an initialized host FPU.
4095 fld tword [A2] ; ST0 = the operand.
4096%if %2 != 0
4097 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
4098%else
4099 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4100%endif
4101 %1
4102
4103 fnstsw word [A1]
4104
4105 fninit ; Reset the FPU again before returning.
4106 add xSP, 20h
4107 EPILOGUE_3_ARGS
4108ENDPROC iemAImpl_ %+ %1 %+ _r80
4109%endmacro
4110
4111IEMIMPL_FPU_R80_FSW ftst, 0
4112IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4113
4114
4115
4116;;
4117; FPU instruction loading an 80-bit floating point constant.
4118;
; Emits iemAImpl_<instruction>, which returns the constant and the resulting
; FSW in the IEMFPURESULT structure.
;
4119; @param 1 The instruction
4120;
4121; @param A0 FPU context (fxsave).
4122; @param A1 Pointer to a IEMFPURESULT for the output.
4123;
4124%macro IEMIMPL_FPU_R80_CONST 1
4125BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
4126 PROLOGUE_2_ARGS
4127 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
4128
4129 fninit ; Start from an initialized host FPU.
4130 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4131 %1
4132
4133 fnstsw word [A1 + IEMFPURESULT.FSW]
4134 fnclex ; Clear the exception flags before storing the result.
4135 fstp tword [A1 + IEMFPURESULT.r80Result]
4136
4137 fninit ; Reset the FPU again before returning.
4138 add xSP, 20h
4139 EPILOGUE_2_ARGS
4140ENDPROC iemAImpl_ %+ %1
4141%endmacro
4142
4143IEMIMPL_FPU_R80_CONST fld1
4144IEMIMPL_FPU_R80_CONST fldl2t
4145IEMIMPL_FPU_R80_CONST fldl2e
4146IEMIMPL_FPU_R80_CONST fldpi
4147IEMIMPL_FPU_R80_CONST fldlg2
4148IEMIMPL_FPU_R80_CONST fldln2
4149IEMIMPL_FPU_R80_CONST fldz
4150
4151
4152;;
4153; FPU instruction working on one 80-bit floating point value, outputting two.
4154;
4155; @param 1 The instruction
4156;
4157; @param A0 FPU context (fxsave).
4158; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4159; @param A2 Pointer to the 80-bit value.
4160;
4161%macro IEMIMPL_FPU_R80_R80 1
4162BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
4163 PROLOGUE_3_ARGS
4164 sub xSP, 20h ; Scratch area for the FPU environment image used by the macro below.
4165
4166 fninit ; Start from an initialized host FPU.
4167 fld tword [A2] ; ST0 = the operand.
4168 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
4169 %1
4170
4171 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
4172 fnclex ; Clear the exception flags before storing the results.
4173 fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; ST0 after the instruction is the second result.
4174 fnclex
4175 fstp tword [A1 + IEMFPURESULTTWO.r80Result1] ; The value below it is the first result.
4176
4177 fninit ; Reset the FPU again before returning.
4178 add xSP, 20h
4179 EPILOGUE_3_ARGS
4180ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
4181%endmacro
4182
4183IEMIMPL_FPU_R80_R80 fptan
4184IEMIMPL_FPU_R80_R80 fxtract
4185IEMIMPL_FPU_R80_R80 fsincos
4186
4187
4188
4189
4190;---------------------- SSE and MMX Operations ----------------------
4191
4192;; @todo what do we need to do for MMX?
; Hook macros placed around the body of each MMX worker; currently empty.
4193%macro IEMIMPL_MMX_PROLOGUE 0
4194%endmacro
4195%macro IEMIMPL_MMX_EPILOGUE 0
4196%endmacro
4197
4198;; @todo what do we need to do for SSE?
; Hook macros placed around the body of each SSE worker; currently empty.
4199%macro IEMIMPL_SSE_PROLOGUE 0
4200%endmacro
4201%macro IEMIMPL_SSE_EPILOGUE 0
4202%endmacro
4203
4204;; @todo what do we need to do for AVX?
; Hook macros for AVX workers; currently empty.
4205%macro IEMIMPL_AVX_PROLOGUE 0
4206%endmacro
4207%macro IEMIMPL_AVX_EPILOGUE 0
4208%endmacro
4209
4210
4211;;
4212; Media instruction working on two full sized registers.
4213;
; Emits iemAImpl_<instruction>_u128 (SSE) and, when parameter 2 is non-zero,
; also iemAImpl_<instruction>_u64 (MMX). The A0 argument is not referenced
; by the generated code.
;
4214; @param 1 The instruction
4215; @param 2 Whether there is an MMX variant (1) or not (0).
4216;
4217; @param A0 FPU context (fxsave).
4218; @param A1 Pointer to the first media register size operand (input/output).
4219; @param A2 Pointer to the second media register size operand (input).
4220;
4221; @todo r=aeichner Currently unused, can probably be removed.
4222;
4223%macro IEMIMPL_MEDIA_F2 2
4224%if %2 != 0
4225BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
4226 PROLOGUE_3_ARGS
4227 IEMIMPL_MMX_PROLOGUE
4228
4229 movq mm0, [A1]
4230 movq mm1, [A2]
4231 %1 mm0, mm1
4232 movq [A1], mm0 ; Write the result back over the first operand.
4233
4234 IEMIMPL_MMX_EPILOGUE
4235 EPILOGUE_3_ARGS
4236ENDPROC iemAImpl_ %+ %1 %+ _u64
4237%endif
4238
4239BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4240 PROLOGUE_3_ARGS
4241 IEMIMPL_SSE_PROLOGUE
4242
4243 movdqu xmm0, [A1] ; Unaligned loads/stores, no alignment requirement on the operands.
4244 movdqu xmm1, [A2]
4245 %1 xmm0, xmm1
4246 movdqu [A1], xmm0 ; Write the result back over the first operand.
4247
4248 IEMIMPL_SSE_EPILOGUE
4249 EPILOGUE_3_ARGS
4250ENDPROC iemAImpl_ %+ %1 %+ _u128
4251%endmacro
4252
4253;;
4254; Media instruction working on two full sized registers, but no FXSAVE state argument.
4255;
; Emits iemAImpl_<instruction>_u128 (SSE) and, when parameter 2 is non-zero,
; also iemAImpl_<instruction>_u64 (MMX).
;
4256; @param 1 The instruction
4257; @param 2 Whether there is an MMX variant (1) or not (0).
4258;
4259; @param A0 Pointer to the first media register size operand (input/output).
4260; @param A1 Pointer to the second media register size operand (input).
4261;
4262%macro IEMIMPL_MEDIA_OPT_F2 2
4263%if %2 != 0
4264BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4265 PROLOGUE_2_ARGS
4266 IEMIMPL_MMX_PROLOGUE
4267
4268 movq mm0, [A0]
4269 movq mm1, [A1]
4270 %1 mm0, mm1
4271 movq [A0], mm0 ; Write the result back over the first operand.
4272
4273 IEMIMPL_MMX_EPILOGUE
4274 EPILOGUE_2_ARGS
4275ENDPROC iemAImpl_ %+ %1 %+ _u64
4276%endif
4277
4278BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4279 PROLOGUE_2_ARGS
4280 IEMIMPL_SSE_PROLOGUE
4281
4282 movdqu xmm0, [A0] ; Unaligned loads/stores, no alignment requirement on the operands.
4283 movdqu xmm1, [A1]
4284 %1 xmm0, xmm1
4285 movdqu [A0], xmm0 ; Write the result back over the first operand.
4286
4287 IEMIMPL_SSE_EPILOGUE
4288 EPILOGUE_2_ARGS
4289ENDPROC iemAImpl_ %+ %1 %+ _u128
4290%endmacro
4291
4292IEMIMPL_MEDIA_OPT_F2 pshufb, 1
4293IEMIMPL_MEDIA_OPT_F2 pand, 1
4294IEMIMPL_MEDIA_OPT_F2 pandn, 1
4295IEMIMPL_MEDIA_OPT_F2 por, 1
4296IEMIMPL_MEDIA_OPT_F2 pxor, 1
4297IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
4298IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
4299IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
4300IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0
4301IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
4302IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
4303IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
4304IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
4305IEMIMPL_MEDIA_OPT_F2 paddb, 1
4306IEMIMPL_MEDIA_OPT_F2 paddw, 1
4307IEMIMPL_MEDIA_OPT_F2 paddd, 1
4308IEMIMPL_MEDIA_OPT_F2 paddq, 1
4309IEMIMPL_MEDIA_OPT_F2 paddsb, 1
4310IEMIMPL_MEDIA_OPT_F2 paddsw, 1
4311IEMIMPL_MEDIA_OPT_F2 paddusb, 1
4312IEMIMPL_MEDIA_OPT_F2 paddusw, 1
4313IEMIMPL_MEDIA_OPT_F2 psubb, 1
4314IEMIMPL_MEDIA_OPT_F2 psubw, 1
4315IEMIMPL_MEDIA_OPT_F2 psubd, 1
4316IEMIMPL_MEDIA_OPT_F2 psubq, 1
4317IEMIMPL_MEDIA_OPT_F2 psubsb, 1
4318IEMIMPL_MEDIA_OPT_F2 psubsw, 1
4319IEMIMPL_MEDIA_OPT_F2 psubusb, 1
4320IEMIMPL_MEDIA_OPT_F2 psubusw, 1
4321IEMIMPL_MEDIA_OPT_F2 pmullw, 1
4322IEMIMPL_MEDIA_OPT_F2 pmulld, 0
4323IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
4324IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
4325IEMIMPL_MEDIA_OPT_F2 pminub, 1
4326IEMIMPL_MEDIA_OPT_F2 pminuw, 0
4327IEMIMPL_MEDIA_OPT_F2 pminud, 0
4328IEMIMPL_MEDIA_OPT_F2 pminsb, 0
4329IEMIMPL_MEDIA_OPT_F2 pminsw, 1
4330IEMIMPL_MEDIA_OPT_F2 pminsd, 0
4331IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
4332IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
4333IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
4334IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
4335IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
4336IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
4337IEMIMPL_MEDIA_OPT_F2 pabsb, 1
4338IEMIMPL_MEDIA_OPT_F2 pabsw, 1
4339IEMIMPL_MEDIA_OPT_F2 pabsd, 1
4340IEMIMPL_MEDIA_OPT_F2 psignb, 1
4341IEMIMPL_MEDIA_OPT_F2 psignw, 1
4342IEMIMPL_MEDIA_OPT_F2 psignd, 1
4343IEMIMPL_MEDIA_OPT_F2 phaddw, 1
4344IEMIMPL_MEDIA_OPT_F2 phaddd, 1
4345IEMIMPL_MEDIA_OPT_F2 phsubw, 1
4346IEMIMPL_MEDIA_OPT_F2 phsubd, 1
4347IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
4348IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
4349IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
4350IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
4351IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
4352IEMIMPL_MEDIA_OPT_F2 packsswb, 1
4353IEMIMPL_MEDIA_OPT_F2 packssdw, 1
4354IEMIMPL_MEDIA_OPT_F2 packuswb, 1
4355IEMIMPL_MEDIA_OPT_F2 packusdw, 0
4356IEMIMPL_MEDIA_OPT_F2 psllw, 1
4357IEMIMPL_MEDIA_OPT_F2 pslld, 1
4358IEMIMPL_MEDIA_OPT_F2 psllq, 1
4359IEMIMPL_MEDIA_OPT_F2 psrlw, 1
4360IEMIMPL_MEDIA_OPT_F2 psrld, 1
4361IEMIMPL_MEDIA_OPT_F2 psrlq, 1
4362IEMIMPL_MEDIA_OPT_F2 psraw, 1
4363IEMIMPL_MEDIA_OPT_F2 psrad, 1
4364IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
4365IEMIMPL_MEDIA_OPT_F2 pavgb, 1
4366IEMIMPL_MEDIA_OPT_F2 pavgw, 1
4367IEMIMPL_MEDIA_OPT_F2 psadbw, 1
4368IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
4369IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
4370IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
4371IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
4372IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
4373IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
4374IEMIMPL_MEDIA_OPT_F2 aesimc, 0
4375IEMIMPL_MEDIA_OPT_F2 aesenc, 0
4376IEMIMPL_MEDIA_OPT_F2 aesdec, 0
4377IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
4378IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
4379IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
4380IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
4381IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
4382IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
4383IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4384
4385
;;
; Template for media instructions that combine a full sized register with the
; lower half of a second register (punpckl*).
;
; Emits iemAImpl_<insn>_u64 (MMX, optional) and iemAImpl_<insn>_u128 (SSE).
;
; @param    1       The instruction mnemonic.
; @param    2       Non-zero to also emit the MMX (_u64) worker, 0 to omit it.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second operand (input); it is loaded at full
;                   register width here.
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; mm0 = destination operand
        movq    mm1, [A1]               ; mm1 = source operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; xmm0 = destination operand
        movdqu  xmm1, [A1]              ; xmm1 = source operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4429
4430
;;
; Media instruction working on two half sized input registers (lower half) and a
; full sized destination register (vpunpckl*).
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4475
4476
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; The worker bodies are the same as for the low-half template, so this simply
; forwards to IEMIMPL_MEDIA_F1L1.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper so these agree with the vpunpckh*
; instantiations of IEMIMPL_MEDIA_F1H1H1; the generated code is unchanged.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
4495
4496
;;
; Template for AVX media instructions taking the upper halves of two source
; registers and producing a full sized destination (vpunpckh*).
;
; Forwards to IEMIMPL_MEDIA_F1L1L1, whose worker bodies are reused as-is.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized source operand; only the upper
;                   half is used as input, but it is loaded in full.
; @param    A2      Pointer to the second full sized source operand; only the upper
;                   half is used as input, but it is loaded in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
        IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4517
4518
4519;
4520; Shufflers with evil 8-bit immediates.
4521;
4522
;;
; PSHUFW (MMX) worker.
;
; The shuffle control is an instruction immediate, so one 'pshufw + ret' stub
; is generated for each of the 256 possible values and the right stub is
; called through IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate selecting the shuffle.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (start from the source value, like the SSE/AVX workers)
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5 ; presumably 5 = bytes per stub (pshufw + ret) - helper defined elsewhere
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:
ENDPROC iemAImpl_pshufw_u64
4545
4546
;;
; Template for the SSE shuffles taking an 8-bit immediate (pshufhw, pshuflw, pshufd).
;
; One '<insn> xmm0, xmm1, imm + ret' stub is generated per immediate value
; (0..255) and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate selecting the shuffle.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]              ; xmm1 = source
        movdqu  xmm0, xmm1              ; paranoia!
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0              ; store what the stub produced

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4576
4577
;;
; Template for the 256-bit AVX shuffles taking an 8-bit immediate
; (vpshufhw, vpshuflw, vpshufd).
;
; One '<insn> ymm0, ymm1, imm + ret' stub is generated per immediate value
; (0..255) and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate selecting the shuffle.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded body, so use the AVX prologue/epilogue pair

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4606
4607
4608;
4609; Shifts with evil 8-bit immediates.
4610;
4611
;;
; Template for the MMX shift-by-immediate workers (iemAImpl_<insn>_imm_u64).
;
; One '<insn> mm0, imm + ret' stub is generated per immediate value (0..255)
; and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the media register size operand (input/output).
; @param    A1      The 8-bit immediate shift count.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]               ; mm0 = value to shift
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
        movq    [A0], mm0               ; write the shifted value back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4644
4645
;;
; Template for the SSE shift-by-immediate workers (iemAImpl_<insn>_imm_u128).
;
; One '<insn> xmm0, imm + ret' stub is generated per immediate value (0..255)
; and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the media register size operand (input/output).
; @param    A1      The 8-bit immediate shift count.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; xmm0 = value to shift
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
        movdqu  [A0], xmm0              ; write the shifted value back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4680
4681
4682;
4683; Move byte mask.
4684;
4685
;;
; PMOVMSKB worker, MMX source.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the MMX sized source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1
        mov      [A0], T0               ; native register width store
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; 32-bit host: zero the upper dword explicitly
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4699
;;
; PMOVMSKB worker, XMM source.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the XMM sized source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1
        mov      [A0], T0               ; native register width store
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; 32-bit host: zero the upper dword explicitly
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4713
;;
; VPMOVMSKB worker, YMM source.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the YMM sized source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1
        mov       [A0], T0              ; native register width store
%ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0     ; 32-bit host: zero the upper dword explicitly
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4727
4728
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA); not
;                   touched by the worker bodies below.
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
; @todo r=aeichner Not used right now
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4768
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Emits iemAImpl_<insn>_u128 and, when requested, iemAImpl_<insn>_u256.
;
; @param    1       The instruction
; @param    2       Flag whether to add a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
4809
; Instantiations of IEMIMPL_MEDIA_OPT_F3 with both 128-bit and 256-bit workers.
IEMIMPL_MEDIA_OPT_F3 vpshufb,      1
IEMIMPL_MEDIA_OPT_F3 vpand,        1
IEMIMPL_MEDIA_OPT_F3 vpminub,      1
IEMIMPL_MEDIA_OPT_F3 vpminuw,      1
IEMIMPL_MEDIA_OPT_F3 vpminud,      1
IEMIMPL_MEDIA_OPT_F3 vpminsb,      1
IEMIMPL_MEDIA_OPT_F3 vpminsw,      1
IEMIMPL_MEDIA_OPT_F3 vpminsd,      1
IEMIMPL_MEDIA_OPT_F3 vpmaxub,      1
IEMIMPL_MEDIA_OPT_F3 vpmaxuw,      1
IEMIMPL_MEDIA_OPT_F3 vpmaxud,      1
IEMIMPL_MEDIA_OPT_F3 vpmaxsb,      1
IEMIMPL_MEDIA_OPT_F3 vpmaxsw,      1
IEMIMPL_MEDIA_OPT_F3 vpmaxsd,      1
IEMIMPL_MEDIA_OPT_F3 vpandn,       1
IEMIMPL_MEDIA_OPT_F3 vpor,         1
IEMIMPL_MEDIA_OPT_F3 vpxor,        1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqb,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqw,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqd,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpeqq,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtb,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtw,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtd,     1
IEMIMPL_MEDIA_OPT_F3 vpcmpgtq,     1
IEMIMPL_MEDIA_OPT_F3 vpaddb,       1
IEMIMPL_MEDIA_OPT_F3 vpaddw,       1
IEMIMPL_MEDIA_OPT_F3 vpaddd,       1
IEMIMPL_MEDIA_OPT_F3 vpaddq,       1
IEMIMPL_MEDIA_OPT_F3 vpsubb,       1
IEMIMPL_MEDIA_OPT_F3 vpsubw,       1
IEMIMPL_MEDIA_OPT_F3 vpsubd,       1
IEMIMPL_MEDIA_OPT_F3 vpsubq,       1
IEMIMPL_MEDIA_OPT_F3 vpacksswb,    1
IEMIMPL_MEDIA_OPT_F3 vpackssdw,    1
IEMIMPL_MEDIA_OPT_F3 vpackuswb,    1
IEMIMPL_MEDIA_OPT_F3 vpackusdw,    1
IEMIMPL_MEDIA_OPT_F3 vpmullw,      1
IEMIMPL_MEDIA_OPT_F3 vpmulld,      1
IEMIMPL_MEDIA_OPT_F3 vpmulhw,      1
IEMIMPL_MEDIA_OPT_F3 vpmulhuw,     1
IEMIMPL_MEDIA_OPT_F3 vpavgb,       1
IEMIMPL_MEDIA_OPT_F3 vpavgw,       1
IEMIMPL_MEDIA_OPT_F3 vpsignb,      1
IEMIMPL_MEDIA_OPT_F3 vpsignw,      1
IEMIMPL_MEDIA_OPT_F3 vpsignd,      1
IEMIMPL_MEDIA_OPT_F3 vphaddw,      1
IEMIMPL_MEDIA_OPT_F3 vphaddd,      1
IEMIMPL_MEDIA_OPT_F3 vphsubw,      1
IEMIMPL_MEDIA_OPT_F3 vphsubd,      1
IEMIMPL_MEDIA_OPT_F3 vphaddsw,     1
IEMIMPL_MEDIA_OPT_F3 vphsubsw,     1
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw,   1
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw,    1
IEMIMPL_MEDIA_OPT_F3 vpsadbw,      1
IEMIMPL_MEDIA_OPT_F3 vpmuldq,      1
IEMIMPL_MEDIA_OPT_F3 vpmuludq,     1
IEMIMPL_MEDIA_OPT_F3 vunpcklps,    1
IEMIMPL_MEDIA_OPT_F3 vunpcklpd,    1
IEMIMPL_MEDIA_OPT_F3 vunpckhps,    1
IEMIMPL_MEDIA_OPT_F3 vunpckhpd,    1
IEMIMPL_MEDIA_OPT_F3 vpsubsb,      1
IEMIMPL_MEDIA_OPT_F3 vpsubsw,      1
IEMIMPL_MEDIA_OPT_F3 vpsubusb,     1
IEMIMPL_MEDIA_OPT_F3 vpsubusw,     1
IEMIMPL_MEDIA_OPT_F3 vpaddusb,     1
IEMIMPL_MEDIA_OPT_F3 vpaddusw,     1
IEMIMPL_MEDIA_OPT_F3 vpaddsb,      1
IEMIMPL_MEDIA_OPT_F3 vpaddsw,      1
IEMIMPL_MEDIA_OPT_F3 vpermilps,    1
IEMIMPL_MEDIA_OPT_F3 vpermilpd,    1
IEMIMPL_MEDIA_OPT_F3 vpmaddwd,     1
IEMIMPL_MEDIA_OPT_F3 vpsrlvd,      1
IEMIMPL_MEDIA_OPT_F3 vpsrlvq,      1
IEMIMPL_MEDIA_OPT_F3 vpsravd,      1
IEMIMPL_MEDIA_OPT_F3 vpsllvd,      1
IEMIMPL_MEDIA_OPT_F3 vpsllvq,      1

; Instantiations of IEMIMPL_MEDIA_OPT_F3 with the 128-bit worker only.
IEMIMPL_MEDIA_OPT_F3 vaesenc,      0
IEMIMPL_MEDIA_OPT_F3 vaesenclast,  0
IEMIMPL_MEDIA_OPT_F3 vaesdec,      0
IEMIMPL_MEDIA_OPT_F3 vaesdeclast,  0
4892
4893
;;
; VAESIMC instruction.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the second media register size operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vaesimc_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded body, so use the AVX prologue/epilogue pair

        ; The destination is output only, so only the source needs loading.
        ; VEX moves keep legacy SSE and VEX encodings from being mixed here.
        vmovdqu xmm1, [A1]
        vaesimc xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vaesimc_u128
4912
4913
;;
; VAESKEYGENASSIST instruction.
;
; One 'vaeskeygenassist + ret + int3' stub is generated per immediate value
; (0..255) and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the second media register size operand (input).
; @param    A2      8-bit immediate for the round constant.
;
BEGINPROC_FASTCALL iemAImpl_vaeskeygenassist_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        ; The destination is output only, so only the source needs loading.
        ; VEX moves keep legacy SSE and VEX encodings from being mixed here.
        vmovdqu xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vaeskeygenassist xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vaeskeygenassist_u128
4944
4945
;;
; VPERMQ worker (256-bit, immediate form).
;
; One 'vpermq ymm0, ymm1, imm + ret + int3' stub is generated per immediate
; value (0..255) and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate passed to the instruction.
;
BEGINPROC_FASTCALL iemAImpl_vpermq_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]              ; ymm1 = source
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], ymm0              ; store what the stub produced

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermq  ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermq_u256
4975
4976
;;
; VPERMPD worker (256-bit, immediate form).
;
; One 'vpermpd ymm0, ymm1, imm + ret + int3' stub is generated per immediate
; value (0..255) and selected at runtime via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate passed to the instruction.
;
BEGINPROC_FASTCALL iemAImpl_vpermpd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]              ; ymm1 = source
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        vmovdqu [A0], ymm0              ; store what the stub produced

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpermpd ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpermpd_u256
5006
5007
;;
; VPERMPS worker (256-bit).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the operand passed as the instruction's first source (input).
; @param    A2      Pointer to the operand passed as the instruction's second source (input).
;
BEGINPROC_FASTCALL iemAImpl_vpermps_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source, also receives the result
        vmovdqu ymm1, [A2]              ; second source
        vpermps ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermps_u256
5027
5028
;;
; VPERMD worker (256-bit).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the operand passed as the instruction's first source (input).
; @param    A2      Pointer to the operand passed as the instruction's second source (input).
;
BEGINPROC_FASTCALL iemAImpl_vpermd_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]              ; first source, also receives the result
        vmovdqu ymm1, [A2]              ; second source
        vpermd  ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vpermd_u256
5048
5049
;;
; Media instruction working on one full sized source register, one full sized destination
; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
; of either 16, 32, or 64, it acts like the max shift size)
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source operand holding the shift count
;                   (input); always loaded as 128 bits.
;
%macro IEMIMPL_SHIFT_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu xmm1, [A2]              ; the count operand stays XMM sized in the 256-bit form
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
5099
5100
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Emits iemAImpl_<insn>_u128 and, when requested, iemAImpl_<insn>_u256.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
5143
5144
;
; The SSE 4.2 crc32
;
; @param    A0      Pointer to the 32-bit destination (read as the input CRC,
;                   written with the updated CRC).
; @param    A1      The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_8             ; fold in the 8-bit source
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
5160
;; CRC32 with a 16-bit source: A0 = pointer to the 32-bit CRC (in/out), A1 = source value.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_16            ; fold in the 16-bit source
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
5170
;; CRC32 with a 32-bit source: A0 = pointer to the 32-bit CRC (in/out), A1 = source value.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_32            ; fold in the 32-bit source
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
5180
%ifdef RT_ARCH_AMD64
;; CRC32 with a 64-bit source (AMD64 hosts only): A0 = pointer to the 32-bit CRC (in/out), A1 = source value.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0, A1                  ; 64-bit operand form
        mov     [A0], T0_32             ; only the low 32 bits are stored back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
5192
5193
;
; PTEST (SSE 4.1)
;
; Only EFLAGS are produced; neither operand is written.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        ; Save the flags right after the test, before anything else can modify them.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
5213
;;
; VPTEST (AVX, 256-bit). Only EFLAGS are produced; neither operand is written.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded body, so use the AVX prologue/epilogue pair

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        ; Save the flags right after the test, before anything else can modify them.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
5226
5227
;;
; Template for the vtestp{s,d} instructions.
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256. Only EFLAGS are
; produced; neither operand is written.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
%macro IEMIMPL_VTESTP_S_D 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A0]
        vmovdqu xmm1, [A1]
        %1      xmm0, xmm1
        ; Save the flags right after the test, before anything else can modify them.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        %1      ymm0, ymm1
        ; Save the flags right after the test, before anything else can modify them.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_VTESTP_S_D vtestps
IEMIMPL_VTESTP_S_D vtestpd
5266
5267
;;
; Template for the [v]pmov{s,z}x* instructions
;
; Emits iemAImpl_<insn>_u128 (SSE), iemAImpl_v<insn>_u128 and
; iemAImpl_v<insn>_u256 (AVX).
;
; @param    1       The instruction (without the 'v' prefix).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      The source operand value (input) for the _u128 workers; for
;                   the _u256 worker it is a pointer to the 128-bit source operand.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; legacy SSE store to match the non-VEX instruction

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; here the source is passed by reference
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
5327
5328
;;
; Loads a working MXCSR for executing the emulated instruction: the guest's
; FZ, RC and DAZ bits with all exceptions masked. The current (host) MXCSR is
; saved on the stack first.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the register holding the guest's MXCSR.
;
; @note     Leaves the stack pointer lowered by 4; SSE_AVX_ST_MXCSR reloads the
;           saved value from there and restores the stack pointer.
;
%macro SSE_AVX_LD_MXCSR 1
        ; Save the current MXCSR; this slot stays allocated on exit.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Working value = guest FZ/RC/DAZ with every exception masked.
        mov     T0_32, %1
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr takes a memory operand, so go via a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5348
5349
;;
; Computes the new guest MXCSR from the exception flags raised by the emulated
; instruction, then restores the MXCSR value saved by SSE_AVX_LD_MXCSR.
;
; @uses     4 bytes of stack (temporarily) to read back MXCSR, T1, T2.
; @param    1       Expression giving the register to return the new guest's MXCSR value.
; @param    2       Expression giving the register holding original guest's MXCSR value.
;
; @note     Restores the stack pointer (pops the slot left by SSE_AVX_LD_MXCSR).
; @note     Defines the local label .excp_masked, so it can be expanded only once
;           per procedure.
;
%macro SSE_AVX_ST_MXCSR 2
        ; Fetch the MXCSR as left behind by the instruction.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     %1, [xSP]
        add     xSP, 4

        ; Keep only the exception status flags.
        and     %1, X86_MXCSR_XCPT_FLAGS

        ;
        ; All exceptions were masked while executing, so PE may have been set
        ; alongside OE/UE. If the guest has OE/UE unmasked, real hardware would
        ; have raised the exception with only OE/UE set, so PE must be dropped.
        ;
        mov     T2_32, %1
        and     T2_32, X86_MXCSR_OE | X86_MXCSR_UE      ; T2 = OE/UE raised
        mov     T1_32, %2
        and     T1_32, X86_MXCSR_OM | X86_MXCSR_UM      ; T1 = guest OM/UM mask bits...
        shr     T1_32, X86_MXCSR_XCPT_MASK_SHIFT        ; ...moved down to the flag positions
        not     T1_32                                   ; set where the guest has them unmasked
        and     T2_32, T1_32                            ; raised and unmasked
        test    T2_32, X86_MXCSR_OE | X86_MXCSR_UE
        jz      .excp_masked
        btr     %1, X86_MXCSR_PE_BIT
.excp_masked:
        ; Merge the status bits into the original guest MXCSR value.
        or      %1, %2

        ; Reload the MXCSR saved by SSE_AVX_LD_MXCSR and release its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5388
5389
;;
; Floating point instruction working on two full sized registers.
;
; Always emits the SSE worker iemAImpl_<insn>_u128. Depending on parameter 2
; it also emits the AVX workers iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256.
;
; @param    1       The instruction
; @param    2       Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @returns  R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
        ; AVX workers for instructions with a three operand VEX form.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
        ; AVX workers for instructions with a two operand VEX form.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
5480
; Binary packed floating point workers; AVX variants use the three operand form.
IEMIMPL_FP_F2 addps,     3
IEMIMPL_FP_F2 addpd,     3
IEMIMPL_FP_F2 mulps,     3
IEMIMPL_FP_F2 mulpd,     3
IEMIMPL_FP_F2 subps,     3
IEMIMPL_FP_F2 subpd,     3
IEMIMPL_FP_F2 minps,     3
IEMIMPL_FP_F2 minpd,     3
IEMIMPL_FP_F2 divps,     3
IEMIMPL_FP_F2 divpd,     3
IEMIMPL_FP_F2 maxps,     3
IEMIMPL_FP_F2 maxpd,     3
IEMIMPL_FP_F2 haddps,    3
IEMIMPL_FP_F2 haddpd,    3
IEMIMPL_FP_F2 hsubps,    3
IEMIMPL_FP_F2 hsubpd,    3
IEMIMPL_FP_F2 addsubps,  3
IEMIMPL_FP_F2 addsubpd,  3


;;
; These are actually unary operations, but to keep things simple they are
; treated as binary for now, so the output result is always in sync with the
; register where the result might get written to.
IEMIMPL_FP_F2 sqrtps,    2
IEMIMPL_FP_F2 rsqrtps,   2
IEMIMPL_FP_F2 sqrtpd,    2
IEMIMPL_FP_F2 rcpps,     2
IEMIMPL_FP_F2 cvtdq2ps,  2
IEMIMPL_FP_F2 cvtps2dq,  2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvtdq2pd,  0 ; @todo AVX variants due to register size differences missing right now
5514
5515
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Emits two workers per instruction:
;   - iemAImpl_<insn>_u128_r32   using the SSE encoding, and
;   - iemAImpl_v<insn>_u128_r32  using the three-operand AVX encoding.
;
; @param    1       The instruction (SSE mnemonic, the AVX worker prepends 'v').
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; full 128-bit first operand
        movd    xmm1, [A3]              ; 32-bit second operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]              ; full 128-bit first operand
        vmovd   xmm1, [A3]              ; 32-bit second operand
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
5569
5570
;;
; Scalar double precision template: one full media register operand combined
; with a 64-bit floating point value.
;
; Generates iemAImpl_<insn>_u128_r64 (SSE encoding) and
; iemAImpl_v<insn>_u128_r64 (three-operand AVX encoding).
;
; @param    1       The instruction (SSE mnemonic, the AVX worker prepends 'v').
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Fetch both inputs, run the instruction, hand back the 128-bit result.
        movq    xmm1, [A3]
        movdqu  xmm0, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Same as above, VEX encoded.
        vmovq   xmm1, [A3]
        vmovdqu xmm0, [A2]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5622
5623
;;
; Macro for the cvtpd2ps/cvttpd2dq/cvtpd2dq style conversion instructions.
;
; Emits three workers per instruction:
;   - iemAImpl_<insn>_u128        SSE form, destination is also the first input.
;   - iemAImpl_v<insn>_u128_u128  AVX form, 128-bit source.
;   - iemAImpl_v<insn>_u128_u256  AVX form, 256-bit source.
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input);
;                   this is the only source operand of the AVX workers.
; @param    A3      Pointer to the second media register size operand (input),
;                   SSE worker only.
;
; NOTE(review): the AVX workers never touch A3 yet use the four-argument
;               prologue/epilogue - confirm against the C prototypes.
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm1, [A2]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

 %if %2 == 0
        ; Narrowing conversion: all 256 source bits are consumed and the
        ; result is 128 bits wide, so only 16 bytes may be written back.
        vmovdqu ymm1, [A2]
        v %+ %1 xmm0, ymm1
        vmovdqu [A1], xmm0
 %else
        ; Widening conversion: 128-bit source, 256-bit result.
        vmovdqu xmm1, [A2]
        v %+ %1 ymm0, xmm1
        vmovdqu [A1], ymm0
 %endif

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps,  0
IEMIMPL_CVT_F2 cvttpd2dq, 0
IEMIMPL_CVT_F2 cvtpd2dq,  0

;IEMIMPL_CVT_F2 cvtps2pd,  1 - inefficient.
5690
;;
; cvtps2pd instruction, hand written rather than generated by IEMIMPL_CVT_F2.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtps2pd_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Convert straight from memory; no need to stage the source in a register.
        cvtps2pd        xmm0, [A2]
        movdqu          [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtps2pd_u128
5703
5704
;;
; vcvtps2pd, 128-bit result from a 64-bit source.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 128 bits written).
; @param    A2      Pointer to the source operand (input, 64 bits read).
;
BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u128_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Source is read directly from memory as a qword.
        vcvtps2pd       xmm0, qword [A2]
        movdqu          [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtps2pd_u128_u64
5725
5726
;;
; vcvtps2pd instruction - 256-bit variant.
;
; Converts the 128-bit source into a 256-bit result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 256 bits written).
; @param    A2      Pointer to the second operand (input, 128 bits read).
;
BEGINPROC_FASTCALL iemAImpl_vcvtps2pd_u256_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Load the source into the register the conversion actually reads.
        ; (The load used to target xmm0 while the conversion read the
        ; uninitialised xmm1.)
        vmovdqu         xmm1, [A2]
        vcvtps2pd       ymm0, xmm1
        vmovdqu         [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtps2pd_u256_u128
5748
5749
;;
; vcvtdq2pd, 128-bit result from a 64-bit source.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 128 bits written).
; @param    A2      Pointer to the source operand (input, 64 bits read).
;
BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u128_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Source is read directly from memory as a qword.
        vcvtdq2pd       xmm0, qword [A2]
        movdqu          [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtdq2pd_u128_u64
5770
5771
;;
; vcvtdq2pd instruction - 256-bit variant.
;
; Converts the 128-bit source into a 256-bit result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 256 bits written).
; @param    A2      Pointer to the second operand (input, 128 bits read).
;
BEGINPROC_FASTCALL iemAImpl_vcvtdq2pd_u256_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Load the source into the register the conversion actually reads.
        ; (The load used to target xmm0 while the conversion read the
        ; uninitialised xmm1.)
        vmovdqu         xmm1, [A2]
        vcvtdq2pd       ymm0, xmm1
        vmovdqu         [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vcvtdq2pd_u256_u128
5793
5794
;;
; shufps with an 8-bit immediate.
;
; The immediate is only known at run time, so one stub is generated per
; possible immediate value and the matching one is called via a jump table.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        movzx   A2, A2_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 stubs; each is padded with an int3 after the ret.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_shufps_u128
5825
5826
;;
; shufpd with an 8-bit immediate.
;
; The immediate is only known at run time, so one stub is generated per
; possible immediate value and the matching one is called via a jump table.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        movzx   A2, A2_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 stubs (no int3 padding here, unlike the shufps worker).
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_shufpd_u128
5856
5857
;;
; Template for vshufps/vshufpd with an 8-bit immediate.
;
; Produces iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.  Each worker owns a
; table of 256 stubs (one per immediate value) and calls the one selected by A3.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm1, [A2]
        movdqu  xmm0, [A1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; 128-bit stubs.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A2]
        vmovdqu ymm0, [A1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; 256-bit stubs.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5918
5919
;;
; One of the [p]blendv{b,ps,pd} variants
;
; These instructions take their mask implicitly in XMM0, so the mask value is
; loaded there before the instruction is executed.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5948
5949
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; explicit mask operand
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]              ; explicit mask operand
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE; pair the prologue with its epilogue
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5992
5993
;;
; palignr mm1, mm2/m64 instruction (MMX register form).
;
; The immediate is only known at run time, so one stub is generated per
; possible immediate value and the matching one is called via a jump table.
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      The second register sized operand (input), passed by value.
; @param    A2      The 8-bit immediate.
;
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, A1
        movq    mm0, [A0]
        movzx   A2, A2_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 stubs.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_palignr_u64
6022
6023
;;
; Template for SSE instructions of the form
;      xxx xmm1, xmm2, imm8
; whose encoding takes up 6 bytes.
;
; Produces iemAImpl_<insn>_u128 with a table of 256 stubs, one per immediate
; value; the stub selected by A2 is called.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        movzx   A2, A2_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub layout: instruction, ret, int3.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
6068
6069
;;
; Template for AVX instructions of the form
;      xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8
; whose encoding takes up 6 bytes.
;
; Depending on the flags, iemAImpl_<insn>_u128 and/or iemAImpl_<insn>_u256 are
; produced.  Each worker has its own table of 256 stubs (one per immediate
; value) and calls the one selected by A3.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param    3       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm1, [A2]
        movdqu  xmm0, [A1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; 128-bit stubs: instruction, ret, int3.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A2]
        vmovdqu ymm0, [A1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; 256-bit stubs: instruction, ret, int3.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw,   1, 1
6147
6148
;;
; AVX instructions with 8-bit immediates of the form
;      xxx {x,y}mm1, {x,y}mm2, imm8.
; where the size of one jump table stub is given by macro parameter 4.
;
; Depending on the flags, iemAImpl_<insn>_imm_u128 and/or
; iemAImpl_<insn>_imm_u256 are produced.  Each worker has its own table of 256
; stubs (one per immediate value) and calls the one selected by A2.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param    3       Whether the instruction has a 256-bit variant (1) or not (0).
; @param    4       The number of bytes taken up by a single instance of the instruction.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
; @note     The workers take three arguments, so the three-argument
;           prologue/epilogue pair is used, like the other three-argument
;           imm8 workers in this file.
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS

        ; 128-bit stubs: instruction, ret, int3.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS

        ; 256-bit stubs: instruction, ret, int3.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq,   1, 1, 7
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq,   1, 1, 7
6219
6220
;;
; Source operand bundles handed to the pcmp{i,e}strX workers below.
; Need to move this as well somewhere better?
;

; Two 128-bit source operands (implicit length variants).
struc IEMPCMPISTRXSRC
    .uSrc1      resq 2
    .uSrc2      resq 2
endstruc

; Two 128-bit source operands followed by the 64-bit RAX and RDX inputs
; (explicit length variants).
struc IEMPCMPESTRXSRC
    .uSrc1      resq 2
    .uSrc2      resq 2
    .u64Rax     resq 1
    .u64Rdx     resq 1
endstruc
6235
;;
; Template for pcmpistri/vpcmpistri.
;
; Produces iemAImpl_<insn>_u128 with a table of 256 stubs, one per immediate
; value; the stub selected by A3 is called.  The instruction leaves its
; result in ECX, which is then returned to the caller.
;
; @param    1       The instruction name
;
; @return   R0_32   The new ECX value.
; @param    A0      Pointer to the EFLAGS register.
; @param    A1      Pointer to the first operand (input).
; @param    A2      Pointer to the second operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_V_CMPISTRI 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        movdqu  xmm1, [A2]
        movdqu  xmm0, [A1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8

        ; Capture the flags first, then pick up the index result.
        IEM_SAVE_FLAGS_OLD T2, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     R0_32, ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub layout: instruction, ret, int3.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_V_CMPISTRI pcmpistri
IEMIMPL_MEDIA_V_CMPISTRI vpcmpistri
6278
6279
;;
; The pcmpestri instruction.
;
; The explicit lengths are taken from the u64Rax/u64Rdx members of the source
; structure and loaded into xAX/xDX right before the stub is called.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8

        ; Set up the explicit length registers and call the selected stub.
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub layout: REX.W prefix byte, instruction, ret.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpestri_u128
6322
6323
;;
; The vpcmpestri instruction.
;
; The explicit lengths are taken from the u64Rax/u64Rdx members of the source
; structure and loaded into xAX/xDX right before the stub is called.  The
; stubs are emitted as raw bytes (see the comment on the db line).
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8

        ; Set up the explicit length registers and call the selected stub.
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub layout: hand-encoded instruction, ret, int3.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0xc4, 0xe3, 0xf9, 0x61, 0xc1, bImm ; vpcmpestri xmm0,xmm1,0x1 with VEX.W set
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpcmpestri_u128
6366
6367
;;
; Template for pcmpistrm/vpcmpistrm.
;
; Produces iemAImpl_<insn>_u128 with a table of 256 stubs, one per immediate
; value; the stub selected by A3 is called.  The sources are placed in
; XMM1/XMM2 and XMM0 is stored to the output afterwards.
;
; @param    1       The instruction name
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_V_CMPISTRM 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8

        ; Capture the flags first, then hand back the mask from XMM0.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub layout: instruction, ret, int3.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_V_CMPISTRM pcmpistrm
IEMIMPL_MEDIA_V_CMPISTRM vpcmpistrm
6408
6409
;;
; The pcmpestrm instruction.
;
; The explicit lengths are taken from the u64Rax/u64Rdx members of the source
; structure and loaded into xAX/xDX right before the stub is called.  The
; sources are placed in XMM1/XMM2 and XMM0 is stored to the output afterwards.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8

        ; Set up the explicit length registers and call the selected stub.
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub layout: REX.W prefix byte, instruction, ret.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpestrm_u128
6449
6450
;;
; The vpcmpestrm instruction.
;
; The explicit lengths are taken from the u64Rax/u64Rdx members of the source
; structure and loaded into xAX/xDX right before the stub is called.  The
; stubs are emitted as raw bytes (see the comment on the db line); XMM0 is
; stored to the output afterwards.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8

        ; Set up the explicit length registers and call the selected stub.
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub layout: hand-encoded instruction, ret, int3.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0xc4, 0xe3, 0xf9, 0x60, 0xca, bImm ; vpcmpestrm xmm1, xmm2, bImm with VEX.W set
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_vpcmpestrm_u128
6490
6491
;;
; Template for movmskp{s,d} and their AVX counterparts.
;
; Produces iemAImpl_<sse-insn>_u128, iemAImpl_<avx-insn>_u128 and
; iemAImpl_<avx-insn>_u256.  Every worker loads the source, extracts the mask
; into T0 and stores the low byte of it.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
        ; SSE, 128-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; AVX, 128-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

        ; AVX, 256-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6541
6542
;;
; Template for the single precision to integer conversions
; ([v]cvttss2si and [v]cvtss2si).
;
; Produces iemAImpl_<insn>_i32_r32 (32-bit result) and iemAImpl_<insn>_i64_r32
; (64-bit result).  The result must be stored before the MXCSR is fetched,
; since the conversion goes via T0 (which is rax, per the comments in the
; pcmpestrX workers) and the MXCSR is returned in R0_32.
;
; @param    1       Instruction name.
; @param    2       AVX or SSE; selects the IEMIMPL_<x>_PROLOGUE/EPILOGUE pair.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
%macro IEMIMPL_MEDIA_V_CVTXSS2SI 2
        ; 32-bit integer result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1      T0_32, [A2]
        mov     dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i32_r32


        ; 64-bit integer result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1      T0, [A2]
        mov     qword [A1], T0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i64_r32
%endmacro

IEMIMPL_MEDIA_V_CVTXSS2SI cvttss2si,  SSE
IEMIMPL_MEDIA_V_CVTXSS2SI vcvttss2si, AVX
IEMIMPL_MEDIA_V_CVTXSS2SI cvtss2si,   SSE
IEMIMPL_MEDIA_V_CVTXSS2SI vcvtss2si,  AVX
6587
6588
;;
; Template for the double precision to integer conversions
; ([v]cvttsd2si and [v]cvtsd2si).
;
; Produces iemAImpl_<insn>_i32_r64 (32-bit result) and iemAImpl_<insn>_i64_r64
; (64-bit result).  The result must be stored before the MXCSR is fetched,
; since the conversion goes via T0 (which is rax, per the comments in the
; pcmpestrX workers) and the MXCSR is returned in R0_32.
;
; @param    1       Instruction name.
; @param    2       AVX or SSE; selects the IEMIMPL_<x>_PROLOGUE/EPILOGUE pair.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
%macro IEMIMPL_MEDIA_V_CVTXSD2SI 2
        ; 32-bit integer result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i32_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1      T0_32, [A2]
        mov     dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i32_r64


        ; 64-bit integer result.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _i64_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_ %+ %2 %+ _PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        %1      T0, [A2]
        mov     qword [A1], T0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_ %+ %2 %+ _EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _i64_r64
%endmacro

IEMIMPL_MEDIA_V_CVTXSD2SI cvttsd2si,  SSE
IEMIMPL_MEDIA_V_CVTXSD2SI vcvttsd2si, AVX
IEMIMPL_MEDIA_V_CVTXSD2SI cvtsd2si,   SSE
IEMIMPL_MEDIA_V_CVTXSD2SI vcvtsd2si,  AVX
6633
6634
;;
; cvtsi2ss, converting a 32-bit integer.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 32 bits written).
; @param    A2      Pointer to the integer source operand (input, 32 bits read).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        ; Only the low dword of xmm0 is the result, so only that is stored.
        cvtsi2ss        xmm0, dword [A2]
        movd            dword [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
6655
6656
;;
; vcvtsi2ss instruction - 32-bit variant.
;
; Loads the 128-bit first source from [A2], converts the dword at [A3] to
; single precision into its low element and stores all 128 bits at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_vcvtsi2ss_u128_i32, 16
        PROLOGUE_4_ARGS                 ; A3 is used below, so the 4-argument pair is required
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; first source supplies the untouched upper elements
        vcvtsi2ss xmm0, xmm0, dword [A3]
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2ss_u128_i32
6679
6680
;;
; cvtsi2ss worker taking a 64-bit integer source.
;
; Converts the qword at [A2] to single precision and stores the low dword
; of the result register at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2ss xmm0, qword [A2]       ; xmm0[31:0] = (float)*(int64 *)A2
        movd    [A1], xmm0              ; movd to memory always stores 32 bits

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2ss_r32_i64
6701
6702
;;
; vcvtsi2ss instruction - 64-bit variant.
;
; Loads the 128-bit first source from [A2], converts the qword at [A3] to
; single precision into its low element and stores all 128 bits at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_vcvtsi2ss_u128_i64, 16
        PROLOGUE_4_ARGS                 ; A3 is used below, so the 4-argument pair is required
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; first source supplies the untouched upper elements
        vcvtsi2ss xmm0, xmm0, qword [A3]
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2ss_u128_i64
6725
6726
;;
; cvtsi2sd worker taking a 32-bit integer source.
;
; Converts the dword at [A2] to double precision and stores the low qword
; of the result register at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2sd xmm0, dword [A2]       ; xmm0[63:0] = (double)*(int32 *)A2
        movq    [A1], xmm0              ; store the 64-bit result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2sd_r64_i32
6747
6748
;;
; vcvtsi2sd instruction - 32-bit variant.
;
; Loads the 128-bit first source from [A2], converts the dword at [A3] to
; double precision into its low element and stores all 128 bits at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_vcvtsi2sd_u128_i32, 16
        PROLOGUE_4_ARGS                 ; A3 is used below, so the 4-argument pair is required
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; first source supplies the untouched upper element
        vcvtsi2sd xmm0, xmm0, dword [A3]
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2sd_u128_i32
6771
6772
;;
; cvtsi2sd worker taking a 64-bit integer source.
;
; Converts the qword at [A2] to double precision and stores the low qword
; of the result register at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2sd xmm0, qword [A2]       ; xmm0[63:0] = (double)*(int64 *)A2
        movq    [A1], xmm0              ; store the 64-bit result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_cvtsi2sd_r64_i64
6793
6794
;;
; vcvtsi2sd instruction - 64-bit variant.
;
; Loads the 128-bit first source from [A2], converts the qword at [A3] to
; double precision into its low element and stores all 128 bits at [A1].
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
; @param    A3      Pointer to the third operand (input).
;
BEGINPROC_FASTCALL  iemAImpl_vcvtsi2sd_u128_i64, 16
        PROLOGUE_4_ARGS                 ; A3 is used below, so the 4-argument pair is required
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; first source supplies the untouched upper element
        vcvtsi2sd xmm0, xmm0, qword [A3]
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcvtsi2sd_u128_i64
6817
6818
;;
; UCOMISS (SSE) - compare of two single precision scalars.
;
; Both operands are passed by value in general purpose registers, moved
; into xmm0/xmm1 and compared; the resulting flags are merged into *A1.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm1, A3_32             ; second operand
        movd    xmm0, A2_32             ; first operand
        ucomiss xmm0, xmm1
        ; Must directly follow the compare; movd above does not touch EFLAGS.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ucomiss_u128
6842
;;
; VUCOMISS (AVX) - compare of two single precision scalars.
;
; Same contract as iemAImpl_ucomiss_u128, using the VEX encoded instruction.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
; NOTE(review): uses the IEMIMPL_SSE_* prologue/epilogue pair although the
;               instruction is VEX encoded - confirm whether IEMIMPL_AVX_*
;               was intended.
;
BEGINPROC_FASTCALL  iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS                 ; must match PROLOGUE_4_ARGS above (was EPILOGUE_3_ARGS)
ENDPROC             iemAImpl_vucomiss_u128
6857
6858
;;
; UCOMISD (SSE) - compare of two double precision scalars.
;
; Both operands are passed by value in general purpose registers, moved
; into xmm0/xmm1 and compared; the resulting flags are merged into *A1.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3                ; second operand
        movq    xmm0, A2                ; first operand
        ucomisd xmm0, xmm1
        ; Must directly follow the compare; movq above does not touch EFLAGS.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ucomisd_u128
6882
;;
; VUCOMISD (AVX) - compare of two double precision scalars.
;
; Same contract as iemAImpl_ucomisd_u128, using the VEX encoded instruction.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
; NOTE(review): uses the IEMIMPL_SSE_* prologue/epilogue pair although the
;               instruction is VEX encoded - confirm whether IEMIMPL_AVX_*
;               was intended.
;
BEGINPROC_FASTCALL  iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3                ; second operand
        movq    xmm0, A2                ; first operand
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vucomisd_u128
6897
;;
; COMISS (SSE) - compare of two single precision scalars.
;
; Both operands are passed by value in general purpose registers, moved
; into xmm0/xmm1 and compared; the resulting flags are merged into *A1.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm1, A3_32             ; second operand
        movd    xmm0, A2_32             ; first operand
        comiss  xmm0, xmm1
        ; Must directly follow the compare; movd above does not touch EFLAGS.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_comiss_u128
6921
;;
; VCOMISS (AVX) - compare of two single precision scalars.
;
; Same contract as iemAImpl_comiss_u128, using the VEX encoded instruction.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
; NOTE(review): uses the IEMIMPL_SSE_* prologue/epilogue pair although the
;               instruction is VEX encoded - confirm whether IEMIMPL_AVX_*
;               was intended.
;
BEGINPROC_FASTCALL  iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm1, A3_32             ; second operand
        movd    xmm0, A2_32             ; first operand
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcomiss_u128
6936
6937
;;
; COMISD (SSE) - compare of two double precision scalars.
;
; Both operands are passed by value in general purpose registers, moved
; into xmm0/xmm1 and compared; the resulting flags are merged into *A1.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL  iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3                ; second operand
        movq    xmm0, A2                ; first operand
        comisd  xmm0, xmm1
        ; Must directly follow the compare; movq above does not touch EFLAGS.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_comisd_u128
6961
;;
; VCOMISD (AVX) - compare of two double precision scalars.
;
; Same contract as iemAImpl_comisd_u128, using the VEX encoded instruction.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
; NOTE(review): uses the IEMIMPL_SSE_* prologue/epilogue pair although the
;               instruction is VEX encoded - confirm whether IEMIMPL_AVX_*
;               was intended.
;
BEGINPROC_FASTCALL  iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3                ; second operand
        movq    xmm0, A2                ; first operand
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_vcomisd_u128
6976
6977
;;
; Two-source input blocks handed to the media workers below via A2.
; Need to move this as well somewhere better?
;

;; Pair of XMM sized sources.
struc IEMMEDIAF2XMMSRC
    .uSrc1      resd    4               ; 4 dwords = 16 bytes, first source
    .uSrc2      resd    4               ; 4 dwords = 16 bytes, second source
endstruc


;; Pair of YMM sized sources.
struc IEMMEDIAF2YMMSRC
    .uSrc1      resd    8               ; 8 dwords = 32 bytes, first source
    .uSrc2      resd    8               ; 8 dwords = 32 bytes, second source
endstruc
6991
6992
;;
; Worker template for SSE/AVX instructions carrying an 8-bit immediate:
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, xmm3, imm8.
; with the guest MXCSR loaded before and saved after the instruction.
;
; The immediate cannot be supplied at run time, so every procedure is
; followed by 256 stubs (.imm0 .. .imm255), one per immediate value, and
; IEMIMPL_CALL_JUMP_TABLE_TARGET is used to call the one selected by A3.
; All stubs of a table must therefore have the same size; that size is the
; last argument to IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction name.
; @param    2       Flag whether this instruction has a 256-bit AVX variant (1) or not (0).
; @param    3       Number of bytes for the encoding of the SSE variant + ret instruction (AVX is fixed to 6).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR 3
;
; Legacy SSE encoding, 128-bit.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, %3
        movdqu  [A1], xmm0              ; the stub leaves the result in xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128


;
; VEX encoding, 128-bit.
;
BEGINPROC_FASTCALL  iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        movdqu  [A1], xmm0              ; the stub leaves the result in xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u128

;
; VEX encoding, 256-bit (only when requested by parameter 2).
;
 %if %2 == 1
BEGINPROC_FASTCALL  iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        vmovdqu ymm0, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        vmovdqu [A1], ymm0              ; the stub leaves the result in ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpps, 1, 5
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmppd, 1, 6
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpss, 0, 6
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR cmpsd, 0, 6
7093
7094
;;
; Worker template for SSE/AVX instructions with 2 full sized operands and
; an 8-bit immediate:
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, imm8
; where the instruction encoding takes up 6 bytes and we need to load and
; save the MXCSR register.
;
; Each procedure is followed by 256 stubs (.imm0 .. .imm255), one per
; immediate value.  A stub is the 6 byte instruction, a ret and an int3
; filler, which gives the stub size of 8 handed to
; IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the second media register size operand (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 1
;
; Legacy SSE encoding, 128-bit.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2]              ; the only source operand
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0              ; the stub leaves the result in xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128

;
; VEX encoding, 128-bit.
;
BEGINPROC_FASTCALL  iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2]              ; the only source operand
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0              ; the stub leaves the result in xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u128

;
; VEX encoding, 256-bit.
;
BEGINPROC_FASTCALL  iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm1, [A2]              ; the only source operand
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0              ; the stub leaves the result in ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_AVX_INSN_F2_IMM8_MXCSR_6 roundpd
7189
7190
;;
; Worker template for SSE/AVX instructions with 3 full sized operands and
; an 8-bit immediate:
;       xxx     xmm1, xmm2, imm8.
;       vxxx    xmm1, xmm2, xmm3, imm8
; where the instruction encoding takes up 6 bytes and we need to load and
; save the MXCSR register.
;
; Each procedure is followed by 256 stubs (.imm0 .. .imm255), one per
; immediate value.  A stub is the 6 byte instruction, a ret and an int3
; filler, which gives the stub size of 8 handed to
; IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction name.
; @param    2       Flag whether to emit a 256-bit AVX variant (1) or not (0).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC/IEMMEDIAF2YMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 2
;
; Legacy SSE encoding, 128-bit: xmm0 is both first source and destination.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0              ; the stub leaves the result in xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_ %+ %1 %+ _u128


;
; VEX encoding, 128-bit: sources in xmm1/xmm2, destination xmm0.
;
BEGINPROC_FASTCALL  iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm2, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A1], xmm0              ; the stub leaves the result in xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 xmm0, xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u128


;
; VEX encoding, 256-bit (only when requested by parameter 2).
;
 %if %2 == 1
BEGINPROC_FASTCALL  iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm2, [A2 + IEMMEDIAF2YMMSRC.uSrc2]
        vmovdqu ymm1, [A2 + IEMMEDIAF2YMMSRC.uSrc1]
        movzx   A3, A3_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A1], ymm0              ; the stub leaves the result in ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        v %+ %1 ymm0, ymm1, ymm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundss, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 roundsd, 0
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dpps,    1
IEMIMPL_MEDIA_SSE_AVX_INSN_F3_IMM8_MXCSR_6 dppd,    0
7295
7296
;;
; Worker template for SSE instructions of the form
;       xxx     mm, xmm.
; that need the MXCSR register loaded and saved.
;
; The 128-bit source is read from [A2] and the 64-bit MMX result is
; written to [A1].
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; xmm0 = 128-bit source
        %1      mm0, xmm0               ; mm0  = converted result
        movq    [A1], mm0               ; *A1  = 64-bit result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
7327
;;
; Worker template for SSE instructions of the form
;       xxx     xmm, xmm/m64.
; that need the MXCSR register loaded and saved.
;
; The destination at [A1] is loaded first so that any part of it the
; instruction leaves alone is written back unchanged.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    mm0, A2                 ; mm0  = 64-bit source, passed by value
        movdqu  xmm0, [A1]              ; xmm0 = current destination value
        %1      xmm0, mm0
        movdqu  [A1], xmm0              ; write the full 128 bits back

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
7359
;;
; Worker template for SSE instructions of the form
;       xxx     mm, xmm/m64.
; that need the MXCSR register loaded and saved.
;
; The 64-bit source arrives by value in A2 and the 64-bit MMX result is
; written to [A1].
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm0, A2                ; xmm0[63:0] = source value
        %1      mm0, xmm0               ; mm0        = converted result
        movq    [A1], mm0               ; *A1        = 64-bit result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
7390
;;
; Worker template for all forms of RDRAND and RDSEED.
;
; Executes the instruction on the host, stores the value it produced at
; [A0] and merges the resulting flags into *A1.
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register receiving the value; its width selects the operand size.
; @param    3       The operand size in bits, used for the procedure name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2                ; mov leaves EFLAGS alone, so the flags from %1 survive
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF

        EPILOGUE_2_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
7415
7416
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The immediate cannot be supplied at run time, so the procedure is
; followed by 256 stubs (.imm0 .. .imm255), one per immediate value, and
; IEMIMPL_CALL_JUMP_TABLE_TARGET is used to call the one selected by A2.
; The stub size handed to it is 6.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL  iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second source
        movdqu  xmm0, [A0]              ; first source / destination
        movzx   A2, A2_8                ; must clear top bits
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0              ; the stub leaves the result in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC             iemAImpl_sha1rnds4_u128
7448
7449
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction reads XMM0 implicitly, so the caller supplied constants
; are placed there while the explicit operands use xmm1 and xmm2.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL  iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A1]              ; second source
        movdqu  xmm1, [A0]              ; first source / destination
        movdqu  xmm0, [A2]              ; implicit third operand
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_sha256rnds2_u128
7472
7473
;;
; 32-bit forms of ADCX and ADOX
;
; Computes A2_32 + [A1] + <carry-in flag> on the host and writes the sum
; back to [A1].  Only the flag given as macro parameter 2 is loaded into
; the host EFLAGS before, and taken from them after, the instruction.
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit the instruction uses as carry in/out.
;
; @returns  Updated EFLAGS.
; @param    A0      Incoming EFLAGS value (input).
; @param    A1      Pointer to the destination operand (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_3_ARGS                 ; three arguments (A0..A2), so use the 3-argument pair

        IEM_LOAD_FLAGS A0_32, %2, 0
        %1      A2_32, [A1]             ; A2_32 += [A1] + carry-in
        mov     [A1], A2_32             ; mov leaves EFLAGS alone, so the carry-out survives
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0

        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32
%endmacro
7494
;;
; 64-bit forms of ADCX and ADOX
;
; Computes A2 + [A1] + <carry-in flag> on the host and writes the sum back
; to [A1].  Only the flag given as macro parameter 2 is loaded into the
; host EFLAGS before, and taken from them after, the instruction.
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit the instruction uses as carry in/out.
;
; @returns  Updated EFLAGS.
; @param    A0      Incoming EFLAGS value (input).
; @param    A1      Pointer to the destination operand (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_3_ARGS                 ; three arguments (A0..A2), so use the 3-argument pair

        IEM_LOAD_FLAGS A0_32, %2, 0
        %1      A2, [A1]                ; A2 += [A1] + carry-in
        mov     [A1], A2                ; mov leaves EFLAGS alone, so the carry-out survives
        IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0

        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
7521
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette