VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 104627

Last change on this file since 104627 was 104521, checked in by vboxsync, 9 months ago

VMM/IEM: Corrected cvtps2pd memory access from 128 to 64 bits. bugref:9898 bugref:10683

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 205.6 KB
Line 
1; $Id: IEMAllAImpl.asm 104521 2024-05-06 14:15:45Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; This is handy for generating absolutely correct EFLAGS.
43;%define IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
44
45
;;
; Return from a fastcall function.
;
; Only 32-bit x86 Windows builds get the `ret imm16` form (callee removes %1
; bytes of stack arguments); every other configuration gets a plain `ret`.
;
; @param 1 The byte count to hand to `ret` on 32-bit x86 Windows.
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
60
;;
; Symbol name of a fastcall function.
;
; On 32-bit x86 Windows the name is pasted together from a_Prefix, the C name,
; an '@' and the argument byte count.  Everywhere else it is just NAME(a_Name).
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
75
;;
; Opens a fastcall procedure.
;
; Declares the NAME_FASTCALL() symbol through GLOBALNAME_RAW (as a hidden
; function) and follows it with IBT_ENDBRxx.
;
; @param 1 The function name (C).
; @param 2 The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
86
87
88;
89; We employ some macro assembly here to hide the calling convention differences.
90;
91%ifdef RT_ARCH_AMD64
; AMD64 prologue/epilogue macros.
;
; Nothing is saved or fetched from the stack here, so every prologue is empty
; and every epilogue is a plain `ret`.  The *_EX variants accept the byte count
; that the 32-bit x86 definitions pass to `ret`; on AMD64 it is ignored.
;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; Takes 0 or 1 parameters: the optional byte count keeps this macro
 ; call-compatible with the x86 definition and with the other AMD64 *_EX
 ; macros, while invocations without a parameter keep working.
 %macro EPILOGUE_1_ARGS_EX 0-1
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro
127
; AMD64 register aliases: A0..A3 are the argument registers of the selected
; 64-bit calling convention, T0..T2 are temporaries and R0 is the return value.
; Each alias comes in full, 32-bit, 16-bit and 8-bit width.

 ; GCC / System V argument order: rdi, rsi, rdx, rcx.
 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 ; Microsoft x64 argument order: rcx, rdx, r8, r9.
 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Temporaries.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10         ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

 ;
 ; Return value.  Same register as T0; the separate name just makes it
 ; obvious at the use site that a return value is being produced.
 ;
 %define R0         rax
 %define R0_32      eax
 %define R0_16      ax
 %define R0_8       al
195
196%else
197 ; x86
; 32-bit x86 prologue/epilogue macros.
;
; edi (T1) is saved in every prologue.  From three arguments on, ebx (A2) is
; saved and loaded from the first stack slot above the return address; with
; four arguments esi (A3) is saved and loaded from the second slot.  The *_EX
; epilogues take the number of stack bytes to remove on return.
;
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        EPILOGUE_1_ARGS_EX 0
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        EPILOGUE_2_ARGS_EX 0
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]          ; saved ebx + return address, then the 3rd argument
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0]     ; 12 bytes of saved registers + return address
        mov     esi, [esp + 12 + 4 + 4]
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro
258
; 32-bit x86 register aliases.  A0/A1 are ecx/edx; A2 (ebx) and A3 (esi) are
; filled in by the PROLOGUE_3_ARGS / PROLOGUE_4_ARGS macros.  A3 and T1 have no
; 8-bit alias here.
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
286%endif
287
288
;;
; Primes the host EFLAGS from the guest EFLAGS value in %1, but only when
; there are undefined flags (%3) or flags that must be loaded (%4).
;
; When IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS is defined, the modified,
; undefined and status flags are always merged into the host flags using
; pushf/popf instead.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; host flags onto the stack
        mov     T0_32, %1                                       ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop what the guest supplies
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only what the guest supplies
        or      [xSP], T0                                       ; merge guest into host
        popf                                                    ; and make it current

 %elif (%3 + %4) != 0
  ;
  ; Register-only sequences; these were measured to be faster than a
  ; pushf/popf merge on an Intel 10980XE.  ASSUMES T0_32 is eax!
  ;
  %if (%3 | %4) == X86_EFL_CF
        bt      %1, X86_EFL_CF_BIT                              ; CF = guest CF
  %elif (%3 | %4) == X86_EFL_OF
        mov     eax, %1
        shl     eax, 31 - X86_EFL_OF_BIT                        ; guest OF -> bit 31
        add     eax, 80000000h                                  ; overflows (sets OF) iff bit 31 was set
  %elif ((%3 | %4) & X86_EFL_OF) != 0
        mov     eax, %1
        xchg    al, ah                                          ; al = flag bits 8..15, ah = bits 0..7
        shl     al, 15 - X86_EFL_OF_BIT                         ; guest OF -> bit 7 of al
        add     al, 80h                                         ; sets OF iff bit 7 was set
        sahf                                                    ; remaining status flags from ah
  %else
        mov     eax, %1
        shl     eax, 8                                          ; low flag byte -> ah (xchg al, ah performs about the same)
        sahf
  %endif
 %endif
%endmacro
346
;;
; Unconditionally primes the host EFLAGS from the guest EFLAGS value in %1.
;
; When IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS is defined, the requested,
; undefined and status flags are merged into the host flags using pushf/popf.
; Otherwise a register-only sequence picked by (%2 | %3) is used.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; host flags onto the stack
        mov     T0_32, %1                                       ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop what the guest supplies
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only what the guest supplies
        or      [xSP], T0                                       ; merge guest into host
        popf                                                    ; and make it current

 %else
  ;
  ; Register-only sequences; these were measured to be faster than a
  ; pushf/popf merge on an Intel 10980XE.  ASSUMES T0_32 is eax!
  ;
  %if (%3 | %2) == X86_EFL_CF
        bt      %1, X86_EFL_CF_BIT                              ; CF = guest CF
  %elif (%3 | %2) == X86_EFL_OF
        mov     eax, %1
        shl     eax, 31 - X86_EFL_OF_BIT                        ; guest OF -> bit 31
        add     eax, 80000000h                                  ; overflows (sets OF) iff bit 31 was set
  %elif ((%3 | %2) & X86_EFL_OF) != 0
        mov     eax, %1
        xchg    al, ah                                          ; al = flag bits 8..15, ah = bits 0..7
        shl     al, 15 - X86_EFL_OF_BIT                         ; guest OF -> bit 7 of al
        add     al, 80h                                         ; sets OF iff bit 7 was set
        sahf                                                    ; remaining status flags from ah
  %else
        mov     eax, %1
        shl     eax, 8                                          ; low flag byte -> ah (xchg al, ah performs about the same)
        sahf
  %endif
 %endif
%endmacro
400
;;
; Merge incoming guest EFLAGS (%1) with host EFLAGS into EAX (T0).
;
; The result is the old guest value with the modified, undefined and zeroed
; bits cleared, OR'ed with the modified and undefined bits taken from the host
; EFLAGS.  Nothing is emitted when all three masks are zero.
;
; The OF merge uses a macro-local label, so the macro can be expanded any
; number of times within one procedure.
;
; @remarks Clobbers T0, T1, %1, stack.
; @param 1 The parameter (A0..A3) holding the OLD eflags value.  Clobbered.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
; NOTE(review): the trailing '0' after the parameter count is kept as found;
;               confirm whether it is intentional.
;
%macro IEM_SAVE_FLAGS_RETVAL 4 0
 %if (%2 | %3 | %4) != 0
        ; NOTE(review): T1 is loaded but not read again inside this macro.
        mov     T1_32, %1                                       ; flags
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     %1, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS)       ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                                            ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
   %elif 1                                                      ; The locked functions are generally faster on 10980XE with this approach
        lahf                                                    ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
        jno     %%of_is_clear
        or      eax, X86_EFL_OF
%%of_is_clear:
   %else
        pushf                                                   ; this is a bit slow
        pop     T0
   %endif
        and     %1, ~(%2 | %3 | %4)                             ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
  %endif
        or      T0_32, %1                                       ; combine the flags. ASSUMES T0 = eax!
        ;mov     %1, T0_32                                      ; save the flags.
 %endif
%endmacro
447
;;
; Rebuilds the EFLAGS value in %1 from the current CPU EFLAGS plus fixed
; clear/set masks:
;       %1 = (%1 & ~(%2 | %3)) | (cpu_eflags & %2) | %4
;
; Nothing is emitted when all three masks are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
;; @todo make it stuff the result into EAX?
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf                                   ; grab the CPU flags first, the ANDs below change them
        pop     T1
        mov     T0_32, %1                       ; incoming value
        and     T0_32, ~(%2 | %3)               ; drop modified + always-cleared bits
        and     T1_32, (%2)                     ; modified bits from the CPU
        or      T0_32, T1_32
  %if (%4) != 0
        or      T0_32, %4                       ; always-set bits
  %endif
        mov     %1, T0_32                       ; hand the result back in %1
 %endif
%endmacro
472
;;
; Builds a new EFLAGS value in EAX (T0) from the CPU EFLAGS (mask %2), a clear
; mask (%3), a sign bit taken from %4 (bit %5 - 1) and a parity bit looked up
; in g_afParity using the low byte of %6.
;
; @note %4 & %6 must not be RAX, EAX, or AX!  So, don't use with full MUL/IMUL.
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS, %1.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL 6
        pushf
        pop     T0
        and     %1, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)        ; drop modified, always-cleared and the two computed flags
        and     T0_32, (%2)                                     ; modified bits from the CPU
        or      T0_32, %1

        ; SF goes first: %4 is the same register as %6 (only %6 is always full
        ; width), and %6 gets masked below.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T0_32, X86_EFL_SF
 %%sf_clear:

        ; PF comes from the table, indexed by the low result byte.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T1, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T1 + %6]
 %else
        or      T0_8, [NAME(g_afParity) + %6]
 %endif

        ;mov     %1, T0_32                                      ; save the result.
        ; ASSUMES T0 = eax!
%endmacro
511
;;
; Applies fixed clear and set masks to the EFLAGS value in %1:
;       %1 = (%1 & ~%2) | %3
;
; Nothing is emitted when both masks are zero.
;
; @remarks Clobbers T0.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, %1                       ; incoming value
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; always-cleared bits
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; always-set bits
  %endif
        mov     %1, T0_32                       ; hand the result back in %1
 %endif
%endmacro
532
;;
; Applies fixed clear and set masks to the EFLAGS value in %1 and replaces PF
; with the g_afParity entry selected by the low byte of %4.
;
; @remarks Clobbers T0, %4, EFLAGS.  On AMD64 T2 is used for the table address
;          and thus clobbered as well.
; @param 1 The parameter (A0..A3) holding the eflags value.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, %1                       ; incoming value
        and     T0_32, ~(%2 | X86_EFL_PF)       ; drop PF and the always-cleared bits
 %if (%3) != 0
        or      T0_32, %3                       ; always-set bits
 %endif
        and     %4, 0xff                        ; table index = low byte
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     %1, T0_32                       ; hand the result back in %1
%endmacro
557
558
559;;;; OLD EFLAGS macros.
560;;;; OLD EFLAGS macros.
561;;;; OLD EFLAGS macros.
562;;;; OLD EFLAGS macros.
563;;;; OLD EFLAGS macros.
564
;;
; Primes the host EFLAGS from the guest EFLAGS at [%1], but only when there
; are undefined flags (%3) or flags that must be loaded (%4).
;
; When IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS is defined, the modified,
; undefined and status flags are always merged into the host flags using
; pushf/popf instead.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
; @param 4 The flags that must be loaded.
;
%macro IEM_MAYBE_LOAD_FLAGS_OLD 4
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; host flags onto the stack
        mov     T0_32, [%1]                                     ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop what the guest supplies
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only what the guest supplies
        or      [xSP], T0                                       ; merge guest into host
        popf                                                    ; and make it current

 %elif (%3 + %4) != 0
  ;
  ; Register-only sequences; these were measured to be faster than a
  ; pushf/popf merge on an Intel 10980XE.  ASSUMES T0_32 is eax!
  ;
  %if (%3 | %4) == X86_EFL_CF
        bt      dword [%1], X86_EFL_CF_BIT                      ; CF = guest CF
  %elif (%3 | %4) == X86_EFL_OF
        mov     eax, [%1]
        shl     eax, 31 - X86_EFL_OF_BIT                        ; guest OF -> bit 31
        add     eax, 80000000h                                  ; overflows (sets OF) iff bit 31 was set
  %elif ((%3 | %4) & X86_EFL_OF) != 0
        mov     eax, [%1]
        xchg    al, ah                                          ; al = flag bits 8..15, ah = bits 0..7
        shl     al, 15 - X86_EFL_OF_BIT                         ; guest OF -> bit 7 of al
        add     al, 80h                                         ; sets OF iff bit 7 was set
        sahf                                                    ; remaining status flags from ah
  %else
        mov     eax, [%1]
        shl     eax, 8                                          ; low flag byte -> ah (xchg al, ah performs about the same)
        sahf
  %endif
 %endif
%endmacro
622
;;
; Unconditionally primes the host EFLAGS from the guest EFLAGS at [%1].
;
; When IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS is defined, the requested,
; undefined and status flags are merged into the host flags using pushf/popf.
; Otherwise a register-only sequence picked by (%2 | %3) is used.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS_OLD 3
 %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf                                                   ; host flags onto the stack
        mov     T0_32, [%1]                                     ; guest flags
        and     dword [xSP], ~(%2 | %3 | X86_EFL_STATUS_BITS)   ; drop what the guest supplies
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; keep only what the guest supplies
        or      [xSP], T0                                       ; merge guest into host
        popf                                                    ; and make it current

 %else
  ;
  ; Register-only sequences; these were measured to be faster than a
  ; pushf/popf merge on an Intel 10980XE.  ASSUMES T0_32 is eax!
  ;
  %if (%3 | %2) == X86_EFL_CF
        bt      dword [%1], X86_EFL_CF_BIT                      ; CF = guest CF
  %elif (%3 | %2) == X86_EFL_OF
        mov     eax, [%1]
        shl     eax, 31 - X86_EFL_OF_BIT                        ; guest OF -> bit 31
        add     eax, 80000000h                                  ; overflows (sets OF) iff bit 31 was set
  %elif ((%3 | %2) & X86_EFL_OF) != 0
        mov     eax, [%1]
        xchg    al, ah                                          ; al = flag bits 8..15, ah = bits 0..7
        shl     al, 15 - X86_EFL_OF_BIT                         ; guest OF -> bit 7 of al
        add     al, 80h                                         ; sets OF iff bit 7 was set
        sahf                                                    ; remaining status flags from ah
  %else
        mov     eax, [%1]
        shl     eax, 8                                          ; low flag byte -> ah (xchg al, ah performs about the same)
        sahf
  %endif
 %endif
%endmacro
676
;;
; Updates the EFLAGS at [%1] from the host EFLAGS.
;
; The stored value becomes the old value with the modified, undefined and
; zeroed bits cleared, OR'ed with the modified and undefined bits taken from
; the host EFLAGS.  Nothing is emitted when all three masks are zero.
;
; The OF merge uses a macro-local label, so the macro can be expanded any
; number of times within one procedure.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
; @param 4 The mask of flags that are zeroed (and thus doesn't require loading, just clearing)
;
; NOTE(review): the trailing '0' after the parameter count is kept as found;
;               confirm whether it is intentional.
;
%macro IEM_SAVE_FLAGS_OLD 4 0
 %if (%2 | %3 | %4) != 0
        mov     T1_32, [%1]                                     ; flags
  %ifdef IEM_AIMPL_WITH_LOAD_AND_SAVE_ALL_STATUS_FLAGS
        pushf
        pop     T0
        and     T1_32, ~(%2 | %3 | %4 | X86_EFL_STATUS_BITS)    ; clear the modified & undefined & zeroed & status flags.
        and     T0_32, (%2 | %3 | X86_EFL_STATUS_BITS)          ; select the modified, undefined and status flags.
  %else
   %if (%2 | %3 | %4) == X86_EFL_CF
        setc    T0_8
   %elif (%2 | %3) == X86_EFL_OF
        seto    T0_8
        shl     T0_32, X86_EFL_OF_BIT
   %elif (%2 | %3) == X86_EFL_ZF
        setz    T0_8                                            ; On 10980XE this is faster than the next option 5596 vs 5936 ps/call (cmpxchg8b-positive).
        shl     T0_32, X86_EFL_ZF_BIT
   %elif (%2 | %3) <= 0xff
        lahf
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
   %elif 1                                                      ; The locked functions are generally faster on 10980XE with this approach
        lahf                                                    ; while there seems only to be a tiny advantage in most other test.
        movzx   eax, ah                                         ; ASSUMES T0_32 is eax!
        jno     %%of_is_clear
        or      eax, X86_EFL_OF
%%of_is_clear:
   %else
        pushf                                                   ; this is a bit slow
        pop     T0
   %endif
        and     T1_32, ~(%2 | %3 | %4)                          ; clear the modified & undefined & zeroed flags.
        and     T0_32, (%2 | %3)                                ; select the modified and undefined flags.
  %endif
        or      T0_32, T1_32                                    ; combine the flags.
        mov     [%1], T0_32                                     ; save the flags.
 %endif
%endmacro
723
;;
; Rebuilds the EFLAGS at [%1] from the current CPU EFLAGS plus fixed clear/set
; masks:
;       [%1] = ([%1] & ~(%2 | %3)) | (cpu_eflags & %2) | %4
;
; Nothing is emitted when all three masks are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS_OLD 4
 %if (%2 | %3 | %4) != 0
        pushf                                   ; grab the CPU flags first, the ANDs below change them
        pop     T1
        mov     T0_32, [%1]                     ; stored value
        and     T0_32, ~(%2 | %3)               ; drop modified + always-cleared bits
        and     T1_32, (%2)                     ; modified bits from the CPU
        or      T0_32, T1_32
  %if (%4) != 0
        or      T0_32, %4                       ; always-set bits
  %endif
        mov     [%1], T0_32                     ; write the result back
 %endif
%endmacro
747
;;
; Rebuilds the EFLAGS at [%1] from the CPU EFLAGS (mask %2), a clear mask (%3),
; a sign bit taken from %4 (bit %5 - 1) and a parity bit looked up in
; g_afParity using the low byte of %6.
;
; This is used by MUL and IMUL, where the result (%4 & %6) sits in xAX, which
; is also T0.  The flags are therefore assembled in T1.  The CPU EFLAGS are
; fetched through T2 on AMD64, and through T0 (saved and restored around the
; fetch) on 32-bit x86.
;
; @remarks Clobbers T1, stack, %6, EFLAGS; on AMD64 also T2.  T0 is preserved
;          unless %6 refers to it.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
        mov     T1_32, [%1]                                     ; stored value
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)     ; drop modified, always-cleared and the two computed flags
        and     T2_32, (%2)                                     ; modified bits from the CPU
        or      T1_32, T2_32
 %else
        push    T0                                              ; T0 holds the result, keep it
        pushf
        pop     T0
        mov     T1_32, [%1]                                     ; stored value
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF)     ; drop modified, always-cleared and the two computed flags
        and     T0_32, (%2)                                     ; modified bits from the CPU
        or      T1_32, T0_32
        pop     T0
 %endif

        ; SF goes first: %4 most likely refers to the same register as %6,
        ; which gets masked below.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; PF comes from the table, indexed by the low result byte.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                                     ; write the result back
%endmacro
801
;;
; Applies fixed clear and set masks to the EFLAGS at [%1]:
;       [%1] = ([%1] & ~%2) | %3
;
; Nothing is emitted when both masks are zero.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS_OLD 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; stored value
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; always-cleared bits
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; always-set bits
  %endif
        mov     [%1], T0_32                     ; write the result back
 %endif
%endmacro
822
;;
; Applies fixed clear and set masks to the EFLAGS at [%1] and replaces PF with
; the g_afParity entry selected by the low byte of %4.
;
; @remarks Clobbers T0, %4, EFLAGS.  On AMD64 T2 is used for the table address
;          and thus clobbered as well.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY_OLD 4
        mov     T0_32, [%1]                     ; stored value
        and     T0_32, ~(%2 | X86_EFL_PF)       ; drop PF and the always-cleared bits
 %if (%3) != 0
        or      T0_32, %3                       ; always-set bits
 %endif
        and     %4, 0xff                        ; table index = low byte
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; write the result back
%endmacro
847
848
849
;;
; Computes the address of one entry in a 256-entry imm8 jump table -- used by
; all the instruction implementations that lay out one code stub per immediate
; byte value.  Also emits two assembly-time checks on the table layout.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
; @note  Implicitly uses local symbols .imm0, .imm1, and .immEnd
;        (implementation artifacts of each instruction jump table).
; @remarks Clobbers T0 for every entry size except 8.
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]`, built
; from LEA scale factors for the supported entry sizes 5 through 12.
;
%macro IEMIMPL_JUMP_TABLE_TARGET_INT 3
        lea     %1, [.imm0 xWrtRIP]
 %if %3 == 5
        lea     T0, [%2 + %2*4]                 ; *5
        lea     %1, [%1 + T0]                   ; *5 + .imm0
 %elif %3 == 6
        lea     T0, [%2 + %2*2]                 ; *3
        lea     %1, [%1 + T0*2]                 ; *6 + .imm0
 %elif %3 == 7
        lea     T0, [%2 + %2*2]                 ; *3
        lea     T0, [T0 + %2*4]                 ; *7
        lea     %1, [%1 + T0]                   ; *7 + .imm0
 %elif %3 == 8
        lea     %1, [%1 + %2*8]                 ; *8 + .imm0
 %elif %3 == 9
        lea     T0, [%2 + %2*8]                 ; *9
        lea     %1, [%1 + T0]                   ; *9 + .imm0
 %elif %3 == 10
        lea     T0, [%2 + %2*4]                 ; *5
        lea     %1, [%1 + T0*2]                 ; *10 + .imm0
 %elif %3 == 11
        lea     T0, [%2 + %2*4]                 ; *5
        lea     T0, [%2 + T0*2]                 ; *11
        lea     %1, [%1 + T0]                   ; *11 + .imm0
 %elif %3 == 12
        lea     T0, [%2 + %2*2]                 ; *3
        lea     %1, [%1 + T0*4]                 ; *12 + .imm0
 %else
  %error Unexpected instruction byte count in IEMIMPL_JUMP_TABLE_TARGET_INT
 %endif
        ; Entry size check: the repeat count is zero when the .imm0 -> .imm1
        ; distance is a multiple of %3; otherwise the oversized byte value
        ; triggers 'warning: value does not fit in 8 bit field'.
        times (.imm1 - .imm0 + %3) %% %3 db 999 * (.imm1 - .imm0 + %3)
        ; Table size check: the repeat count is zero when .imm0 -> .immEnd
        ; spans exactly 256 entries of %3 bytes; same warning if not.
        times ((.immEnd - .imm0) - 256 * %3) db 999 * ((.immEnd - .imm0) - 256 * %3)
%endmacro
900
;;
; Front end for IEMIMPL_JUMP_TABLE_TARGET_INT.  When
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK is defined, the entry size is
; increased by 4 bytes; otherwise it is passed through unchanged.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
%macro IEMIMPL_JUMP_TABLE_TARGET 3
 %ifndef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, %3
 %else
        IEMIMPL_JUMP_TABLE_TARGET_INT %1, %2, (%3 + 4)
 %endif
%endmacro
908
909
;;
; Calls the imm8 jump table entry selected by %2 -- used by all the
; instruction implementations that lay out one code stub per immediate byte
; value.
;
; @param 1 The register to receive the jump target address (T1).
; @param 2 The register containing the imm8 index (A1 / A2 / A3).
; @param 3 Byte size of one instruction + ret (+ ?int3) in the table
;
; Emits the equivalent (in actual code) of `lea %1, [.imm0 + %2 * %3]` +
; `IBT_NOTRACK, call %1`.
;
%macro IEMIMPL_CALL_JUMP_TABLE_TARGET 3
        IEMIMPL_JUMP_TABLE_TARGET %1, %2, %3    ; %1 = address of the selected entry
        IBT_NOTRACK
        call    %1
%endmacro
926
927
928;*********************************************************************************************************************************
929;* External Symbols *
930;*********************************************************************************************************************************
931extern NAME(g_afParity)
932
933
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take the incoming EFLAGS value in A0 (32-bit), a pointer to
; the destination memory operand in A1 and the source register operand in A2.
; The updated EFLAGS are returned in EAX (see IEM_SAVE_FLAGS_RETVAL).
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
; @param 5 The flags that must be loaded (ADC, SBB).
; @param 6 The flags that will be zeroed by the operation.
;
%macro IEMIMPL_BIN_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        %1      qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 byte [A1], A2_8
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 word [A1], A2_16
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 dword [A1], A2_32
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A0_32, %3, %4, %5
        lock %1 qword [A1], A2
        IEM_SAVE_FLAGS_RETVAL   A0_32, %3, %4, %6
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr,lock, modified-flags,                                                                  undefined flags, must be loaded, zeroed flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, X86_EFL_CF, 0
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0, 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP and,  1, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
IEMIMPL_BIN_OP test, 0, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF), X86_EFL_AF, 0, X86_EFL_OF | X86_EFL_CF
1035
1036
1037;;
1038; Macro for implementing a binary operator, VEX variant with separate input/output.
1039;
1040; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
1041; where the 64-bit accesses require hand coding.
1042;
1043; All the functions take a pointer to the destination memory operand in A0,
1044; the first source register operand in A1, the second source register operand
1045; in A2 and a pointer to eflags in A3.
1046;
1047; @param 1 The instruction mnemonic.
1048; @param 2 The modified flags.
1049; @param 3 The undefined flags.
1050; @param 4 The zeroed flags.
1051;
1052%macro IEMIMPL_VEX_BIN_OP 4
1053BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1054 PROLOGUE_4_ARGS
1055 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0 ;; @todo do we need to load undefined flags for any platform?
1056 %1 T0_32, A1_32, A2_32 ; three-operand form: result goes to the temporary, sources stay untouched
1057 mov [A0], T0_32 ; write the result through the destination pointer
1058 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1059 EPILOGUE_4_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u32
1061
1062 %ifdef RT_ARCH_AMD64
1063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1064 PROLOGUE_4_ARGS
1065 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, 0
1066 %1 T0, A1, A2 ; same as the 32-bit variant, using the full-width registers
1067 mov [A0], T0
1068 IEM_SAVE_FLAGS_OLD A3, %2, %3, %4
1069 EPILOGUE_4_ARGS
1070ENDPROC iemAImpl_ %+ %1 %+ _u64
1071 %endif ; RT_ARCH_AMD64
1072%endmacro
1073
1074; instr, modified-flags, undefined-flags, zeroed-flags
1075IEMIMPL_VEX_BIN_OP andn, X86_EFL_SF | X86_EFL_ZF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
1076IEMIMPL_VEX_BIN_OP bextr, X86_EFL_ZF, X86_EFL_SF | X86_EFL_AF | X86_EFL_PF, X86_EFL_OF | X86_EFL_CF
1077IEMIMPL_VEX_BIN_OP bzhi, X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF, X86_EFL_AF | X86_EFL_PF, X86_EFL_OF
1078
1079;;
1080; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
1081;
1082; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
1083; where the 64-bit accesses require hand coding.
1084;
1085; All the functions take a pointer to the destination memory operand in A1,
1086; the source register operand in A2 and incoming EFLAGS in A0. Updated EFLAGS
1087; are returned in EAX.
1088;
1089; @param 1 The instruction mnemonic.
1090; @param 2 The modified flags.
1091; @param 3 The undefined flags.
1092; @param 4 The zeroed flags.
1093;
1094%macro IEMIMPL_VEX_BIN_OP_2 4
1095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1096 PROLOGUE_3_ARGS ; three arguments (A0..A2, 12 bytes) like the other 3-argument helpers in this file
1097 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0 ;; @todo check if any undefined flags are passed thru
1098 mov T0_32, [A1] ; fetch the current destination value
1099 %1 T0_32, A2_32
1100 mov [A1], T0_32 ; write the result back through the destination pointer
1101 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1102 EPILOGUE_3_ARGS ; must pair with the 3-argument prologue above
1103ENDPROC iemAImpl_ %+ %1 %+ _u32
1104
1105 %ifdef RT_ARCH_AMD64
1106BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1107 PROLOGUE_3_ARGS
1108 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1109 mov T0, [A1]
1110 %1 T0, A2
1111 mov [A1], T0
1112 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1113 EPILOGUE_3_ARGS
1114ENDPROC iemAImpl_ %+ %1 %+ _u64
1115 %endif ; RT_ARCH_AMD64
1116%endmacro
1117
1118; instr, modified-flags, undefined-flags zeroed-flags
1119IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1120IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1121IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF), X86_EFL_OF
1122
1123
1124;;
1125; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
1126;
1127; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
1128; where the 64-bit accesses require hand coding.
1129;
1130; All the functions take a pointer to the destination memory operand in A0,
1131; the first source register operand in A1 and the second source register
1132; operand (the shift count in the fallbacks) in A2. No EFLAGS are passed or returned.
1133;
1134; @param 1 The instruction mnemonic.
1135; @param 2 Fallback instruction if applicable.
1136; @param 3 Whether to emit fallback or not.
1137;
1138%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
1139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1140 PROLOGUE_3_ARGS
1141 %1 T0_32, A1_32, A2_32
1142 mov [A0], T0_32
1143 EPILOGUE_3_ARGS
1144ENDPROC iemAImpl_ %+ %1 %+ _u32
1145
1146 %if %3
1147BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
1148 PROLOGUE_3_ARGS
1149 %ifdef ASM_CALL64_GCC
1150 mov cl, A2_8 ; the legacy shift needs its count in cl
1151 %2 A1_32, cl
1152 mov [A0], A1_32
1153 %else
1154 xchg A2, A0 ; count ends up in cl, the destination pointer moves to A2
1155 %2 A1_32, cl
1156 mov [A2], A1_32
1157 %endif
1158 EPILOGUE_3_ARGS
1159ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
1160 %endif
1161
1162 %ifdef RT_ARCH_AMD64
1163BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1164 PROLOGUE_3_ARGS
1165 %1 T0, A1, A2
1166 mov [A0], T0
1167 EPILOGUE_3_ARGS
1168ENDPROC iemAImpl_ %+ %1 %+ _u64
1169
1170 %if %3
1171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
1172 PROLOGUE_3_ARGS
1173 %ifdef ASM_CALL64_GCC
1174 mov cl, A2_8
1175 %2 A1, cl
1176 mov [A0], A1 ; store the full 64-bit result
1177 %else
1178 xchg A2, A0 ; count ends up in cl, the destination pointer moves to A2
1179 %2 A1, cl
1180 mov [A2], A1 ; full 64-bit result; A0 holds the count here and must not be dereferenced
1181 %endif
1183 EPILOGUE_3_ARGS
1184ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
1185 %endif
1186 %endif ; RT_ARCH_AMD64
1187%endmacro
1188
1189; instr, fallback instr, emit fallback
1190IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
1191IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
1192IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
1193IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
1194IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
1195
1196
1197;
1198; RORX uses an immediate byte for the shift count, so we only do a
1199; fallback implementation of that one.
1200;
; A0 = pointer to the destination, A1 = value to rotate, A2 = rotate count.
1201BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
1202 PROLOGUE_3_ARGS
1203 %ifdef ASM_CALL64_GCC
1204 mov cl, A2_8 ; ROR takes a variable count in cl
1205 ror A1_32, cl
1206 mov [A0], A1_32
1207 %else
1208 xchg A2, A0 ; count ends up in cl, the destination pointer moves to A2
1209 ror A1_32, cl
1210 mov [A2], A1_32
1211 %endif
1212 EPILOGUE_3_ARGS
1213ENDPROC iemAImpl_rorx_u32
1214
1215 %ifdef RT_ARCH_AMD64
1216BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
1217 PROLOGUE_3_ARGS
1218 %ifdef ASM_CALL64_GCC
1219 mov cl, A2_8 ; ROR takes a variable count in cl
1220 ror A1, cl
1221 mov [A0], A1 ; store the full 64-bit result
1222 %else
1223 xchg A2, A0 ; count ends up in cl, the destination pointer moves to A2
1224 ror A1, cl
1225 mov [A2], A1 ; store the full 64-bit result via the swapped pointer
1226 %endif
1227 EPILOGUE_3_ARGS
1228ENDPROC iemAImpl_rorx_u64
1229 %endif ; RT_ARCH_AMD64
1230
1231
1232;
1233; MULX
1234;
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = first source (must end up in EDX for MULX), A3 = second source.
1235BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
1236 PROLOGUE_4_ARGS
1237%ifdef ASM_CALL64_GCC
1238 ; A2_32 is EDX - perfect
1239 mulx T0_32, T1_32, A3_32
1240 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
1241 mov [A0], T0_32
1242%else
1243 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
1244 xchg A1, A2
1245 mulx T0_32, T1_32, A3_32
1246 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
1247 mov [A0], T0_32
1248%endif
1249 EPILOGUE_4_ARGS
1250ENDPROC iemAImpl_mulx_u32
1251
1252
; MULX fallback using the legacy MUL instruction (EDX:EAX = EAX * operand).
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = first source, A3 = second source.
1253BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
1254 PROLOGUE_4_ARGS
1255%ifdef ASM_CALL64_GCC
1256 ; A2_32 is EDX, T0_32 is EAX
1257 mov eax, A3_32
1258 mul A2_32
1259 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
1260 mov [A0], edx
1261%else
1262 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
1263 xchg A1, A2
1264 mov eax, A3_32
1265 mul A1_32 ; uSrc1 is in A1 (EDX) after the xchg; A2 now holds the low-part pointer
1266 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
1267 mov [A0], edx
1268%endif
1269 EPILOGUE_4_ARGS
1270ENDPROC iemAImpl_mulx_u32_fallback
1271
1272%ifdef RT_ARCH_AMD64
1273BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
1274 PROLOGUE_4_ARGS
1275%ifdef ASM_CALL64_GCC
1276 ; A2 is RDX - perfect
1277 mulx T0, T1, A3
1278 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
1279 mov [A0], T0
1280%else
1281 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
1282 xchg A1, A2
1283 mulx T0, T1, A3
1284 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
1285 mov [A0], T0
1286%endif
1287 EPILOGUE_4_ARGS
1288ENDPROC iemAImpl_mulx_u64
1289
1290
; 64-bit MULX fallback using the legacy MUL instruction (RDX:RAX = RAX * operand).
; A0 = pointer receiving the high half, A1 = pointer receiving the low half,
; A2 = first source, A3 = second source.
1291BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
1292 PROLOGUE_4_ARGS
1293%ifdef ASM_CALL64_GCC
1294 ; A2 is RDX, T0 is RAX
1295 mov rax, A3
1296 mul A2
1297 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
1298 mov [A0], rdx
1299%else
1300 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
1301 xchg A1, A2
1302 mov rax, A3
1303 mul A1 ; uSrc1 is in A1 (RDX) after the xchg; A2 now holds the low-part pointer
1304 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
1305 mov [A0], rdx
1306%endif
1307 EPILOGUE_4_ARGS
1308ENDPROC iemAImpl_mulx_u64_fallback
1309
1310%endif
1311
1312
1313;;
1314; Macro for implementing a bit operator.
1315;
1316; This will generate code for the 16, 32 and 64 bit accesses with locked
1317; variants, except on 32-bit systems where the 64-bit accesses require hand
1318; coding.
1319;
1320; All the functions take a pointer to the destination memory operand in A1,
1321; the source register operand in A2 and incoming eflags in A0.
1322;
1323; @param 1 The instruction mnemonic.
1324; @param 2 Non-zero if there should be a locked version.
1325; @param 3 The modified flags.
1326; @param 4 The undefined flags.
1327;
1328%macro IEMIMPL_BIT_OP 4
1329BEGINCODE
1330BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1331 PROLOGUE_3_ARGS
1332 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1333 %1 word [A1], A2_16 ; operate directly on the memory operand
1334 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1335 EPILOGUE_3_ARGS
1336ENDPROC iemAImpl_ %+ %1 %+ _u16
1337
1338BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1339 PROLOGUE_3_ARGS
1340 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1341 %1 dword [A1], A2_32
1342 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1343 EPILOGUE_3_ARGS
1344ENDPROC iemAImpl_ %+ %1 %+ _u32
1345
1346 %ifdef RT_ARCH_AMD64
1347BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1348 PROLOGUE_3_ARGS
1349 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1350 %1 qword [A1], A2
1351 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1352 EPILOGUE_3_ARGS_EX 8
1353ENDPROC iemAImpl_ %+ %1 %+ _u64
1354 %endif ; RT_ARCH_AMD64
1355
1356 %if %2 != 0 ; locked versions requested?
1357
1358BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
1359 PROLOGUE_3_ARGS
1360 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1361 lock %1 word [A1], A2_16 ; identical to the unlocked variant apart from the lock prefix
1362 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1363 EPILOGUE_3_ARGS
1364ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1365
1366BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
1367 PROLOGUE_3_ARGS
1368 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1369 lock %1 dword [A1], A2_32
1370 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1371 EPILOGUE_3_ARGS
1372ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1373
1374 %ifdef RT_ARCH_AMD64
1375BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
1376 PROLOGUE_3_ARGS
1377 IEM_MAYBE_LOAD_FLAGS A0_32, %3, %4, 0
1378 lock %1 qword [A1], A2
1379 IEM_SAVE_FLAGS_RETVAL A0_32, %3, %4, 0
1380 EPILOGUE_3_ARGS_EX 8
1381ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1382 %endif ; RT_ARCH_AMD64
1383 %endif ; locked
1384%endmacro
1385
1386; Undefined flags are passed thru here by the intel and amd CPUs we have.
1387; modified efl, undefined eflags
; bt only reads its operand, so it is instantiated without locked variants.
1388IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1389IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1390IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1391IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), 0 ;passed-thru (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1392
1393;;
1394; Macro for implementing a bit search operator.
1395;
1396; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1397; systems where the 64-bit accesses require hand coding.
1398;
1399; All the functions take a pointer to the destination memory operand in A1,
1400; the source register operand in A2 and the incoming eflags in A0.
1401;
1402; In the ZF case the destination register is 'undefined', however it seems that
1403; both AMD and Intel just leave it as is. The undefined EFLAGS differ between
1404; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
1405; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
1406; the behaviour of more recent CPUs (Intel 10980XE and AMD 3990X).
1407;
1408; Intel: Clear all and calculate PF in addition to ZF.
1409; AMD: Passthru all flags other than ZF.
1410;
1411; The lines prefixed with ';bad;' are disabled vendor-specific variants that
1412; are not assembled.
1413;
1414; @param 1 The instruction mnemonic.
1415; @param 2 The modified flags.
1416; @param 3 The undefined flags.
1417; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
1418;
1416%macro IEMIMPL_BIT_OP2 4
1417BEGINCODE
1418; 16-bit
1419
1420BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1421 PROLOGUE_3_ARGS
1422 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1423 %1 T0_16, A2_16
1424%if %4 != 0
1425 jz .unchanged_dst ; ZF set: skip the store and leave the destination as is
1426%endif
1427 mov [A1], T0_16
1428.unchanged_dst:
1429 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1430 EPILOGUE_3_ARGS
1431ENDPROC iemAImpl_ %+ %1 %+ _u16
1432
1433;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
1434;bad; PROLOGUE_3_ARGS
1435;bad; %1 T1_16, A1_16
1436;bad; jz .unchanged_dst
1437;bad; mov [A0], T1_16
1438;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1439;bad; EPILOGUE_3_ARGS
1440;bad;.unchanged_dst:
1441;bad;%if %4 != 0
1442;bad; mov [A0], T1_16
1443;bad;%endif
1444;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1445;bad; EPILOGUE_3_ARGS
1446;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
1447;bad;
1448;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
1449;bad; PROLOGUE_3_ARGS
1450;bad; %1 T0_16, A1_16
1451;bad;%if %4 != 0
1452;bad; jz .unchanged_dst
1453;bad;%endif
1454;bad; mov [A0], T0_16
1455;bad;.unchanged_dst:
1456;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1457;bad; EPILOGUE_3_ARGS
1458;bad;ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1459
1460; 32-bit
1461
1462BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1463 PROLOGUE_3_ARGS
1464 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1465 %1 T0_32, A2_32
1466%if %4 != 0
1467 jz .unchanged_dst ; ZF set: skip the store and leave the destination as is
1468%endif
1469 mov [A1], T0_32
1470.unchanged_dst:
1471 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1472 EPILOGUE_3_ARGS
1473ENDPROC iemAImpl_ %+ %1 %+ _u32
1474
1475;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1476;bad; PROLOGUE_3_ARGS
1477;bad; %1 T1_32, A1_32
1478;bad;%if %4 != 0
1479;bad; jz .unchanged_dst
1480;bad;%endif
1481;bad; mov [A0], T1_32
1482;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1483;bad; EPILOGUE_3_ARGS
1484;bad;.unchanged_dst:
1485;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1486;bad; EPILOGUE_3_ARGS
1487;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1488;bad;
1489;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1490;bad; PROLOGUE_3_ARGS
1491;bad; %1 T0_32, A1_32
1492;bad;%if %4 != 0
1493;bad; jz .unchanged_dst
1494;bad;%endif
1495;bad; mov [A0], T0_32
1496;bad;.unchanged_dst:
1497;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1498;bad; EPILOGUE_3_ARGS
1499;bad;ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1500
1501
1502 %ifdef RT_ARCH_AMD64
1503; 64-bit
1504
1505BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1506 PROLOGUE_3_ARGS
1507 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %3 ; Must load undefined flags since AMD passes them thru
1508 %1 T0, A2
1509%if %4 != 0
1510 jz .unchanged_dst ; ZF set: skip the store and leave the destination as is
1511%endif
1512 mov [A1], T0
1513.unchanged_dst:
1514 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
1515 EPILOGUE_3_ARGS_EX 8
1516ENDPROC iemAImpl_ %+ %1 %+ _u64
1517
1518;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1519;bad; PROLOGUE_3_ARGS
1520;bad; %1 T1, A1
1521;bad;%if %4 != 0
1522;bad; jz .unchanged_dst
1523;bad;%endif
1524;bad; mov [A0], T1
1525;bad; IEM_ADJUST_FLAGS_WITH_PARITY_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1526;bad; EPILOGUE_3_ARGS
1527;bad;.unchanged_dst:
1528;bad; IEM_ADJUST_FLAGS_OLD A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1529;bad; EPILOGUE_3_ARGS
1530;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1531;bad;
1532;bad;BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1533;bad; PROLOGUE_3_ARGS
1534;bad; %1 T0, A1
1535;bad;%if %4 != 0
1536;bad; jz .unchanged_dst
1537;bad;%endif
1538;bad; mov [A0], T0
1539;bad;.unchanged_dst:
1540;bad; IEM_SAVE_AND_ADJUST_FLAGS_OLD A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1541;bad; EPILOGUE_3_ARGS_EX 8
1542;bad;ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1543
1544 %endif ; RT_ARCH_AMD64
1545%endmacro
1546
; bsf/bsr keep the destination when ZF=1 (last argument 1); tzcnt/lzcnt always write it.
1547IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1548IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1549IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1550IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1551
1552
1553;;
1554; Macro for implementing POPCNT.
1555;
1556; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1557; systems where the 64-bit accesses require hand coding.
1558;
1559; All the functions take a pointer to the destination memory operand in A1,
1560; the source register operand in A2 and eflags in A0.
1561;
1562; ASSUMES Intel and AMD set EFLAGS the same way.
1563;
1564; ASSUMES the instruction does not support memory destination.
1565;
1566; @param 1 The instruction mnemonic.
1567; @param 2 The modified flags.
1568; @param 3 The undefined flags.
1569; @param 4 The zeroed flags.
1570;
1571%macro IEMIMPL_BIT_OP3 4
1572BEGINCODE
1573BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1574 PROLOGUE_3_ARGS
1575 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1576 %1 T0_16, A2_16 ; compute into a temporary register ...
1577 mov [A1], T0_16 ; ... then store through the destination pointer
1578 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1579 EPILOGUE_3_ARGS
1580ENDPROC iemAImpl_ %+ %1 %+ _u16
1581
1582BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1583 PROLOGUE_3_ARGS
1584 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1585 %1 T0_32, A2_32
1586 mov [A1], T0_32
1587 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1588 EPILOGUE_3_ARGS
1589ENDPROC iemAImpl_ %+ %1 %+ _u32
1590
1591 %ifdef RT_ARCH_AMD64
1592BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1593 PROLOGUE_3_ARGS
1594 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, 0
1595 %1 T0, A2
1596 mov [A1], T0
1597 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, %4
1598 EPILOGUE_3_ARGS_EX 8
1599ENDPROC iemAImpl_ %+ %1 %+ _u64
1600 %endif ; RT_ARCH_AMD64
1601%endmacro
; popcnt: only ZF is computed; CF, OF, SF, AF and PF are passed as zeroed flags.
1602IEMIMPL_BIT_OP3 popcnt, X86_EFL_ZF, 0, X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF
1603
1604
1605;
1606; IMUL is also a similar but yet different case (no lock, no mem dst).
1607; The rDX:rAX variant of imul is handled together with mul further down.
1608;
; As used below: A0 = incoming eflags, A1 = pointer to the destination operand
; (read as the multiplicand and overwritten with the product), A2 = source value.
1609BEGINCODE
1610; @param 1 EFLAGS that are modified.
1611; @param 2 Undefined EFLAGS.
1612; @param 3 Function suffix.
1613; @param 4 EFLAGS variation: 0 for native, 1 for intel,
1614; 2 for AMD (set AF, clear PF, ZF and SF).
; NOTE(review): the code below only special-cases variation 1 (intel);
; variations 0 and 2 share the plain IEM_SAVE_FLAGS_RETVAL path.
1615%macro IEMIMPL_IMUL_TWO 4
1616BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1617 PROLOGUE_3_ARGS
1618 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1619 imul A2_16, word [A1] ; product is formed in the source register ...
1620 mov [A1], A2_16 ; ... and then written back to the destination
1621 %if %4 != 1
1622 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1623 %else
1624 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_16, 16, A2 ; intel
1625 %endif
1626 EPILOGUE_3_ARGS
1627ENDPROC iemAImpl_imul_two_u16 %+ %3
1628
1629BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1630 PROLOGUE_3_ARGS
1631 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1632 imul A2_32, dword [A1]
1633 mov [A1], A2_32
1634 %if %4 != 1
1635 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1636 %else
1637 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2_32, 32, A2 ; intel
1638 %endif
1639 EPILOGUE_3_ARGS
1640ENDPROC iemAImpl_imul_two_u32 %+ %3
1641
1642 %ifdef RT_ARCH_AMD64
1643BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1644 PROLOGUE_3_ARGS
1645 IEM_MAYBE_LOAD_FLAGS A0_32, %1, %2, %2 ; Undefined flags may be passed thru (AMD)
1646 imul A2, qword [A1]
1647 mov [A1], A2
1648 %if %4 != 1
1649 IEM_SAVE_FLAGS_RETVAL A0_32, %1, %2, 0
1650 %else
1651 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_RETVAL A0_32, %1, X86_EFL_AF | X86_EFL_ZF, A2, 64, A2 ; intel
1652 %endif
1653 EPILOGUE_3_ARGS_EX 8
1654ENDPROC iemAImpl_imul_two_u64 %+ %3
1655 %endif ; RT_ARCH_AMD64
1656%endmacro
1657; The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
1658; flags as is. Whereas Intel skylake (6700K and 10980XE (Cascade Lake)) always
1659; clear AF and ZF and calculates SF and PF as per the lower half of the result.
1660IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1661IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1662IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1663
1664
1665;
1666; XCHG for memory operands. This implies locking. No flag changes.
1667;
1668; Each function takes two arguments, first the pointer to the memory,
1669; then the pointer to the register. They all return void.
1670;
1671BEGINCODE
1672BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1673 PROLOGUE_2_ARGS
1674 mov T0_8, [A1] ; fetch the register value
1675 xchg [A0], T0_8 ; swap it with the memory operand (no explicit lock prefix needed)
1676 mov [A1], T0_8 ; hand the old memory value back via the register pointer
1677 EPILOGUE_2_ARGS
1678ENDPROC iemAImpl_xchg_u8_locked
1679
1680BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1681 PROLOGUE_2_ARGS
1682 mov T0_16, [A1]
1683 xchg [A0], T0_16
1684 mov [A1], T0_16
1685 EPILOGUE_2_ARGS
1686ENDPROC iemAImpl_xchg_u16_locked
1687
1688BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1689 PROLOGUE_2_ARGS
1690 mov T0_32, [A1]
1691 xchg [A0], T0_32
1692 mov [A1], T0_32
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_xchg_u32_locked
1695
1696%ifdef RT_ARCH_AMD64
1697BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1698 PROLOGUE_2_ARGS
1699 mov T0, [A1]
1700 xchg [A0], T0
1701 mov [A1], T0
1702 EPILOGUE_2_ARGS
1703ENDPROC iemAImpl_xchg_u64_locked
1704%endif
1705
1706; Unlocked variants for fDisregardLock mode.
; These swap the two values with plain loads and stores instead of the XCHG
; instruction. A0 = pointer to the memory, A1 = pointer to the register.
1707
1708BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1709 PROLOGUE_2_ARGS
1710 mov T0_8, [A1] ; T0 = register value
1711 mov T1_8, [A0] ; T1 = memory value
1712 mov [A0], T0_8 ; memory <- register value
1713 mov [A1], T1_8 ; register <- old memory value
1714 EPILOGUE_2_ARGS
1715ENDPROC iemAImpl_xchg_u8_unlocked
1716
1717BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1718 PROLOGUE_2_ARGS
1719 mov T0_16, [A1]
1720 mov T1_16, [A0]
1721 mov [A0], T0_16
1722 mov [A1], T1_16
1723 EPILOGUE_2_ARGS
1724ENDPROC iemAImpl_xchg_u16_unlocked
1725
1726BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1727 PROLOGUE_2_ARGS
1728 mov T0_32, [A1]
1729 mov T1_32, [A0]
1730 mov [A0], T0_32
1731 mov [A1], T1_32
1732 EPILOGUE_2_ARGS
1733ENDPROC iemAImpl_xchg_u32_unlocked
1734
1735%ifdef RT_ARCH_AMD64
1736BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1737 PROLOGUE_2_ARGS
1738 mov T0, [A1]
1739 mov T1, [A0]
1740 mov [A0], T0
1741 mov [A1], T1
1742 EPILOGUE_2_ARGS
1743ENDPROC iemAImpl_xchg_u64_unlocked
1744%endif
1745
1746
1747;
1748; XADD for memory operands.
1749;
1750; Each function takes three arguments, first the pointer to the
1751; memory/register, then the pointer to the register, and finally a pointer to
1752; eflags. They all return void.
1753;
1754BEGINCODE
1755BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1756 PROLOGUE_3_ARGS
1757 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1758 mov T0_8, [A1] ; fetch the register operand
1759 xadd [A0], T0_8 ; destination += T0, T0 receives the old destination value
1760 mov [A1], T0_8 ; return the old destination value via the register pointer
1761 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1762 EPILOGUE_3_ARGS
1763ENDPROC iemAImpl_xadd_u8
1764
1765BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1766 PROLOGUE_3_ARGS
1767 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1768 mov T0_16, [A1]
1769 xadd [A0], T0_16
1770 mov [A1], T0_16
1771 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1772 EPILOGUE_3_ARGS
1773ENDPROC iemAImpl_xadd_u16
1774
1775BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1776 PROLOGUE_3_ARGS
1777 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1778 mov T0_32, [A1]
1779 xadd [A0], T0_32
1780 mov [A1], T0_32
1781 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1782 EPILOGUE_3_ARGS
1783ENDPROC iemAImpl_xadd_u32
1784
1785%ifdef RT_ARCH_AMD64
1786BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1787 PROLOGUE_3_ARGS
1788 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1789 mov T0, [A1]
1790 xadd [A0], T0
1791 mov [A1], T0
1792 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1793 EPILOGUE_3_ARGS
1794ENDPROC iemAImpl_xadd_u64
1795%endif ; RT_ARCH_AMD64
1796
; Locked XADD variants: same argument layout and instruction sequence as the
; unlocked iemAImpl_xadd_uNN functions, with a lock prefix on the xadd.
1797BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1798 PROLOGUE_3_ARGS
1799 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1800 mov T0_8, [A1] ; fetch the register operand
1801 lock xadd [A0], T0_8 ; locked: destination += T0, T0 receives the old destination value
1802 mov [A1], T0_8 ; return the old destination value via the register pointer
1803 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1804 EPILOGUE_3_ARGS
1805ENDPROC iemAImpl_xadd_u8_locked
1806
1807BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1808 PROLOGUE_3_ARGS
1809 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1810 mov T0_16, [A1]
1811 lock xadd [A0], T0_16
1812 mov [A1], T0_16
1813 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1814 EPILOGUE_3_ARGS
1815ENDPROC iemAImpl_xadd_u16_locked
1816
1817BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1818 PROLOGUE_3_ARGS
1819 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1820 mov T0_32, [A1]
1821 lock xadd [A0], T0_32
1822 mov [A1], T0_32
1823 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1824 EPILOGUE_3_ARGS
1825ENDPROC iemAImpl_xadd_u32_locked
1826
1827%ifdef RT_ARCH_AMD64
1828BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1829 PROLOGUE_3_ARGS
1830 IEM_MAYBE_LOAD_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1831 mov T0, [A1]
1832 lock xadd [A0], T0
1833 mov [A1], T0
1834 IEM_SAVE_FLAGS_OLD A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1835 EPILOGUE_3_ARGS
1836ENDPROC iemAImpl_xadd_u64_locked
1837%endif ; RT_ARCH_AMD64
1838
1839
1840;
1841; CMPXCHG8B.
1842;
1843; These are tricky register wise, so the code is duplicated for each calling
1844; convention.
1845;
1846; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1847;
1848; C-proto:
1849; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1850; uint32_t *pEFlags));
1851;
1852; Note! Identical to iemAImpl_cmpxchg16b.
1853;
; In every variant the EBX:ECX pair is loaded before the flags macro and the
; EDX:EAX pair after it, since the flags macro clobbers T0 (eax).
1854BEGINCODE
1855BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1856%ifdef RT_ARCH_AMD64
1857 %ifdef ASM_CALL64_MSC
1858 push rbx
1859
1860 mov r11, rdx ; pu64EaxEdx (is also T1)
1861 mov r10, rcx ; pu64Dst
1862
1863 mov ebx, [r8]
1864 mov ecx, [r8 + 4]
1865 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1866 mov eax, [r11]
1867 mov edx, [r11 + 4]
1868
1869 cmpxchg8b [r10]
1870
1871 mov [r11], eax
1872 mov [r11 + 4], edx
1873 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1874
1875 pop rbx
1876 ret
1877 %else
1878 push rbx
1879
1880 mov r10, rcx ; pEFlags
1881 mov r11, rdx ; pu64EbxEcx (is also T1)
1882
1883 mov ebx, [r11]
1884 mov ecx, [r11 + 4]
1885 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1886 mov eax, [rsi]
1887 mov edx, [rsi + 4]
1888
1889 cmpxchg8b [rdi]
1890
1891 mov [rsi], eax
1892 mov [rsi + 4], edx
1893 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1894
1895 pop rbx
1896 ret
1897
1898 %endif
1899%else
1900 push esi
1901 push edi
1902 push ebx
1903 push ebp
1904
1905 mov edi, ecx ; pu64Dst
1906 mov esi, edx ; pu64EaxEdx
; Stack arguments: 16 bytes for the four pushes above plus 4 for the return address.
1907 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1908 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1909
1910 mov ebx, [ecx]
1911 mov ecx, [ecx + 4]
1912 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1913 mov eax, [esi]
1914 mov edx, [esi + 4]
1915
1916 cmpxchg8b [edi]
1917
1918 mov [esi], eax
1919 mov [esi + 4], edx
1920 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1921
1922 pop ebp
1923 pop ebx
1924 pop edi
1925 pop esi
1926 ret 8 ; removes the two stack arguments (pu64EbxEcx, pEFlags)
1927%endif
1928ENDPROC iemAImpl_cmpxchg8b
1929
; Locked variant of iemAImpl_cmpxchg8b: same argument handling and register
; shuffling per calling convention, with a lock prefix on the cmpxchg8b.
1930BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1931%ifdef RT_ARCH_AMD64
1932 %ifdef ASM_CALL64_MSC
1933 push rbx
1934
1935 mov r11, rdx ; pu64EaxEdx (is also T1)
1936 mov r10, rcx ; pu64Dst
1937
1938 mov ebx, [r8]
1939 mov ecx, [r8 + 4]
1940 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1941 mov eax, [r11]
1942 mov edx, [r11 + 4]
1943
1944 lock cmpxchg8b [r10]
1945
1946 mov [r11], eax
1947 mov [r11 + 4], edx
1948 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1949
1950 pop rbx
1951 ret
1952 %else
1953 push rbx
1954
1955 mov r10, rcx ; pEFlags
1956 mov r11, rdx ; pu64EbxEcx (is also T1)
1957
1958 mov ebx, [r11]
1959 mov ecx, [r11 + 4]
1960 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1961 mov eax, [rsi]
1962 mov edx, [rsi + 4]
1963
1964 lock cmpxchg8b [rdi]
1965
1966 mov [rsi], eax
1967 mov [rsi + 4], edx
1968 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
1969
1970 pop rbx
1971 ret
1972
1973 %endif
1974%else
1975 push esi
1976 push edi
1977 push ebx
1978 push ebp
1979
1980 mov edi, ecx ; pu64Dst
1981 mov esi, edx ; pu64EaxEdx
; Stack arguments: 16 bytes for the four pushes above plus 4 for the return address.
1982 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1983 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1984
1985 mov ebx, [ecx]
1986 mov ecx, [ecx + 4]
1987 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1988 mov eax, [esi]
1989 mov edx, [esi + 4]
1990
1991 lock cmpxchg8b [edi]
1992
1993 mov [esi], eax
1994 mov [esi + 4], edx
1995 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, edi)
1996
1997 pop ebp
1998 pop ebx
1999 pop edi
2000 pop esi
2001 ret 8 ; removes the two stack arguments (pu64EbxEcx, pEFlags)
2002%endif
2003ENDPROC iemAImpl_cmpxchg8b_locked
2004
2005%ifdef RT_ARCH_AMD64
2006
2007;
2008; CMPXCHG16B.
2009;
2010; These are tricky register wise, so the code is duplicated for each calling
2011; convention.
2012;
2013; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
2014;
2015; C-proto:
2016; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2017; uint32_t *pEFlags));
2018;
2019; Note! Identical to iemAImpl_cmpxchg8b.
2020;
; In both variants the RBX:RCX pair is loaded before the flags macro and the
; RDX:RAX pair after it, since the flags macro clobbers T0 (eax).
2021BEGINCODE
2022BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
2023 %ifdef ASM_CALL64_MSC
2024 push rbx
2025
2026 mov r11, rdx ; pu64RaxRdx (is also T1)
2027 mov r10, rcx ; pu64Dst
2028
2029 mov rbx, [r8]
2030 mov rcx, [r8 + 8]
2031 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2032 mov rax, [r11]
2033 mov rdx, [r11 + 8]
2034
2035 cmpxchg16b [r10]
2036
2037 mov [r11], rax
2038 mov [r11 + 8], rdx
2039 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2040
2041 pop rbx
2042 ret
2043 %else
2044 push rbx
2045
2046 mov r10, rcx ; pEFlags
2047 mov r11, rdx ; pu64RbxRcx (is also T1)
2048
2049 mov rbx, [r11]
2050 mov rcx, [r11 + 8]
2051 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2052 mov rax, [rsi]
2053 mov rdx, [rsi + 8]
2054
2055 cmpxchg16b [rdi]
2056
2057 mov [rsi], rax
2058 mov [rsi + 8], rdx
2059 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2060
2061 pop rbx
2062 ret
2063
2064 %endif
2065ENDPROC iemAImpl_cmpxchg16b
2066
; Locked CMPXCHG16B worker.
;
; Same register choreography as iemAImpl_cmpxchg16b; the only difference is the
; LOCK prefix on the cmpxchg16b instruction.
2067BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
2068 %ifdef ASM_CALL64_MSC
 ; MSC: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
2069 push rbx ; rbx is needed for the low half of the replacement value.
2070
2071 mov r11, rdx ; pu128RaxRdx (r11 is also T1); frees rdx for the instruction.
2072 mov r10, rcx ; pu128Dst; frees rcx for the instruction.
2073
2074 mov rbx, [r8] ; replacement value, low qword
2075 mov rcx, [r8 + 8] ; replacement value, high qword
2076 IEM_MAYBE_LOAD_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2077 mov rax, [r11] ; comparand, low qword (loaded after the macro since it clobbers eax)
2078 mov rdx, [r11 + 8] ; comparand, high qword
2079
2080 lock cmpxchg16b [r10]
2081
2082 mov [r11], rax ; write RDX:RAX back before T1 (r11) is clobbered below.
2083 mov [r11 + 8], rdx
2084 IEM_SAVE_FLAGS_OLD r9, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11)
2085
2086 pop rbx
2087 ret
2088 %else
 ; GCC: rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
2089 push rbx ; rbx is needed for the low half of the replacement value.
2090
2091 mov r10, rcx ; pEFlags; frees rcx for the instruction.
2092 mov r11, rdx ; pu128RbxRcx (r11 is also T1); frees rdx for the instruction.
2093
2094 mov rbx, [r11] ; replacement value, low qword
2095 mov rcx, [r11 + 8] ; replacement value, high qword
2096 IEM_MAYBE_LOAD_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
2097 mov rax, [rsi] ; comparand, low qword (loaded after the macro since it clobbers eax)
2098 mov rdx, [rsi + 8] ; comparand, high qword
2099
2100 lock cmpxchg16b [rdi]
2101
2102 mov [rsi], rax ; write RDX:RAX back to pu128RaxRdx.
2103 mov [rsi + 8], rdx
2104 IEM_SAVE_FLAGS_OLD r10, (X86_EFL_ZF), 0, 0 ; clobbers T0+T1 (eax, r11) - r11 is no longer needed here.
2105
2106 pop rbx
2107 ret
2108
2109 %endif
2110ENDPROC iemAImpl_cmpxchg16b_locked
2111
2112%endif ; RT_ARCH_AMD64
2113
2114
2115;
2116; CMPXCHG.
2117;
2118; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
2119;
2120; C-proto:
2121; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
2122;
2122;
2123BEGINCODE
; IEMIMPL_CMPXCHG <prefix>, <suffix>
;
; Generates iemAImpl_cmpxchg_u8/u16/u32/u64 with the given name suffix.
;
; @param 1 Instruction prefix to put in front of cmpxchg (empty or 'lock').
; @param 2 Function name suffix (empty or '_locked').
;
; Each generated function takes a pointer to the destination in A0, a pointer
; to the accumulator value in A1 (read before and written back after the
; instruction), the replacement value in A2 and a pointer to EFLAGS in A3.
; On 32-bit hosts the u64 variant receives the replacement value by pointer.
2124%macro IEMIMPL_CMPXCHG 2
2125BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
2126 PROLOGUE_4_ARGS
2127 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2128 mov al, [A1] ; accumulator is loaded after the macro above since it clobbers eax.
2129 %1 cmpxchg [A0], A2_8
2130 mov [A1], al ; always write the accumulator back (unchanged on match).
2131 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2132 EPILOGUE_4_ARGS
2133ENDPROC iemAImpl_cmpxchg_u8 %+ %2
2134
2135BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
2136 PROLOGUE_4_ARGS
2137 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2138 mov ax, [A1]
2139 %1 cmpxchg [A0], A2_16
2140 mov [A1], ax
2141 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2142 EPILOGUE_4_ARGS
2143ENDPROC iemAImpl_cmpxchg_u16 %+ %2
2144
2145BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
2146 PROLOGUE_4_ARGS
2147 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2148 mov eax, [A1]
2149 %1 cmpxchg [A0], A2_32
2150 mov [A1], eax
2151 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2152 EPILOGUE_4_ARGS
2153ENDPROC iemAImpl_cmpxchg_u32 %+ %2
2154
2155BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
2156%ifdef RT_ARCH_AMD64
2157 PROLOGUE_4_ARGS
2158 IEM_MAYBE_LOAD_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2159 mov rax, [A1]
2160 %1 cmpxchg [A0], A2
2161 mov [A1], rax
2162 IEM_SAVE_FLAGS_OLD A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, r11/edi)
2163 EPILOGUE_4_ARGS
2164%else
2165 ;
2166 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
2167 ;
 ; Note! The instruction is always issued with a LOCK prefix here, also in
 ;       the variant instantiated with an empty prefix.
 ;
2168 push esi
2169 push edi
2170 push ebx
2171 push ebp
2172
2173 mov edi, ecx ; pu64Dst
2174 mov esi, edx ; pu64Rax
2175 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
2176 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
2177
2178 mov ebx, [ecx] ; replacement value, low dword
2179 mov ecx, [ecx + 4] ; replacement value, high dword
2180 IEM_MAYBE_LOAD_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
2181 mov eax, [esi] ; comparand, low dword
2182 mov edx, [esi + 4] ; comparand, high dword
2183
2184 lock cmpxchg8b [edi]
2185
2186 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
 ; cmpxchg8b sets ZF on a match and clears it on a mismatch, so the
 ; mismatch path must be taken when ZF is clear.  (This used to be 'jz',
 ; which reported every mismatch to the guest as a match with ZF=1.)
2187 jnz .cmpxchg8b_not_equal
 ; Match: comparand == destination, so produce the flags of 'cmp x, x'
 ; (ZF=1, PF=1, CF=AF=SF=OF=0).
2189 cmp eax, eax ; just set the other flags.
2190.store:
2191 mov [esi], eax ; mov does not modify EFLAGS, so the compare result survives.
2192 mov [esi + 4], edx
2193 IEM_SAVE_FLAGS_OLD ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0+T1 (eax, edi)
2194
2195 pop ebp
2196 pop ebx
2197 pop edi
2198 pop esi
2199 ret 8
2200
 ; Mismatch: edx:eax now holds the destination value while [esi] still
 ; holds the original comparand.  Compare comparand against destination,
 ; high dword first; ZF ends up clear on both exits since the 64-bit values
 ; differ.
2201.cmpxchg8b_not_equal:
2188;; @todo this isn't correct. Need to do a full 64-bit compare: the flags below come from a single 32-bit half only.
2202 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
2203 jne .store
2204 cmp [esi], eax
2205 jmp .store
2206
2207%endif
2208ENDPROC iemAImpl_cmpxchg_u64 %+ %2
2209%endmacro ; IEMIMPL_CMPXCHG
2210
; Instantiate the plain and the LOCK-prefixed variants.
2211IEMIMPL_CMPXCHG , ,
2212IEMIMPL_CMPXCHG lock, _locked
2213
2214
2215
2216;;
2217; Macro for implementing a unary operator.
2218;
2219; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
2220; variants, except on 32-bit system where the 64-bit accesses requires hand
2221; coding.
2222;
2223; All the functions take a pointer to the destination memory operand in A0
2224; and a pointer to eflags in A1.
2225;
2226; @param 1 The instruction mnemonic.
2227; @param 2 The modified flags.
2228; @param 3 The undefined flags.
2229;
; Every generated function has the same shape: load flags via the pointer in
; A1, execute the instruction on the memory operand at [A0] (optionally with a
; LOCK prefix), then save the resulting flags back via A1.
2230%macro IEMIMPL_UNARY_OP 3
2231BEGINCODE
; 8-bit, unlocked.
2232BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
2233 PROLOGUE_2_ARGS
2234 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2235 %1 byte [A0]
2236 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2237 EPILOGUE_2_ARGS
2238ENDPROC iemAImpl_ %+ %1 %+ _u8
2239
; 8-bit, locked.
2240BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
2241 PROLOGUE_2_ARGS
2242 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2243 lock %1 byte [A0]
2244 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2245 EPILOGUE_2_ARGS
2246ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
2247
; 16-bit, unlocked.
2248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
2249 PROLOGUE_2_ARGS
2250 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2251 %1 word [A0]
2252 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2253 EPILOGUE_2_ARGS
2254ENDPROC iemAImpl_ %+ %1 %+ _u16
2255
; 16-bit, locked.
2256BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
2257 PROLOGUE_2_ARGS
2258 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2259 lock %1 word [A0]
2260 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2261 EPILOGUE_2_ARGS
2262ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
2263
; 32-bit, unlocked.
2264BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
2265 PROLOGUE_2_ARGS
2266 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2267 %1 dword [A0]
2268 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2269 EPILOGUE_2_ARGS
2270ENDPROC iemAImpl_ %+ %1 %+ _u32
2271
; 32-bit, locked.
2272BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
2273 PROLOGUE_2_ARGS
2274 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2275 lock %1 dword [A0]
2276 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2277 EPILOGUE_2_ARGS
2278ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
2279
; The 64-bit variants are only generated here for 64-bit hosts.
2280 %ifdef RT_ARCH_AMD64
2281BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
2282 PROLOGUE_2_ARGS
2283 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2284 %1 qword [A0]
2285 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2286 EPILOGUE_2_ARGS
2287ENDPROC iemAImpl_ %+ %1 %+ _u64
2288
2289BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
2290 PROLOGUE_2_ARGS
2291 IEM_MAYBE_LOAD_FLAGS_OLD A1, %2, %3, 0
2292 lock %1 qword [A0]
2293 IEM_SAVE_FLAGS_OLD A1, %2, %3, 0
2294 EPILOGUE_2_ARGS
2295ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
2296 %endif ; RT_ARCH_AMD64
2297
2298%endmacro
2299
; Instantiations: mnemonic, modified flags, undefined flags.
; inc/dec leave CF out of the modified set, neg includes it, not touches none.
2300IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
2301IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
2302IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
2303IEMIMPL_UNARY_OP not, 0, 0
2304
2305
2306;
2307; BSWAP. No flag changes.
2308;
2309; Each function takes one argument, pointer to the value to bswap
2310; (input/output). They all return void.
2311;
; BSWAP with a 16-bit operand size.  A0 points to the value; a full dword is
; read from and written back to it.
2312BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
2313 PROLOGUE_1_ARGS
2314 mov T0_32, [A0] ; just in case any of the upper bits are used.
2315 db 66h ; operand-size prefix, hand-emitted so that the bswap below is encoded with a 16-bit operand size.
2316 bswap T0_32 ; NOTE(review): executes as a 16-bit bswap due to the prefix above - presumably to mirror what the host CPU does for this form.
2317 mov [A0], T0_32
2318 EPILOGUE_1_ARGS
2319ENDPROC iemAImpl_bswap_u16
2320
; Byte-swaps the 32-bit value A0 points to, in place.
2321BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
2322 PROLOGUE_1_ARGS
2323 mov T0_32, [A0] ; load
2324 bswap T0_32 ; reverse the four bytes
2325 mov [A0], T0_32 ; store back
2326 EPILOGUE_1_ARGS
2327ENDPROC iemAImpl_bswap_u32
2328
; Byte-swaps the 64-bit value A0 points to, in place.
2329BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
2330%ifdef RT_ARCH_AMD64
 ; 64-bit host: a single 64-bit bswap does the job.
2331 PROLOGUE_1_ARGS
2332 mov T0, [A0]
2333 bswap T0
2334 mov [A0], T0
2335 EPILOGUE_1_ARGS
2336%else
 ; 32-bit host: byte-swap each dword and store the two halves in swapped
 ; positions.
2337 PROLOGUE_1_ARGS
2338 mov T0, [A0] ; T0 = low dword
2339 mov T1, [A0 + 4] ; T1 = high dword
2340 bswap T0
2341 bswap T1
2342 mov [A0 + 4], T0 ; swapped low dword becomes the new high dword
2343 mov [A0], T1 ; swapped high dword becomes the new low dword
2344 EPILOGUE_1_ARGS
2345%endif
2346ENDPROC iemAImpl_bswap_u64
2347
2348
2349;;
2350; Macro for implementing a shift operation.
2351;
2352; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2353; 32-bit system where the 64-bit accesses requires hand coding.
2354;
2355; All the functions take the input eflags value in A0, a pointer to the
2356; destination memory operand in A1 and the shift count in A2.  The resulting
; eflags are handed back via IEM_SAVE_FLAGS_RETVAL.
2357;
2358; @param 1 The instruction mnemonic.
2359; @param 2 The modified flags.
2360; @param 3 The undefined flags.
2361; @param 4 Force load flags.
2362;
2363; Makes ASSUMPTIONS about A0, A1 and A2 assignments. Specifically, that with
2364; GCC/64 we're free to use RCX/CL as it isn't used for any arguments. While
2365; MSC/64 & 32-bit fastcall are using ECX for the first argument (fEFlagsIn),
2366; so we have to switch it around with the shift count parameter registers.
2367;
2368; @note the _intel and _amd variants are implemented in C.
2369;
; Generates iemAImpl_<mnemonic>_u8/u16/u32 (and _u64 on 64-bit hosts).
;
; In every variant the shift count has to end up in CL:
;   - GCC/64: the count is copied from A2 into cl, leaving A0 as the eflags
;     input.
;   - Otherwise: A0 and A2 are exchanged, so the count lands in A0 (whose low
;     byte is cl on these conventions) and the eflags input in A2.
2370%macro IEMIMPL_SHIFT_OP 4
2371BEGINCODE
2372BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
2373 PROLOGUE_3_ARGS
2374 %ifdef ASM_CALL64_GCC
2375 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2376 mov cl, A2_8 ; shift count -> cl
2377 %1 byte [A1], cl
2378 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2379 %else
2380 xchg A2, A0 ; shift count -> A0 (cl), eflags input -> A2
2381 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2382 %1 byte [A1], cl
2383 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2384 %endif
2385.zero_shift: ; NOTE(review): no jump to this label is visible in this macro; only the u8 variant defines it.
2386 EPILOGUE_3_ARGS
2387ENDPROC iemAImpl_ %+ %1 %+ _u8
2388
2389BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
2390 PROLOGUE_3_ARGS
2391 %ifdef ASM_CALL64_GCC
2392 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2393 mov cl, A2_8 ; shift count -> cl
2394 %1 word [A1], cl
2395 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2396 %else
2397 xchg A2, A0 ; shift count -> A0 (cl), eflags input -> A2
2398 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2399 %1 word [A1], cl
2400 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2401 %endif
2402 EPILOGUE_3_ARGS
2403ENDPROC iemAImpl_ %+ %1 %+ _u16
2404
2405BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
2406 PROLOGUE_3_ARGS
2407 %ifdef ASM_CALL64_GCC
2408 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2409 mov cl, A2_8 ; shift count -> cl
2410 %1 dword [A1], cl
2411 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2412 %else
2413 xchg A2, A0 ; shift count -> A0 (cl), eflags input -> A2
2414 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2415 %1 dword [A1], cl
2416 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2417 %endif
2418 EPILOGUE_3_ARGS
2419ENDPROC iemAImpl_ %+ %1 %+ _u32
2420
; The 64-bit variant is only generated here for 64-bit hosts.
2421 %ifdef RT_ARCH_AMD64
2422BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2423 PROLOGUE_3_ARGS
2424 %ifdef ASM_CALL64_GCC
2425 IEM_MAYBE_LOAD_FLAGS A0_32, %2, %3, %4
2426 mov cl, A2_8 ; shift count -> cl
2427 %1 qword [A1], cl
2428 IEM_SAVE_FLAGS_RETVAL A0_32, %2, %3, 0
2429 %else
2430 xchg A2, A0 ; shift count -> A0 (cl), eflags input -> A2
2431 IEM_MAYBE_LOAD_FLAGS A2_32, %2, %3, %4
2432 %1 qword [A1], cl
2433 IEM_SAVE_FLAGS_RETVAL A2_32, %2, %3, 0
2434 %endif
2435 EPILOGUE_3_ARGS
2436ENDPROC iemAImpl_ %+ %1 %+ _u64
2437 %endif ; RT_ARCH_AMD64
2438
2439%endmacro
2440
2441; These instructions will NOT modify flags if the masked shift count is zero
2442; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
2443; we have to force load all modified and undefined.
; Arguments: mnemonic, modified flags, undefined flags, force-loaded flags.
2444IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2445IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2446IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2447IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF | X86_EFL_OF
2448IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2449IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2450IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2451
2452
2453;;
2454; Macro for implementing a double precision shift operation.
2455;
2456; This will generate code for the 16, 32 and 64 bit accesses, except on
2457; 32-bit system where the 64-bit accesses requires hand coding.
2458;
2459; The functions takes the destination operand (r/m) in A0, the source (reg) in
2460; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
2461;
2462; @param 1 The instruction mnemonic.
2463; @param 2 The modified flags.
2464; @param 3 The undefined flags.
2465; @param 4 The force loaded flags.
2466;
2467; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
2468;
2469; @note the _intel and _amd variants are implemented in C.
2470;
; Generates iemAImpl_<mnemonic>_u16/u32 (and _u64 on 64-bit hosts).
;
; The shift count (A2) must be in CL for the instruction:
;   - GCC/64: A3 and A2 are exchanged around the instruction so the count sits
;     in A3 (whose low byte is cl there) and A3 is the eflags pointer again
;     afterwards.
;   - Otherwise: A0 and A2 are exchanged once; the count ends up in A0 (cl)
;     and the destination pointer in A2.
; xchg between registers leaves EFLAGS alone, so the flags loaded before and
; produced by the instruction survive these exchanges.
2471%macro IEMIMPL_SHIFT_DBL_OP 4
2472BEGINCODE
2473BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
2474 PROLOGUE_4_ARGS
2475 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2476 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2477 %ifdef ASM_CALL64_GCC
2478 xchg A3, A2 ; count -> A3 (cl), pEFlags -> A2
2479 %1 [A0], A1_16, cl
2480 xchg A3, A2 ; restore: pEFlags -> A3
2481 %else
2482 xchg A0, A2 ; count -> A0 (cl), destination pointer -> A2
2483 %1 [A2], A1_16, cl
2484 %endif
2485 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2486 EPILOGUE_4_ARGS
2487ENDPROC iemAImpl_ %+ %1 %+ _u16
2488
2489BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
2490 PROLOGUE_4_ARGS
2491 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2492 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2493 %ifdef ASM_CALL64_GCC
2494 xchg A3, A2 ; count -> A3 (cl), pEFlags -> A2
2495 %1 [A0], A1_32, cl
2496 xchg A3, A2 ; restore: pEFlags -> A3
2497 %else
2498 xchg A0, A2 ; count -> A0 (cl), destination pointer -> A2
2499 %1 [A2], A1_32, cl
2500 %endif
2501 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2502 EPILOGUE_4_ARGS
2503ENDPROC iemAImpl_ %+ %1 %+ _u32
2504
; The 64-bit variant is only generated here for 64-bit hosts.
2505 %ifdef RT_ARCH_AMD64
2506BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2507 PROLOGUE_4_ARGS
2508 ;IEM_LOAD_FLAGS_OLD A3, %4, %3
2509 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %4
2510 %ifdef ASM_CALL64_GCC
2511 xchg A3, A2 ; count -> A3 (cl), pEFlags -> A2
2512 %1 [A0], A1, cl
2513 xchg A3, A2 ; restore: pEFlags -> A3
2514 %else
2515 xchg A0, A2 ; count -> A0 (cl), destination pointer -> A2
2516 %1 [A2], A1, cl
2517 %endif
2518 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2519 EPILOGUE_4_ARGS_EX 12
2520ENDPROC iemAImpl_ %+ %1 %+ _u64
2521 %endif ; RT_ARCH_AMD64
2522
2523%endmacro
2524
2525; These instructions will NOT modify flags if the masked shift count is zero
2526; (the mask is 0x3f for 64-bit instructions and 0x1f for the others). Thus,
2527; we have to force load all modified and undefined.
; Arguments: mnemonic, modified flags, undefined flags, force-loaded flags.
2528IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2529IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), X86_EFL_STATUS_BITS
2530
2531
2532;;
2533; Macro for implementing a multiplication operations.
2534;
2535; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2536; 32-bit system where the 64-bit accesses requires hand coding.
2537;
2538; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2539; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2540; pointer to eflags in A3.
2541;
2542; The functions all return 0 so the caller can be used for div/idiv as well as
2543; for the mul/imul implementation.
2544;
2545; @param 1 The instruction mnemonic.
2546; @param 2 The modified flags.
2547; @param 3 The undefined flags.
2548; @param 4 Name suffix.
2549; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2550;
2551; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2552;
; Generates iemAImpl_<mnemonic>_u8/u16/u32<suffix> (and _u64 on 64-bit hosts).
;
; The wide variants store the low half of the product via A0 and the high half
; via A1.  Outside GCC/64 the A1 pointer is first copied to T1 because the
; instruction overwrites rDX with the high half of the product.  All variants
; return 0 in eax.
2553%macro IEMIMPL_MUL_OP 5
2554BEGINCODE
; 8-bit: A0 = pointer to AX, A1 = operand, A2 = pointer to eflags.
2555BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2556 PROLOGUE_3_ARGS
2557 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2558 mov al, [A0]
2559 %1 A1_8
2560 mov [A0], ax ; the 16-bit product goes back to the AX location.
2561 %if %5 != 1
2562 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2563 %else
2564 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX ; intel
2565 %endif
2566 xor eax, eax ; return 0
2567 EPILOGUE_3_ARGS
2568ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2569
; 16-bit: A0 = pointer to AX, A1 = pointer to DX, A2 = operand, A3 = pointer to eflags.
2570BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2571 PROLOGUE_4_ARGS
2572 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2573 mov ax, [A0]
2574 %ifdef ASM_CALL64_GCC
2575 %1 A2_16
2576 mov [A0], ax
2577 mov [A1], dx
2578 %else
2579 mov T1, A1 ; save the DX pointer, the instruction overwrites dx.
2580 %1 A2_16
2581 mov [A0], ax
2582 mov [T1], dx
2583 %endif
2584 %if %5 != 1
2585 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2586 %else
2587 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX ; intel
2588 %endif
2589 xor eax, eax ; return 0
2590 EPILOGUE_4_ARGS
2591ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2592
; 32-bit: same layout as the 16-bit variant, using EAX/EDX.
2593BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2594 PROLOGUE_4_ARGS
2595 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2596 mov eax, [A0]
2597 %ifdef ASM_CALL64_GCC
2598 %1 A2_32
2599 mov [A0], eax
2600 mov [A1], edx
2601 %else
2602 mov T1, A1 ; save the EDX pointer, the instruction overwrites edx.
2603 %1 A2_32
2604 mov [A0], eax
2605 mov [T1], edx
2606 %endif
2607 %if %5 != 1
2608 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2609 %else
2610 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX ; intel
2611 %endif
2612 xor eax, eax ; return 0
2613 EPILOGUE_4_ARGS
2614ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2615
2616 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
; 64-bit: same layout as the 16-bit variant, using RAX/RDX.
2617BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2618 PROLOGUE_4_ARGS
2619 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2620 mov rax, [A0]
2621 %ifdef ASM_CALL64_GCC
2622 %1 A2
2623 mov [A0], rax
2624 mov [A1], rdx
2625 %else
2626 mov T1, A1 ; save the RDX pointer, the instruction overwrites rdx.
2627 %1 A2
2628 mov [A0], rax
2629 mov [T1], rdx
2630 %endif
2631 %if %5 != 1
2632 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2633 %else
2634 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF_OLD A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX ; intel
2635 %endif
2636 xor eax, eax ; return 0
2637 EPILOGUE_4_ARGS_EX 12
2638ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2639 %endif ; RT_ARCH_AMD64
2640
2641%endmacro
2642
; Arguments: mnemonic, modified flags, undefined flags, name suffix, EFLAGS
; behaviour (0 = native, 1 = intel, 2 = AMD).
2643IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2644IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2645IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2646IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2647IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2648IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2649
2650
2651BEGINCODE
2652;;
2653; Worker function for negating the 64-bit number held in the 32-bit register
; pair T1:T0 (T1 = high dword, T0 = low dword).
;
; Uses the classic double-word negate idiom (neg hi / neg lo / sbb hi, 0)
; rather than bouncing the values through the stack with xchg, which is
; implicitly locked when one operand is in memory.
;
; EFLAGS are clobbered; callers must not rely on them after the call.
;
2654; @uses None (T0,T1)
2655BEGINPROC iemAImpl_negate_T0_T1_u32
2656 neg T1_32 ; high = -high
2657 neg T0_32 ; low = -low; CF is set iff the low half was non-zero.
2658 sbb T1_32, 0 ; propagate the borrow: high = ~high_orig when low was non-zero.
2663 ret
2664ENDPROC iemAImpl_negate_T0_T1_u32
2665
2666%ifdef RT_ARCH_AMD64
2667;;
2668; Worker function for negating the 128-bit number held in the 64-bit register
; pair T1:T0 (T1 = high qword, T0 = low qword).
;
; Uses the classic double-word negate idiom (neg hi / neg lo / sbb hi, 0)
; rather than bouncing the values through the stack with xchg, which is
; implicitly locked when one operand is in memory.
;
; EFLAGS are clobbered; callers must not rely on them after the call.
;
2669; @uses None (T0,T1)
2670BEGINPROC iemAImpl_negate_T0_T1_u64
2671 neg T1 ; high = -high
2672 neg T0 ; low = -low; CF is set iff the low half was non-zero.
2673 sbb T1, 0 ; propagate the borrow: high = ~high_orig when low was non-zero.
2678 ret
2679ENDPROC iemAImpl_negate_T0_T1_u64
2680%endif
2681
2682
2683;;
2684; Macro for implementing a division operations.
2685;
2686; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2687; 32-bit system where the 64-bit accesses requires hand coding.
2688;
2689; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2690; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2691; pointer to eflags in A3.
2692;
2693; The functions all return 0 on success and -1 if a divide error should be
2694; raised by the caller.
2695;
2696; @param 1 The instruction mnemonic.
2697; @param 2 The modified flags.
2698; @param 3 The undefined flags.
2699; @param 4 1 if signed, 0 if unsigned.
2700; @param 5 Function suffix.
2701; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2702; 2 for AMD (set AF, clear PF, ZF and SF).
2703;
2704; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2705;
2706%macro IEMIMPL_DIV_OP 6
2707BEGINCODE
; 8-bit division: A0 = pointer to AX (dividend in, AL=quotient/AH=remainder
; out), A1 = divisor, A2 = pointer to eflags.
; Returns 0 in eax on success, -1 when the divisor is zero or the quotient
; would not fit, in which case the instruction is not executed.
2708BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2709 PROLOGUE_3_ARGS
2710
2711 ; div by chainsaw check.
2712 and A1_32, 0xff ; Ensure it's zero extended to 16-bits for the idiv range check.
2713 jz .div_zero
2714
2715 ; Overflow check - unsigned division is simple to verify, haven't
2716 ; found a simple way to check signed division yet unfortunately.
2717 %if %4 == 0
2718 cmp [A0 + 1], A1_8 ; unsigned: the quotient fits only if AH (high byte of the dividend) < divisor.
2719 jae .div_overflow
2720 %else
 ; Signed: work on the magnitudes of dividend (T0_16) and divisor (A1_8)
 ; and pick the limit depending on whether the quotient is negative
 ; (.one_of_each) or positive (.both_positive).
2721 movzx T0_32, word [A0] ; T0 = dividend (zero extending to full register to simplify register aliasing)
2722 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2723 test A1_8, A1_8
2724 js .divisor_negative
2725 test T0_16, T0_16
2726 jns .both_positive
2727 neg T0_16
2728.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2729 push T0 ; Start off like unsigned below.
2730 shr T0_16, 7
2731 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2732 pop T0 ; pop does not touch EFLAGS, the compare result is still valid below.
2733 jb .div_no_overflow
2734 ja .div_overflow
2735 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2736 cmp T0_8, A1_8
2737 jae .div_overflow
2738 jmp .div_no_overflow
2739
2740.divisor_negative:
2741 neg A1_8 ; A1 = |divisor|; the original is restored from T1 further down.
2742 test T0_16, T0_16
2743 jns .one_of_each
2744 neg T0_16
2745.both_positive: ; Same as unsigned shifted by sign indicator bit.
2746 shr T0_16, 7
2747 cmp T0_16, A1_16 ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
2748 jae .div_overflow
2749.div_no_overflow:
2750 mov A1, T1 ; restore divisor
2751 %endif
2752
2753 IEM_MAYBE_LOAD_FLAGS_OLD A2, %2, %3, %3 ; Undefined flags may be passed thru (Intel)
2754 mov ax, [A0]
2755 %1 A1_8
2756 mov [A0], ax
2757 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2758 IEM_ADJUST_FLAGS_OLD A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2759 %else
2760 IEM_SAVE_FLAGS_OLD A2, %2, %3, 0
2761 %endif
2762 xor eax, eax ; return 0 (success)
2763
2764.return:
2765 EPILOGUE_3_ARGS
2766
2767.div_zero:
2768.div_overflow:
2769 mov eax, -1 ; tell the caller to raise a divide error.
2770 jmp .return
2771ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2772
; 16-bit division: A0 = pointer to AX, A1 = pointer to DX, A2 = divisor,
; A3 = pointer to eflags.
; Returns 0 in eax on success, -1 when the divisor is zero or the quotient
; would not fit, in which case the instruction is not executed.
2773BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2774 PROLOGUE_4_ARGS
2775
2776 ; div by chainsaw check.
 ; Note! This must operate on the 32-bit register: a 16-bit 'and' leaves
 ;       bits 31:16 untouched, and the signed range check below copies A2
 ;       to T1 and compares T1_32 as a 32-bit quantity.
2777 and A2_32, 0xffff ; Zero extend it for simpler sign overflow checks (see below).
2778 jz .div_zero
2779
2780 ; Overflow check - unsigned division is simple to verify, haven't
2781 ; found a simple way to check signed division yet unfortunately.
2782 %if %4 == 0
2783 cmp [A1], A2_16 ; unsigned: the quotient fits only if DX (high word of the dividend) < divisor.
2784 jae .div_overflow
2785 %else
 ; Signed: work on the magnitudes of dividend (T0_32 = DX:AX) and divisor
 ; (T1_16) and pick the limit depending on whether the quotient is
 ; negative (.one_of_each) or positive (.both_positive).
2786 movzx T0_32, word [A1] ; Zero extend to simplify register aliasing by clobbing the whole register.
2787 shl T0_32, 16
2788 mov T0_16, [A0] ; T0 = dividend
2789 mov T1, A2 ; T1 = divisor
2790 test T1_16, T1_16
2791 js .divisor_negative
2792 test T0_32, T0_32
2793 jns .both_positive
2794 neg T0_32
2795.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2796 push T0 ; Start off like unsigned below.
2797 shr T0_32, 15
2798 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2799 pop T0 ; pop does not touch EFLAGS, the compare result is still valid below.
2800 jb .div_no_overflow
2801 ja .div_overflow
2802 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2803 cmp T0_16, T1_16
2804 jae .div_overflow
2805 jmp .div_no_overflow
2806
2807.divisor_negative:
2808 neg T1_16 ; T1 = |divisor|; bits 31:16 stay zero thanks to the zero extension above.
2809 test T0_32, T0_32
2810 jns .one_of_each
2811 neg T0_32
2812.both_positive: ; Same as unsigned shifted by sign indicator bit.
2813 shr T0_32, 15
2814 cmp T0_32, T1_32 ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
2815 jae .div_overflow
2816.div_no_overflow:
2817 %endif
2818
2819 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
2820 %ifdef ASM_CALL64_GCC
2821 mov T1, A2 ; move the divisor out of the way, dx is loaded with the dividend below.
2822 mov ax, [A0]
2823 mov dx, [A1]
2824 %1 T1_16
2825 mov [A0], ax
2826 mov [A1], dx
2827 %else
2828 mov T1, A1 ; save the DX pointer, dx is loaded with the dividend below.
2829 mov ax, [A0]
2830 mov dx, [T1]
2831 %1 A2_16
2832 mov [A0], ax
2833 mov [T1], dx
2834 %endif
2835 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2836 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2837 %else
2838 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2839 %endif
2840 xor eax, eax ; return 0 (success)
2841
2842.return:
2843 EPILOGUE_4_ARGS
2844
2845.div_zero:
2846.div_overflow:
2847 mov eax, -1 ; tell the caller to raise a divide error.
2848 jmp .return
2849ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2850
; 32-bit division: A0 = pointer to EAX, A1 = pointer to EDX, A2 = divisor,
; A3 = pointer to eflags.
; Returns 0 in eax on success, -1 when the divisor is zero or the quotient
; would not fit, in which case the instruction is not executed.
2851BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2852 PROLOGUE_4_ARGS
2853
2854 ; div by chainsaw check.
2855 test A2_32, A2_32
2856 jz .div_zero
2857
2858 ; Overflow check - unsigned division is simple to verify, haven't
2859 ; found a simple way to check signed division yet unfortunately.
2860 %if %4 == 0
2861 cmp [A1], A2_32 ; unsigned: the quotient fits only if EDX (high dword of the dividend) < divisor.
2862 jae .div_overflow
2863 %else
 ; Signed: work on the magnitudes of dividend (T1:T0) and divisor (A2_32).
 ; A2 stays pushed until .div_no_overflow / .div_overflow pops it again.
2864 push A2 ; save A2 so we modify it (we out of regs on x86).
2865 mov T0_32, [A0] ; T0 = dividend low
2866 mov T1_32, [A1] ; T1 = dividend high
2867 ;test A2_32, A2_32 - we did this 5 instructions ago.
2868 js .divisor_negative ; push and mov leave EFLAGS alone, so SF still reflects the divisor.
2869 test T1_32, T1_32
2870 jns .both_positive
2871 call NAME(iemAImpl_negate_T0_T1_u32)
2872.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2873 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2874 jnz .div_overflow
2875 push T0 ; Start off like unsigned below.
2876 shl T1_32, 1
2877 shr T0_32, 31
2878 or T1_32, T0_32 ; T1 = |dividend| >> 31
2879 cmp T1_32, A2_32
2880 pop T0 ; pop does not touch EFLAGS, the compare result is still valid below.
2881 jb .div_no_overflow
2882 ja .div_overflow
2883 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2884 cmp T0_32, A2_32
2885 jae .div_overflow
2886 jmp .div_no_overflow
2887
2888.divisor_negative:
2889 neg A2_32 ; A2 = |divisor|; the original is restored by the pop further down.
2890 test T1_32, T1_32
2891 jns .one_of_each
2892 call NAME(iemAImpl_negate_T0_T1_u32)
2893.both_positive: ; Same as unsigned shifted by sign indicator bit.
2894 test T1_32, 0x80000000 ; neg 0x8000000000000000 = 0x8000000000000000
2895 jnz .div_overflow
2896 shl T1_32, 1
2897 shr T0_32, 31
2898 or T1_32, T0_32 ; T1 = |dividend| >> 31
2899 cmp T1_32, A2_32
2900 jae .div_overflow
2901.div_no_overflow:
2902 pop A2
2903 %endif
2904
2905 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 ; eax is loaded inside each calling convention branch below; the extra
 ; load that used to sit here was dead and has been removed.
2907 %ifdef ASM_CALL64_GCC
2908 mov T1, A2 ; move the divisor out of the way, edx is loaded with the dividend below.
2909 mov eax, [A0]
2910 mov edx, [A1]
2911 %1 T1_32
2912 mov [A0], eax
2913 mov [A1], edx
2914 %else
2915 mov T1, A1 ; save the EDX pointer, edx is loaded with the dividend below.
2916 mov eax, [A0]
2917 mov edx, [T1]
2918 %1 A2_32
2919 mov [A0], eax
2920 mov [T1], edx
2921 %endif
2922 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2923 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2924 %else
2925 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
2926 %endif
2927 xor eax, eax ; return 0 (success)
2928
2929.return:
2930 EPILOGUE_4_ARGS
2931
2932.div_overflow:
2933 %if %4 != 0
2934 pop A2 ; balance the push done by the signed range check.
2935 %endif
2936.div_zero:
2937 mov eax, -1 ; tell the caller to raise a divide error.
2938 jmp .return
2939ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2940
2941 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
; 64-bit division: A0 = pointer to RAX, A1 = pointer to RDX, A2 = divisor,
; A3 = pointer to eflags.
; Returns 0 in eax on success, -1 when the divisor is zero or the quotient
; would not fit, in which case the instruction is not executed.
2942BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2943 PROLOGUE_4_ARGS
2944
2945 test A2, A2
2946 jz .div_zero
2947 %if %4 == 0
2948 cmp [A1], A2 ; unsigned: the quotient fits only if RDX (high qword of the dividend) < divisor.
2949 jae .div_overflow
2950 %else
 ; Signed: work on the magnitudes of dividend (T1:T0) and divisor (A2).
 ; A2 stays pushed until .div_no_overflow / .div_overflow pops it again.
2951 push A2 ; save A2 so we modify it (we out of regs on x86).
2952 mov T0, [A0] ; T0 = dividend low
2953 mov T1, [A1] ; T1 = dividend high
2954 ;test A2, A2 - we did this five instructions above.
2955 js .divisor_negative ; push and mov leave EFLAGS alone, so SF still reflects the divisor.
2956 test T1, T1
2957 jns .both_positive
2958 call NAME(iemAImpl_negate_T0_T1_u64)
2959.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2960 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2961 jc .div_overflow
2962 push T0 ; Start off like unsigned below.
2963 shl T1, 1
2964 shr T0, 63
2965 or T1, T0 ; T1 = |dividend| >> 63
2966 cmp T1, A2
2967 pop T0 ; pop does not touch EFLAGS, the compare result is still valid below.
2968 jb .div_no_overflow
2969 ja .div_overflow
2970 mov T1, 0x7fffffffffffffff ; 'and' cannot take a 64-bit immediate, so go via T1.
2971 and T0, T1 ; Special case for covering (divisor - 1).
2972 cmp T0, A2
2973 jae .div_overflow
2974 jmp .div_no_overflow
2975
2976.divisor_negative:
2977 neg A2 ; A2 = |divisor|; the original is restored by the pop further down.
2978 test T1, T1
2979 jns .one_of_each
2980 call NAME(iemAImpl_negate_T0_T1_u64)
2981.both_positive: ; Same as unsigned shifted by sign indicator bit.
2982 bt T1, 63 ; neg 0x8000000000000000'0000000000000000 = same
2983 jc .div_overflow
2984 shl T1, 1
2985 shr T0, 63
2986 or T1, T0 ; T1 = |dividend| >> 63
2987 cmp T1, A2
2988 jae .div_overflow
2989.div_no_overflow:
2990 pop A2
2991 %endif
2992
2993 IEM_MAYBE_LOAD_FLAGS_OLD A3, %2, %3, %3 ; Undefined flags may be passed thru (AMD)
 ; rax is loaded inside each calling convention branch below; the extra
 ; load that used to sit here was dead and has been removed.
2995 %ifdef ASM_CALL64_GCC
2996 mov T1, A2 ; move the divisor out of the way, rdx is loaded with the dividend below.
2997 mov rax, [A0]
2998 mov rdx, [A1]
2999 %1 T1
3000 mov [A0], rax
3001 mov [A1], rdx
3002 %else
3003 mov T1, A1 ; save the RDX pointer, rdx is loaded with the dividend below.
3004 mov rax, [A0]
3005 mov rdx, [T1]
3006 %1 A2
3007 mov [A0], rax
3008 mov [T1], rdx
3009 %endif
3010 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
3011 IEM_ADJUST_FLAGS_OLD A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
3012 %else
3013 IEM_SAVE_FLAGS_OLD A3, %2, %3, 0
3014 %endif
3015 xor eax, eax ; return 0 (success)
3016
3017.return:
3018 EPILOGUE_4_ARGS_EX 12
3019
3020.div_overflow:
3021 %if %4 != 0
3022 pop A2 ; balance the push done by the signed range check.
3023 %endif
3024.div_zero:
3025 mov eax, -1 ; tell the caller to raise a divide error.
3026 jmp .return
3027ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
3028 %endif ; RT_ARCH_AMD64
3029
3030%endmacro
3031
; Arguments: mnemonic, modified flags, undefined flags, signed (1) or unsigned
; (0), function suffix, EFLAGS variation (0 = native, 1 = intel, 2 = AMD).
3032IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
3033IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
3034IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
3035;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
3036IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
3037IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
3038IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
3039
3040
3041;;
3042; Macro for implementing memory fence operation.
3043;
3044; No return value, no operands or anything.
3045;
3046; @param 1 The instruction.
3047;
; Generates iemAImpl_<instruction>: a function that executes the given fence
; instruction and returns.
3048%macro IEMIMPL_MEM_FENCE 1
3049BEGINCODE
3050BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
3051 %1 ; the fence instruction itself
3052 ret
3053ENDPROC iemAImpl_ %+ %1
3054%endmacro
3055
; One worker per fence instruction.
3056IEMIMPL_MEM_FENCE lfence
3057IEMIMPL_MEM_FENCE sfence
3058IEMIMPL_MEM_FENCE mfence
3059
3060;;
3061; Alternative for non-SSE2 host.
3062;
; Fence substitute that avoids the lfence/sfence/mfence instructions: it
; performs an xchg between xAX and a stack slot.  An xchg with a memory
; operand is implicitly locked on x86, which is presumably what provides the
; ordering here.  xAX and the stack pointer are unchanged on return.
3063BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
3064 push xAX ; create a scratch stack slot holding the current xAX.
3065 xchg xAX, [xSP] ; swap xAX with its own copy: no value changes, only the memory access matters.
3066 add xSP, xCB ; drop the scratch slot again.
3067 ret
3068ENDPROC iemAImpl_alt_mem_fence
3069
3070
3071;;
3072; Initialize the FPU for the actual instruction being emulated, this means
3073; loading parts of the guest's control word and status word.
3074;
3075; @uses 24 bytes of stack. T0, T1
3076; @param 1 Expression giving the address of the FXSTATE of the guest.
3077;
3078%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
 ; Snapshot the current FPU environment onto the stack, patch FCW and FSW
 ; in the snapshot, then load it back.
3079 fnstenv [xSP]
3080
3081 ; Control word: take the exception mask, precision and rounding bits from the guest.
3082 movzx T0_32, word [%1 + X86FXSTATE.FCW]
3083 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3084 mov [xSP + X86FSTENV32P.FCW], T0_16
3085
3086 ; Status word: guest condition codes (C0..C3) merged with the TOP field already in the snapshot.
3087 movzx T1_32, word [%1 + X86FXSTATE.FSW]
3088 and T1_32, X86_FSW_C_MASK
3089 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
3090 and T0_32, X86_FSW_TOP_MASK
3091 or T0_32, T1_32
3092 mov [xSP + X86FSTENV32P.FSW], T0_16
3093
3094 fldenv [xSP]
3095%endmacro
3096
3097
3098;;
3099; Initialize the FPU for the actual instruction being emulated, this means
3100; loading parts of the guest's control word, status word, and update the
3101; tag word for the top register if it's empty.
3102; The current environment is saved to [xSP], patched there and reloaded.
3103; ASSUMES actual TOP=7
3104;
3105; @uses 28 bytes of stack at [xSP] (32-bit FNSTENV image). T0, T1, EFLAGS
3106; @param 1 Expression giving the address of the FXSTATE of the guest.
3107;
3108%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
3109 fnstenv [xSP] ; save the current FPU environment so it can be patched
3110
3111 ; FCW - for exception, precision and rounding control.
3112 movzx T0_32, word [%1 + X86FXSTATE.FCW]
3113 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
3114 mov [xSP + X86FSTENV32P.FCW], T0_16
3115
3116 ; FSW - for undefined C0, C1, C2, and C3.
3117 movzx T1_32, word [%1 + X86FXSTATE.FSW] ; guest FSW: only the condition code bits are taken
3118 and T1_32, X86_FSW_C_MASK
3119 movzx T0_32, word [xSP + X86FSTENV32P.FSW] ; saved FSW: only the TOP field is kept
3120 and T0_32, X86_FSW_TOP_MASK
3121 or T0_32, T1_32
3122 mov [xSP + X86FSTENV32P.FSW], T0_16
3123
3124 ; FTW - Only for ST0 (in/out).
3125 movzx T1_32, word [%1 + X86FXSTATE.FSW] ; reload the guest FSW (T1 was masked above)
3126 shr T1_32, X86_FSW_TOP_SHIFT
3127 and T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, i.e. the physical register backing the guest's ST0
3128 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
3129 jc %%st0_not_empty
3130 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
3131%%st0_not_empty:
3132
3133 fldenv [xSP] ; activate the patched environment
3134%endmacro
3135
3136
3137;;
3138; Layout of the FPU result structure written by the workers below: an 80-bit value followed by the FSW.
3139; @todo Need to move this as well somewhere better?
3140struc IEMFPURESULT
3141 .r80Result resw 5 ; 80-bit result value (5 words)
3142 .FSW resw 1 ; resulting FPU status word
3143endstruc
3144
3145
3146;;
3147; Layout of the FPU result structure for instructions producing two 80-bit values plus the FSW.
3148; @todo Need to move this as well somewhere better?
3149struc IEMFPURESULTTWO
3150 .r80Result1 resw 5 ; first 80-bit result value (5 words)
3151 .FSW resw 1 ; resulting FPU status word
3152 .r80Result2 resw 5 ; second 80-bit result value (5 words)
3153endstruc
3154
3155
3156;
3157;---------------------- 16-bit signed integer operations ----------------------
3158;
3159
3160
3161;;
3162; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
3163;
3164; @param A0 FPU context (fxsave).
3165; @param A1 Pointer to a IEMFPURESULT for the output.
3166; @param A2 Pointer to the 16-bit signed integer value to convert.
3167;
3168BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
3169 PROLOGUE_3_ARGS
3170 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3171
3172 fninit ; start from a freshly initialized FPU
3173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3174 fild word [A2] ; push the integer as ST0
3175
3176 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3177 fnclex ; clear the exception flags before storing the result
3178 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3179
3180 fninit ; reinitialize the FPU before returning
3181 add xSP, 20h ; release the scratch area
3182 EPILOGUE_3_ARGS
3183ENDPROC iemAImpl_fild_r80_from_i16
3184
3185
3186;;
3187; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
3188;
3189; @param A0 FPU context (fxsave).
3190; @param A1 Where to return the output FSW.
3191; @param A2 Where to store the 16-bit signed integer value.
3192; @param A3 Pointer to the 80-bit value.
3193;
3194BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
3195 PROLOGUE_4_ARGS
3196 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3197
3198 fninit ; start from a freshly initialized FPU
3199 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3200 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3201 fistp word [A2] ; convert ST0 to a 16-bit integer and store it
3202
3203 fnstsw word [A1] ; return the resulting status word
3204
3205 fninit ; reinitialize the FPU before returning
3206 add xSP, 20h ; release the scratch area
3207 EPILOGUE_4_ARGS
3208ENDPROC iemAImpl_fist_r80_to_i16
3209
3210
3211;;
3212; Store a 80-bit floating point value (register) as a 16-bit signed integer
3213; (memory) with truncation.
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 16-bit signed integer value.
3218; @param A3 Pointer to the 80-bit value.
3219;
3220BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
3221 PROLOGUE_4_ARGS
3222 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3223
3224 fninit ; start from a freshly initialized FPU
3225 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3226 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3227 fisttp word [A2] ; truncating conversion of ST0 to a 16-bit integer
3228
3229 fnstsw word [A1] ; return the resulting status word
3230
3231 fninit ; reinitialize the FPU before returning
3232 add xSP, 20h ; release the scratch area
3233 EPILOGUE_4_ARGS
3234ENDPROC iemAImpl_fistt_r80_to_i16
3235
3236
3237;;
3238; FPU instruction working on one 80-bit and one 16-bit signed integer value.
3239;
3240; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_i16).
3241;
3242; @param A0 FPU context (fxsave).
3243; @param A1 Pointer to a IEMFPURESULT for the output.
3244; @param A2 Pointer to the 80-bit value.
3245; @param A3 Pointer to the 16-bit value.
3246;
3247%macro IEMIMPL_FPU_R80_BY_I16 1
3248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3249 PROLOGUE_4_ARGS
3250 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3251
3252 fninit ; start from a freshly initialized FPU
3253 fld tword [A2] ; ST0 = the 80-bit operand
3254 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3255 %1 word [A3] ; apply the instruction to ST0 and the 16-bit integer operand
3256
3257 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3258 fnclex ; clear the exception flags before storing the result
3259 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3260
3261 fninit ; reinitialize the FPU before returning
3262 add xSP, 20h ; release the scratch area
3263 EPILOGUE_4_ARGS
3264ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3265%endmacro
3266
3267IEMIMPL_FPU_R80_BY_I16 fiadd
3268IEMIMPL_FPU_R80_BY_I16 fimul
3269IEMIMPL_FPU_R80_BY_I16 fisub
3270IEMIMPL_FPU_R80_BY_I16 fisubr
3271IEMIMPL_FPU_R80_BY_I16 fidiv
3272IEMIMPL_FPU_R80_BY_I16 fidivr
3273
3274
3275;;
3276; FPU instruction working on one 80-bit and one 16-bit signed integer value,
3277; only returning FSW.
3278;
3279; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_i16).
3280;
3281; @param A0 FPU context (fxsave).
3282; @param A1 Where to store the output FSW.
3283; @param A2 Pointer to the 80-bit value.
3284; @param A3 Pointer to the 16-bit value.
3285;
3286%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
3287BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
3288 PROLOGUE_4_ARGS
3289 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3290
3291 fninit ; start from a freshly initialized FPU
3292 fld tword [A2] ; ST0 = the 80-bit operand
3293 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3294 %1 word [A3] ; apply the instruction to ST0 and the 16-bit integer operand
3295
3296 fnstsw word [A1] ; the status word is the only output
3297
3298 fninit ; reinitialize the FPU before returning
3299 add xSP, 20h ; release the scratch area
3300 EPILOGUE_4_ARGS
3301ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
3302%endmacro
3303
3304IEMIMPL_FPU_R80_BY_I16_FSW ficom
3305
3306
3307
3308;
3309;---------------------- 32-bit signed integer operations ----------------------
3310;
3311
3312
3313;;
3314; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
3315;
3316; @param A0 FPU context (fxsave).
3317; @param A1 Pointer to a IEMFPURESULT for the output.
3318; @param A2 Pointer to the 32-bit signed integer value to convert.
3319;
3320BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
3321 PROLOGUE_3_ARGS
3322 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3323
3324 fninit ; start from a freshly initialized FPU
3325 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3326 fild dword [A2] ; push the integer as ST0
3327
3328 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3329 fnclex ; clear the exception flags before storing the result
3330 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3331
3332 fninit ; reinitialize the FPU before returning
3333 add xSP, 20h ; release the scratch area
3334 EPILOGUE_3_ARGS
3335ENDPROC iemAImpl_fild_r80_from_i32
3336
3337
3338;;
3339; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
3340;
3341; @param A0 FPU context (fxsave).
3342; @param A1 Where to return the output FSW.
3343; @param A2 Where to store the 32-bit signed integer value.
3344; @param A3 Pointer to the 80-bit value.
3345;
3346BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
3347 PROLOGUE_4_ARGS
3348 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3349
3350 fninit ; start from a freshly initialized FPU
3351 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3352 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3353 fistp dword [A2] ; convert ST0 to a 32-bit integer and store it
3354
3355 fnstsw word [A1] ; return the resulting status word
3356
3357 fninit ; reinitialize the FPU before returning
3358 add xSP, 20h ; release the scratch area
3359 EPILOGUE_4_ARGS
3360ENDPROC iemAImpl_fist_r80_to_i32
3361
3362
3363;;
3364; Store a 80-bit floating point value (register) as a 32-bit signed integer
3365; (memory) with truncation.
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Where to return the output FSW.
3369; @param A2 Where to store the 32-bit signed integer value.
3370; @param A3 Pointer to the 80-bit value.
3371;
3372BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
3373 PROLOGUE_4_ARGS
3374 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3375
3376 fninit ; start from a freshly initialized FPU
3377 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3378 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3379 fisttp dword [A2] ; truncating conversion of ST0 to a 32-bit integer
3380
3381 fnstsw word [A1] ; return the resulting status word
3382
3383 fninit ; reinitialize the FPU before returning
3384 add xSP, 20h ; release the scratch area
3385 EPILOGUE_4_ARGS
3386ENDPROC iemAImpl_fistt_r80_to_i32
3387
3388
3389;;
3390; FPU instruction working on one 80-bit and one 32-bit signed integer value.
3391;
3392; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_i32).
3393;
3394; @param A0 FPU context (fxsave).
3395; @param A1 Pointer to a IEMFPURESULT for the output.
3396; @param A2 Pointer to the 80-bit value.
3397; @param A3 Pointer to the 32-bit value.
3398;
3399%macro IEMIMPL_FPU_R80_BY_I32 1
3400BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3401 PROLOGUE_4_ARGS
3402 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3403
3404 fninit ; start from a freshly initialized FPU
3405 fld tword [A2] ; ST0 = the 80-bit operand
3406 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3407 %1 dword [A3] ; apply the instruction to ST0 and the 32-bit integer operand
3408
3409 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3410 fnclex ; clear the exception flags before storing the result
3411 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3412
3413 fninit ; reinitialize the FPU before returning
3414 add xSP, 20h ; release the scratch area
3415 EPILOGUE_4_ARGS
3416ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3417%endmacro
3418
3419IEMIMPL_FPU_R80_BY_I32 fiadd
3420IEMIMPL_FPU_R80_BY_I32 fimul
3421IEMIMPL_FPU_R80_BY_I32 fisub
3422IEMIMPL_FPU_R80_BY_I32 fisubr
3423IEMIMPL_FPU_R80_BY_I32 fidiv
3424IEMIMPL_FPU_R80_BY_I32 fidivr
3425
3426
3427;;
3428; FPU instruction working on one 80-bit and one 32-bit signed integer value,
3429; only returning FSW.
3430;
3431; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_i32).
3432;
3433; @param A0 FPU context (fxsave).
3434; @param A1 Where to store the output FSW.
3435; @param A2 Pointer to the 80-bit value.
3436; @param A3 Pointer to the 32-bit value.
3437;
3438%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
3439BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
3440 PROLOGUE_4_ARGS
3441 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3442
3443 fninit ; start from a freshly initialized FPU
3444 fld tword [A2] ; ST0 = the 80-bit operand
3445 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3446 %1 dword [A3] ; apply the instruction to ST0 and the 32-bit integer operand
3447
3448 fnstsw word [A1] ; the status word is the only output
3449
3450 fninit ; reinitialize the FPU before returning
3451 add xSP, 20h ; release the scratch area
3452 EPILOGUE_4_ARGS
3453ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
3454%endmacro
3455
3456IEMIMPL_FPU_R80_BY_I32_FSW ficom
3457
3458
3459
3460;
3461;---------------------- 64-bit signed integer operations ----------------------
3462;
3463
3464
3465;;
3466; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
3467;
3468; @param A0 FPU context (fxsave).
3469; @param A1 Pointer to a IEMFPURESULT for the output.
3470; @param A2 Pointer to the 64-bit signed integer value to convert.
3471;
3472BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
3473 PROLOGUE_3_ARGS
3474 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3475
3476 fninit ; start from a freshly initialized FPU
3477 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3478 fild qword [A2] ; push the integer as ST0
3479
3480 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3481 fnclex ; clear the exception flags before storing the result
3482 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3483
3484 fninit ; reinitialize the FPU before returning
3485 add xSP, 20h ; release the scratch area
3486 EPILOGUE_3_ARGS
3487ENDPROC iemAImpl_fild_r80_from_i64
3488
3489
3490;;
3491; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
3492;
3493; @param A0 FPU context (fxsave).
3494; @param A1 Where to return the output FSW.
3495; @param A2 Where to store the 64-bit signed integer value.
3496; @param A3 Pointer to the 80-bit value.
3497;
3498BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
3499 PROLOGUE_4_ARGS
3500 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3501
3502 fninit ; start from a freshly initialized FPU
3503 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3504 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3505 fistp qword [A2] ; convert ST0 to a 64-bit integer and store it
3506
3507 fnstsw word [A1] ; return the resulting status word
3508
3509 fninit ; reinitialize the FPU before returning
3510 add xSP, 20h ; release the scratch area
3511 EPILOGUE_4_ARGS
3512ENDPROC iemAImpl_fist_r80_to_i64
3513
3514
3515;;
3516; Store a 80-bit floating point value (register) as a 64-bit signed integer
3517; (memory) with truncation.
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Where to return the output FSW.
3521; @param A2 Where to store the 64-bit signed integer value.
3522; @param A3 Pointer to the 80-bit value.
3523;
3524BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
3525 PROLOGUE_4_ARGS
3526 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3527
3528 fninit ; start from a freshly initialized FPU
3529 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3530 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3531 fisttp qword [A2] ; truncating conversion of ST0 to a 64-bit integer
3532
3533 fnstsw word [A1] ; return the resulting status word
3534
3535 fninit ; reinitialize the FPU before returning
3536 add xSP, 20h ; release the scratch area
3537 EPILOGUE_4_ARGS
3538ENDPROC iemAImpl_fistt_r80_to_i64
3539
3540
3541
3542;
3543;---------------------- 32-bit floating point operations ----------------------
3544;
3545
3546;;
3547; Converts a 32-bit floating point value to a 80-bit one (fpu register).
3548;
3549; @param A0 FPU context (fxsave).
3550; @param A1 Pointer to a IEMFPURESULT for the output.
3551; @param A2 Pointer to the 32-bit floating point value to convert.
3552;
3553BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
3554 PROLOGUE_3_ARGS
3555 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3556
3557 fninit ; start from a freshly initialized FPU
3558 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3559 fld dword [A2] ; push the 32-bit value as ST0
3560
3561 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3562 fnclex ; clear the exception flags before storing the result
3563 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3564
3565 fninit ; reinitialize the FPU before returning
3566 add xSP, 20h ; release the scratch area
3567 EPILOGUE_3_ARGS
3568ENDPROC iemAImpl_fld_r80_from_r32
3569
3570
3571;;
3572; Store a 80-bit floating point value (register) as a 32-bit one (memory).
3573;
3574; @param A0 FPU context (fxsave).
3575; @param A1 Where to return the output FSW.
3576; @param A2 Where to store the 32-bit value.
3577; @param A3 Pointer to the 80-bit value.
3578;
3579BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
3580 PROLOGUE_4_ARGS
3581 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3582
3583 fninit ; start from a freshly initialized FPU
3584 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3585 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3586 fst dword [A2] ; store ST0 as a 32-bit value (no pop; the fninit below resets the FPU)
3587
3588 fnstsw word [A1] ; return the resulting status word
3589
3590 fninit ; reinitialize the FPU before returning
3591 add xSP, 20h ; release the scratch area
3592 EPILOGUE_4_ARGS
3593ENDPROC iemAImpl_fst_r80_to_r32
3594
3595
3596;;
3597; FPU instruction working on one 80-bit and one 32-bit floating point value.
3598;
3599; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r32).
3600;
3601; @param A0 FPU context (fxsave).
3602; @param A1 Pointer to a IEMFPURESULT for the output.
3603; @param A2 Pointer to the 80-bit value.
3604; @param A3 Pointer to the 32-bit value.
3605;
3606%macro IEMIMPL_FPU_R80_BY_R32 1
3607BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3608 PROLOGUE_4_ARGS
3609 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3610
3611 fninit ; start from a freshly initialized FPU
3612 fld tword [A2] ; ST0 = the 80-bit operand
3613 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3614 %1 dword [A3] ; apply the instruction to ST0 and the 32-bit operand
3615
3616 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3617 fnclex ; clear the exception flags before storing the result
3618 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3619
3620 fninit ; reinitialize the FPU before returning
3621 add xSP, 20h ; release the scratch area
3622 EPILOGUE_4_ARGS
3623ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3624%endmacro
3625
3626IEMIMPL_FPU_R80_BY_R32 fadd
3627IEMIMPL_FPU_R80_BY_R32 fmul
3628IEMIMPL_FPU_R80_BY_R32 fsub
3629IEMIMPL_FPU_R80_BY_R32 fsubr
3630IEMIMPL_FPU_R80_BY_R32 fdiv
3631IEMIMPL_FPU_R80_BY_R32 fdivr
3632
3633
3634;;
3635; FPU instruction working on one 80-bit and one 32-bit floating point value,
3636; only returning FSW.
3637;
3638; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r32).
3639;
3640; @param A0 FPU context (fxsave).
3641; @param A1 Where to store the output FSW.
3642; @param A2 Pointer to the 80-bit value.
3643; @param A3 Pointer to the 32-bit value.
3644;
3645%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3647 PROLOGUE_4_ARGS
3648 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3649
3650 fninit ; start from a freshly initialized FPU
3651 fld tword [A2] ; ST0 = the 80-bit operand
3652 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3653 %1 dword [A3] ; apply the instruction to ST0 and the 32-bit operand
3654
3655 fnstsw word [A1] ; the status word is the only output
3656
3657 fninit ; reinitialize the FPU before returning
3658 add xSP, 20h ; release the scratch area
3659 EPILOGUE_4_ARGS
3660ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3661%endmacro
3662
3663IEMIMPL_FPU_R80_BY_R32_FSW fcom
3664
3665
3666
3667;
3668;---------------------- 64-bit floating point operations ----------------------
3669;
3670
3671;;
3672; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3673;
3674; @param A0 FPU context (fxsave).
3675; @param A1 Pointer to a IEMFPURESULT for the output.
3676; @param A2 Pointer to the 64-bit floating point value to convert.
3677;
3678BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3679 PROLOGUE_3_ARGS
3680 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3681
3682 fninit ; start from a freshly initialized FPU
3683 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3684 fld qword [A2] ; push the 64-bit value as ST0
3685
3686 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3687 fnclex ; clear the exception flags before storing the result
3688 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3689
3690 fninit ; reinitialize the FPU before returning
3691 add xSP, 20h ; release the scratch area
3692 EPILOGUE_3_ARGS
3693ENDPROC iemAImpl_fld_r80_from_r64
3694
3695
3696;;
3697; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3698;
3699; @param A0 FPU context (fxsave).
3700; @param A1 Where to return the output FSW.
3701; @param A2 Where to store the 64-bit value.
3702; @param A3 Pointer to the 80-bit value.
3703;
3704BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3705 PROLOGUE_4_ARGS
3706 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3707
3708 fninit ; start from a freshly initialized FPU
3709 fld tword [A3] ; ST0 = input value (loaded before the guest FCW is applied)
3710 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3711 fst qword [A2] ; store ST0 as a 64-bit value (no pop; the fninit below resets the FPU)
3712
3713 fnstsw word [A1] ; return the resulting status word
3714
3715 fninit ; reinitialize the FPU before returning
3716 add xSP, 20h ; release the scratch area
3717 EPILOGUE_4_ARGS
3718ENDPROC iemAImpl_fst_r80_to_r64
3719
3720
3721;;
3722; FPU instruction working on one 80-bit and one 64-bit floating point value.
3723;
3724; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r64).
3725;
3726; @param A0 FPU context (fxsave).
3727; @param A1 Pointer to a IEMFPURESULT for the output.
3728; @param A2 Pointer to the 80-bit value.
3729; @param A3 Pointer to the 64-bit value.
3730;
3731%macro IEMIMPL_FPU_R80_BY_R64 1
3732BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3733 PROLOGUE_4_ARGS
3734 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3735
3736 fninit ; start from a freshly initialized FPU
3737 fld tword [A2] ; ST0 = the 80-bit operand
3738 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3739 %1 qword [A3] ; apply the instruction to ST0 and the 64-bit operand
3740
3741 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3742 fnclex ; clear the exception flags before storing the result
3743 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3744
3745 fninit ; reinitialize the FPU before returning
3746 add xSP, 20h ; release the scratch area
3747 EPILOGUE_4_ARGS
3748ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3749%endmacro
3750
3751IEMIMPL_FPU_R80_BY_R64 fadd
3752IEMIMPL_FPU_R80_BY_R64 fmul
3753IEMIMPL_FPU_R80_BY_R64 fsub
3754IEMIMPL_FPU_R80_BY_R64 fsubr
3755IEMIMPL_FPU_R80_BY_R64 fdiv
3756IEMIMPL_FPU_R80_BY_R64 fdivr
3757
3758;;
3759; FPU instruction working on one 80-bit and one 64-bit floating point value,
3760; only returning FSW.
3761;
3762; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r64).
3763;
3764; @param A0 FPU context (fxsave).
3765; @param A1 Where to store the output FSW.
3766; @param A2 Pointer to the 80-bit value.
3767; @param A3 Pointer to the 64-bit value.
3768;
3769%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3770BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3771 PROLOGUE_4_ARGS
3772 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3773
3774 fninit ; start from a freshly initialized FPU
3775 fld tword [A2] ; ST0 = the 80-bit operand
3776 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3777 %1 qword [A3] ; apply the instruction to ST0 and the 64-bit operand
3778
3779 fnstsw word [A1] ; the status word is the only output
3780
3781 fninit ; reinitialize the FPU before returning
3782 add xSP, 20h ; release the scratch area
3783 EPILOGUE_4_ARGS
3784ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3785%endmacro
3786
3787IEMIMPL_FPU_R80_BY_R64_FSW fcom
3788
3789
3790
3791;
3792;---------------------- 80-bit floating point operations ----------------------
3793;
3794
3795;;
3796; Loads a 80-bit floating point register value from memory.
3797;
3798; @param A0 FPU context (fxsave).
3799; @param A1 Pointer to a IEMFPURESULT for the output.
3800; @param A2 Pointer to the 80-bit floating point value to load.
3801;
3802BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3803 PROLOGUE_3_ARGS
3804 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3805
3806 fninit ; start from a freshly initialized FPU
3807 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3808 fld tword [A2] ; push the 80-bit value as ST0
3809
3810 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3811 fnclex ; clear the exception flags before storing the result
3812 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3813
3814 fninit ; reinitialize the FPU before returning
3815 add xSP, 20h ; release the scratch area
3816 EPILOGUE_3_ARGS
3817ENDPROC iemAImpl_fld_r80_from_r80
3818
3819
3820;;
3821; Store a 80-bit floating point register to memory
3822;
3823; @param A0 FPU context (fxsave).
3824; @param A1 Where to return the output FSW.
3825; @param A2 Where to store the 80-bit value.
3826; @param A3 Pointer to the 80-bit register value.
3827;
3828BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3829 PROLOGUE_4_ARGS
3830 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3831
3832 fninit ; start from a freshly initialized FPU
3833 fld tword [A3] ; ST0 = register value (loaded before the guest FCW is applied)
3834 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3835 fstp tword [A2] ; store ST0 as an 80-bit value
3836
3837 fnstsw word [A1] ; return the resulting status word
3838
3839 fninit ; reinitialize the FPU before returning
3840 add xSP, 20h ; release the scratch area
3841 EPILOGUE_4_ARGS
3842ENDPROC iemAImpl_fst_r80_to_r80
3843
3844
3845;;
3846; Loads an 80-bit floating point register value in BCD format from memory.
3847;
3848; @param A0 FPU context (fxsave).
3849; @param A1 Pointer to a IEMFPURESULT for the output.
3850; @param A2 Pointer to the 80-bit BCD value to load.
3851;
3852BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3853 PROLOGUE_3_ARGS
3854 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3855
3856 fninit ; start from a freshly initialized FPU
3857 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3858 fbld tword [A2] ; push the BCD value as ST0
3859
3860 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3861 fnclex ; clear the exception flags before storing the result
3862 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3863
3864 fninit ; reinitialize the FPU before returning
3865 add xSP, 20h ; release the scratch area
3866 EPILOGUE_3_ARGS
3867ENDPROC iemAImpl_fld_r80_from_d80
3868
3869
3870;;
3871; Store a 80-bit floating point register to memory as BCD
3872;
3873; @param A0 FPU context (fxsave).
3874; @param A1 Where to return the output FSW.
3875; @param A2 Where to store the 80-bit BCD value.
3876; @param A3 Pointer to the 80-bit register value.
3877;
3878BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3879 PROLOGUE_4_ARGS
3880 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3881
3882 fninit ; start from a freshly initialized FPU
3883 fld tword [A3] ; ST0 = register value (loaded before the guest FCW is applied)
3884 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3885 fbstp tword [A2] ; store ST0 in BCD format
3886
3887 fnstsw word [A1] ; return the resulting status word
3888
3889 fninit ; reinitialize the FPU before returning
3890 add xSP, 20h ; release the scratch area
3891 EPILOGUE_4_ARGS
3892ENDPROC iemAImpl_fst_r80_to_d80
3893
3894
3895;;
3896; FPU instruction working on two 80-bit floating point values.
3897;
3898; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r80).
3899; @param 2 The operand list to give the instruction: {st0, st1}, or {} for none.
3900; @param A0 FPU context (fxsave).
3901; @param A1 Pointer to a IEMFPURESULT for the output.
3902; @param A2 Pointer to the first 80-bit value (ST0)
3903; @param A3 Pointer to the second 80-bit value (STn).
3904;
3905%macro IEMIMPL_FPU_R80_BY_R80 2
3906BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3907 PROLOGUE_4_ARGS
3908 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3909
3910 fninit ; start from a freshly initialized FPU
3911 fld tword [A3] ; second value: becomes ST1 after the next load
3912 fld tword [A2] ; first value: ST0
3913 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3914 %1 %2 ; the instruction with its operand list
3915
3916 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3917 fnclex ; clear the exception flags before storing the result
3918 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
3919
3920 fninit ; reinitialize the FPU before returning
3921 add xSP, 20h ; release the scratch area
3922 EPILOGUE_4_ARGS
3923ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3924%endmacro
3925
3926IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3927IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3928IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3929IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3930IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3931IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3932IEMIMPL_FPU_R80_BY_R80 fprem, {}
3933IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3934IEMIMPL_FPU_R80_BY_R80 fscale, {}
3935
3936
3937;;
3938; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3939; storing the result in ST1 and popping the stack.
3940;
3941; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r80).
3942;
3943; @param A0 FPU context (fxsave).
3944; @param A1 Pointer to a IEMFPURESULT for the output.
3945; @param A2 Pointer to the first 80-bit value (ST1).
3946; @param A3 Pointer to the second 80-bit value (ST0).
3947;
3948%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3949BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3950 PROLOGUE_4_ARGS
3951 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3952
3953 fninit ; start from a freshly initialized FPU
3954 fld tword [A2] ; first value: becomes ST1 after the next load
3955 fld tword [A3] ; second value: ST0
3956 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3957 %1 ; the instruction (implicit operands)
3958
3959 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
3960 fnclex ; clear the exception flags before storing the result
3961 fstp tword [A1 + IEMFPURESULT.r80Result] ; return the new ST0 as the 80-bit result
3962
3963 fninit ; reinitialize the FPU before returning
3964 add xSP, 20h ; release the scratch area
3965 EPILOGUE_4_ARGS
3966ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3967%endmacro
3968
3969IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3970IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3971IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3972
3973
3974;;
3975; FPU instruction working on two 80-bit floating point values, only
3976; returning FSW.
3977;
3978; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r80).
3979;
3980; @param A0 FPU context (fxsave).
3981; @param A1 Pointer to a uint16_t for the resulting FSW.
3982; @param A2 Pointer to the first 80-bit value.
3983; @param A3 Pointer to the second 80-bit value.
3984;
3985%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3986BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3987 PROLOGUE_4_ARGS
3988 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
3989
3990 fninit ; start from a freshly initialized FPU
3991 fld tword [A3] ; second value: becomes ST1 after the next load
3992 fld tword [A2] ; first value: ST0
3993 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
3994 %1 st0, st1 ; the instruction on ST0 and ST1
3995
3996 fnstsw word [A1] ; the status word is the only output
3997
3998 fninit ; reinitialize the FPU before returning
3999 add xSP, 20h ; release the scratch area
4000 EPILOGUE_4_ARGS
4001ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4002%endmacro
4003
4004IEMIMPL_FPU_R80_BY_R80_FSW fcom
4005IEMIMPL_FPU_R80_BY_R80_FSW fucom
4006
4007
4008;;
4009; FPU instruction working on two 80-bit floating point values,
4010; returning FSW and EFLAGS (eax).
4011;
4012; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_by_r80).
4013;
4014; @returns EFLAGS in EAX.
4015; @param A0 FPU context (fxsave).
4016; @param A1 Pointer to a uint16_t for the resulting FSW.
4017; @param A2 Pointer to the first 80-bit value.
4018; @param A3 Pointer to the second 80-bit value.
4019;
4020%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
4021BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
4022 PROLOGUE_4_ARGS
4023 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
4024
4025 fninit ; start from a freshly initialized FPU
4026 fld tword [A3] ; second value: becomes ST1 after the next load
4027 fld tword [A2] ; first value: ST0
4028 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
4029 %1 st1 ; the instruction with ST1 as its operand
4030
4031 fnstsw word [A1] ; return the resulting status word
4032 pushf ; capture the flags register right after the instruction
4033 pop xAX ; and return it in xAX
4034
4035 fninit ; reinitialize the FPU before returning
4036 add xSP, 20h ; release the scratch area (xAX already holds the captured flags)
4037 EPILOGUE_4_ARGS
4038ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
4039%endmacro
4040
4041IEMIMPL_FPU_R80_BY_R80_EFL fcomi
4042IEMIMPL_FPU_R80_BY_R80_EFL fucomi
4043
4044
4045;;
4046; FPU instruction working on one 80-bit floating point value.
4047;
4048; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80).
4049;
4050; @param A0 FPU context (fxsave).
4051; @param A1 Pointer to a IEMFPURESULT for the output.
4052; @param A2 Pointer to the 80-bit value.
4053;
4054%macro IEMIMPL_FPU_R80 1
4055BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4056 PROLOGUE_3_ARGS
4057 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
4058
4059 fninit ; start from a freshly initialized FPU
4060 fld tword [A2] ; ST0 = the 80-bit operand
4061 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
4062 %1 ; the instruction (implicit operand)
4063
4064 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
4065 fnclex ; clear the exception flags before storing the result
4066 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
4067
4068 fninit ; reinitialize the FPU before returning
4069 add xSP, 20h ; release the scratch area
4070 EPILOGUE_3_ARGS
4071ENDPROC iemAImpl_ %+ %1 %+ _r80
4072%endmacro
4073
4074IEMIMPL_FPU_R80 fchs
4075IEMIMPL_FPU_R80 fabs
4076IEMIMPL_FPU_R80 f2xm1
4077IEMIMPL_FPU_R80 fsqrt
4078IEMIMPL_FPU_R80 frndint
4079IEMIMPL_FPU_R80 fsin
4080IEMIMPL_FPU_R80 fcos
4081
4082
4083;;
4084; FPU instruction working on one 80-bit floating point value, only
4085; returning FSW.
4086;
4087; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80).
4088; @param 2 Non-zero to also restore FTW (uses the _AND_FTW_0 variant of the FPU load macro).
4089;
4090; @param A0 FPU context (fxsave).
4091; @param A1 Pointer to a uint16_t for the resulting FSW.
4092; @param A2 Pointer to the 80-bit value.
4093;
4094%macro IEMIMPL_FPU_R80_FSW 2
4095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
4096 PROLOGUE_3_ARGS
4097 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
4098
4099 fninit ; start from a freshly initialized FPU
4100 fld tword [A2] ; ST0 = the 80-bit operand
4101%if %2 != 0
4102 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; guest FCW/FSW bits plus the tag of the guest's ST0
4103%else
4104 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW/FSW bits only
4105%endif
4106 %1 ; the instruction (implicit operand)
4107
4108 fnstsw word [A1] ; the status word is the only output
4109
4110 fninit ; reinitialize the FPU before returning
4111 add xSP, 20h ; release the scratch area
4112 EPILOGUE_3_ARGS
4113ENDPROC iemAImpl_ %+ %1 %+ _r80
4114%endmacro
4115
4116IEMIMPL_FPU_R80_FSW ftst, 0
4117IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
4118
4119
4120
4121;;
4122; FPU instruction loading a 80-bit floating point constant.
4123;
4124; @param 1 The instruction (the function is named iemAImpl_<instruction>).
4125;
4126; @param A0 FPU context (fxsave).
4127; @param A1 Pointer to a IEMFPURESULT for the output.
4128;
4129%macro IEMIMPL_FPU_R80_CONST 1
4130BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
4131 PROLOGUE_2_ARGS
4132 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
4133
4134 fninit ; start from a freshly initialized FPU
4135 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
4136 %1 ; push the constant as ST0
4137
4138 fnstsw word [A1 + IEMFPURESULT.FSW] ; return the resulting status word
4139 fnclex ; clear the exception flags before storing the result
4140 fstp tword [A1 + IEMFPURESULT.r80Result] ; return ST0 as the 80-bit result
4141
4142 fninit ; reinitialize the FPU before returning
4143 add xSP, 20h ; release the scratch area
4144 EPILOGUE_2_ARGS
4145ENDPROC iemAImpl_ %+ %1
4146%endmacro
4147
4148IEMIMPL_FPU_R80_CONST fld1
4149IEMIMPL_FPU_R80_CONST fldl2t
4150IEMIMPL_FPU_R80_CONST fldl2e
4151IEMIMPL_FPU_R80_CONST fldpi
4152IEMIMPL_FPU_R80_CONST fldlg2
4153IEMIMPL_FPU_R80_CONST fldln2
4154IEMIMPL_FPU_R80_CONST fldz
4155
4156
4157;;
4158; FPU instruction working on one 80-bit floating point value, outputting two.
4159;
4160; @param 1 The instruction (the function is named iemAImpl_<instruction>_r80_r80).
4161;
4162; @param A0 FPU context (fxsave).
4163; @param A1 Pointer to a IEMFPURESULTTWO for the output.
4164; @param A2 Pointer to the 80-bit value.
4165;
4166%macro IEMIMPL_FPU_R80_R80 1
4167BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
4168 PROLOGUE_3_ARGS
4169 sub xSP, 20h ; room for the FPU environment image the FPU_LD_* macro keeps at [xSP]
4170
4171 fninit ; start from a freshly initialized FPU
4172 fld tword [A2] ; ST0 = the 80-bit operand
4173 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; switch to the guest's FCW and FSW condition codes
4174 %1 ; the instruction (implicit operand)
4175
4176 fnstsw word [A1 + IEMFPURESULTTWO.FSW] ; return the resulting status word
4177 fnclex ; clear the exception flags before storing
4178 fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; ST0 is popped into the second result
4179 fnclex ; clear whatever the first store flagged
4180 fstp tword [A1 + IEMFPURESULTTWO.r80Result1] ; the value below it goes into the first result
4181
4182 fninit ; reinitialize the FPU before returning
4183 add xSP, 20h ; release the scratch area
4184 EPILOGUE_3_ARGS
4185ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
4186%endmacro
4187
4188IEMIMPL_FPU_R80_R80 fptan
4189IEMIMPL_FPU_R80_R80 fxtract
4190IEMIMPL_FPU_R80_R80 fsincos
4191
4192
4193
4194
4195;---------------------- SSE and MMX Operations ----------------------
4196
4197;; MMX worker entry/exit hooks; both currently expand to nothing. @todo what do we need to do for MMX?
4198%macro IEMIMPL_MMX_PROLOGUE 0
4199%endmacro
4200%macro IEMIMPL_MMX_EPILOGUE 0
4201%endmacro
4202
4203;; SSE worker entry/exit hooks; both currently expand to nothing. @todo what do we need to do for SSE?
4204%macro IEMIMPL_SSE_PROLOGUE 0
4205%endmacro
4206%macro IEMIMPL_SSE_EPILOGUE 0
4207%endmacro
4208
4209;; AVX worker entry/exit hooks; both currently expand to nothing. @todo what do we need to do for AVX?
4210%macro IEMIMPL_AVX_PROLOGUE 0
4211%endmacro
4212%macro IEMIMPL_AVX_EPILOGUE 0
4213%endmacro
4214
4215
4216;;
4217; Media instruction working on two full sized registers.
4218; Emits iemAImpl_<instruction>_u128 and, if requested, iemAImpl_<instruction>_u64.
4219; @param 1 The instruction
4220; @param 2 Whether there is an MMX variant (1) or not (0).
4221;
4222; @param A0 FPU context (fxsave). Not referenced by the generated code.
4223; @param A1 Pointer to the first media register size operand (input/output).
4224; @param A2 Pointer to the second media register size operand (input).
4225;
4226; @todo r=aeichner Currently unused, can probably be removed.
4227;
4228%macro IEMIMPL_MEDIA_F2 2
4229%if %2 != 0
4230BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
4231 PROLOGUE_3_ARGS
4232 IEMIMPL_MMX_PROLOGUE
4233
4234 movq mm0, [A1] ; mm0 = first operand
4235 movq mm1, [A2] ; mm1 = second operand
4236 %1 mm0, mm1
4237 movq [A1], mm0 ; write the result back over the first operand
4238
4239 IEMIMPL_MMX_EPILOGUE
4240 EPILOGUE_3_ARGS
4241ENDPROC iemAImpl_ %+ %1 %+ _u64
4242%endif
4243
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_SSE_PROLOGUE
4247
4248 movdqu xmm0, [A1] ; xmm0 = first operand (unaligned load)
4249 movdqu xmm1, [A2] ; xmm1 = second operand (unaligned load)
4250 %1 xmm0, xmm1
4251 movdqu [A1], xmm0 ; write the result back over the first operand
4252
4253 IEMIMPL_SSE_EPILOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256%endmacro
4257
4258;;
4259; Media instruction working on two full sized registers, but no FXSAVE state argument.
4260; Emits iemAImpl_<instruction>_u128 and, if requested, iemAImpl_<instruction>_u64.
4261; @param 1 The instruction
4262; @param 2 Whether there is an MMX variant (1) or not (0).
4263;
4264; @param A0 Pointer to the first media register size operand (input/output).
4265; @param A1 Pointer to the second media register size operand (input).
4266;
4267%macro IEMIMPL_MEDIA_OPT_F2 2
4268%if %2 != 0
4269BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
4270 PROLOGUE_2_ARGS
4271 IEMIMPL_MMX_PROLOGUE
4272
4273 movq mm0, [A0] ; mm0 = first operand
4274 movq mm1, [A1] ; mm1 = second operand
4275 %1 mm0, mm1
4276 movq [A0], mm0 ; write the result back over the first operand
4277
4278 IEMIMPL_MMX_EPILOGUE
4279 EPILOGUE_2_ARGS
4280ENDPROC iemAImpl_ %+ %1 %+ _u64
4281%endif
4282
4283BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
4284 PROLOGUE_2_ARGS
4285 IEMIMPL_SSE_PROLOGUE
4286
4287 movdqu xmm0, [A0] ; xmm0 = first operand (unaligned load)
4288 movdqu xmm1, [A1] ; xmm1 = second operand (unaligned load)
4289 %1 xmm0, xmm1
4290 movdqu [A0], xmm0 ; write the result back over the first operand
4291
4292 IEMIMPL_SSE_EPILOGUE
4293 EPILOGUE_2_ARGS
4294ENDPROC iemAImpl_ %+ %1 %+ _u128
4295%endmacro
4296
4297IEMIMPL_MEDIA_OPT_F2 pshufb, 1 ; 1: emit both the _u64 (MMX) and the _u128 (SSE) function
4298IEMIMPL_MEDIA_OPT_F2 pand, 1
4299IEMIMPL_MEDIA_OPT_F2 pandn, 1
4300IEMIMPL_MEDIA_OPT_F2 por, 1
4301IEMIMPL_MEDIA_OPT_F2 pxor, 1
4302IEMIMPL_MEDIA_OPT_F2 pcmpeqb, 1
4303IEMIMPL_MEDIA_OPT_F2 pcmpeqw, 1
4304IEMIMPL_MEDIA_OPT_F2 pcmpeqd, 1
4305IEMIMPL_MEDIA_OPT_F2 pcmpeqq, 0 ; 0: emit the _u128 (SSE) function only
4306IEMIMPL_MEDIA_OPT_F2 pcmpgtb, 1
4307IEMIMPL_MEDIA_OPT_F2 pcmpgtw, 1
4308IEMIMPL_MEDIA_OPT_F2 pcmpgtd, 1
4309IEMIMPL_MEDIA_OPT_F2 pcmpgtq, 0
4310IEMIMPL_MEDIA_OPT_F2 paddb, 1
4311IEMIMPL_MEDIA_OPT_F2 paddw, 1
4312IEMIMPL_MEDIA_OPT_F2 paddd, 1
4313IEMIMPL_MEDIA_OPT_F2 paddq, 1
4314IEMIMPL_MEDIA_OPT_F2 paddsb, 1
4315IEMIMPL_MEDIA_OPT_F2 paddsw, 1
4316IEMIMPL_MEDIA_OPT_F2 paddusb, 1
4317IEMIMPL_MEDIA_OPT_F2 paddusw, 1
4318IEMIMPL_MEDIA_OPT_F2 psubb, 1
4319IEMIMPL_MEDIA_OPT_F2 psubw, 1
4320IEMIMPL_MEDIA_OPT_F2 psubd, 1
4321IEMIMPL_MEDIA_OPT_F2 psubq, 1
4322IEMIMPL_MEDIA_OPT_F2 psubsb, 1
4323IEMIMPL_MEDIA_OPT_F2 psubsw, 1
4324IEMIMPL_MEDIA_OPT_F2 psubusb, 1
4325IEMIMPL_MEDIA_OPT_F2 psubusw, 1
4326IEMIMPL_MEDIA_OPT_F2 pmullw, 1
4327IEMIMPL_MEDIA_OPT_F2 pmulld, 0
4328IEMIMPL_MEDIA_OPT_F2 pmulhw, 1
4329IEMIMPL_MEDIA_OPT_F2 pmaddwd, 1
4330IEMIMPL_MEDIA_OPT_F2 pminub, 1
4331IEMIMPL_MEDIA_OPT_F2 pminuw, 0
4332IEMIMPL_MEDIA_OPT_F2 pminud, 0
4333IEMIMPL_MEDIA_OPT_F2 pminsb, 0
4334IEMIMPL_MEDIA_OPT_F2 pminsw, 1
4335IEMIMPL_MEDIA_OPT_F2 pminsd, 0
4336IEMIMPL_MEDIA_OPT_F2 pmaxub, 1
4337IEMIMPL_MEDIA_OPT_F2 pmaxuw, 0
4338IEMIMPL_MEDIA_OPT_F2 pmaxud, 0
4339IEMIMPL_MEDIA_OPT_F2 pmaxsb, 0
4340IEMIMPL_MEDIA_OPT_F2 pmaxsw, 1
4341IEMIMPL_MEDIA_OPT_F2 pmaxsd, 0
4342IEMIMPL_MEDIA_OPT_F2 pabsb, 1
4343IEMIMPL_MEDIA_OPT_F2 pabsw, 1
4344IEMIMPL_MEDIA_OPT_F2 pabsd, 1
4345IEMIMPL_MEDIA_OPT_F2 psignb, 1
4346IEMIMPL_MEDIA_OPT_F2 psignw, 1
4347IEMIMPL_MEDIA_OPT_F2 psignd, 1
4348IEMIMPL_MEDIA_OPT_F2 phaddw, 1
4349IEMIMPL_MEDIA_OPT_F2 phaddd, 1
4350IEMIMPL_MEDIA_OPT_F2 phsubw, 1
4351IEMIMPL_MEDIA_OPT_F2 phsubd, 1
4352IEMIMPL_MEDIA_OPT_F2 phaddsw, 1
4353IEMIMPL_MEDIA_OPT_F2 phsubsw, 1
4354IEMIMPL_MEDIA_OPT_F2 pmaddubsw, 1
4355IEMIMPL_MEDIA_OPT_F2 pmulhrsw, 1
4356IEMIMPL_MEDIA_OPT_F2 pmuludq, 1
4357IEMIMPL_MEDIA_OPT_F2 packsswb, 1
4358IEMIMPL_MEDIA_OPT_F2 packssdw, 1
4359IEMIMPL_MEDIA_OPT_F2 packuswb, 1
4360IEMIMPL_MEDIA_OPT_F2 packusdw, 0
4361IEMIMPL_MEDIA_OPT_F2 psllw, 1
4362IEMIMPL_MEDIA_OPT_F2 pslld, 1
4363IEMIMPL_MEDIA_OPT_F2 psllq, 1
4364IEMIMPL_MEDIA_OPT_F2 psrlw, 1
4365IEMIMPL_MEDIA_OPT_F2 psrld, 1
4366IEMIMPL_MEDIA_OPT_F2 psrlq, 1
4367IEMIMPL_MEDIA_OPT_F2 psraw, 1
4368IEMIMPL_MEDIA_OPT_F2 psrad, 1
4369IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
4370IEMIMPL_MEDIA_OPT_F2 pavgb, 1
4371IEMIMPL_MEDIA_OPT_F2 pavgw, 1
4372IEMIMPL_MEDIA_OPT_F2 psadbw, 1
4373IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
4374IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
4375IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
4376IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
4377IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
4378IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
4379IEMIMPL_MEDIA_OPT_F2 aesimc, 0
4380IEMIMPL_MEDIA_OPT_F2 aesenc, 0
4381IEMIMPL_MEDIA_OPT_F2 aesdec, 0
4382IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
4383IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
4384IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
4385IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
4386IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
4387IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
4388IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
4389
;;
; Template for media instructions that combine a full sized register with the
; lower half of a second one (punpckl*).
;
; @param    1       The instruction mnemonic.
; @param    2       Non-zero to also emit the MMX (_u64) worker, 0 for SSE only.
;
; @param    A0      Pointer to the full sized first operand; read as input and
;                   overwritten with the result.
; @param    A1      Pointer to the second operand.  Only its lower half is of
;                   interest, but the code below loads it at full width.
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit MMX worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; mm0 = destination / first source
        movq    mm1, [A1]               ; mm1 = second source
        %1      mm0, mm1
        movq    [A0], mm0               ; write the result back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; xmm0 = destination / first source
        movdqu  xmm1, [A1]              ; xmm1 = second source
        %1      xmm0, xmm1
        movdqu  [A0], xmm0              ; write the result back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
4433
4434
;;
; Media instruction working on two half sized input registers (lower half) and a
; full sized destination register (vpunpckl*).
;
; Emits a 128-bit (_u128) and a 256-bit (_u256) AVX worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
4479
4480
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; Both operands are loaded in full, so the generated code is the same as for the
; low-half template and this macro simply forwards to IEMIMPL_MEDIA_F1L1.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper so these pick up any future
; change made to IEMIMPL_MEDIA_F1H1.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
4499
4500
;;
; Template for the three operand AVX high-half unpackers (vpunpckh*): two source
; registers of which only the upper halves matter, and a full sized destination.
;
; The sources are loaded in full, so this just forwards to the low-half
; template IEMIMPL_MEDIA_F1L1L1.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the full sized destination register (output only).
; @param    A1      Pointer to the first full sized source register; loaded in
;                   full although only the upper half is used as input.
; @param    A2      Pointer to the second full sized source register; loaded in
;                   full although only the upper half is used as input.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
        IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4521
4522
4523;
4524; Shufflers with evil 8-bit immediates.
4525;
4526
;;
; PSHUFW - MMX word shuffle with an 8-bit immediate.
;
; The immediate is only known at runtime, so the instruction is emitted once for
; each of the 256 possible values (.imm0 thru .imm255) and the matching stub is
; invoked via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    A0      Pointer to the 64-bit destination operand (output).
; @param    A1      Pointer to the 64-bit source operand (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! Seed the destination with the source like the
                                        ; SSE/AVX shufflers do; the stub overwrites mm0 anyway.
        ; NOTE(review): 5 is presumably the size of one stub (pshufw + ret) - confirm
        ; against the IEMIMPL_CALL_JUMP_TABLE_TARGET definition.
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 5
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:
ENDPROC iemAImpl_pshufw_u64
4549
4550
;;
; Template for the SSE shufflers taking an 8-bit immediate (pshufhw, pshuflw, pshufd).
;
; One stub per immediate value is generated (.imm0 thru .imm255); the stub
; matching the runtime immediate is invoked via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the 128-bit destination operand (output).
; @param    A1      Pointer to the 128-bit source operand (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]              ; xmm1 = source
        movdqu  xmm0, xmm1              ; paranoia!
        ; NOTE(review): 6 is presumably the size of one stub (instruction + ret).
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0              ; store the shuffled value

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 immediate specific stubs.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4580
4581
;;
; Template for the 256-bit AVX shufflers taking an 8-bit immediate
; (vpshufhw, vpshuflw, vpshufd).
;
; One stub per immediate value is generated (.imm0 thru .imm255); the stub
; matching the runtime immediate is invoked via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the 256-bit destination operand (output).
; @param    A1      Pointer to the 256-bit source operand (input).
; @param    A2      The 8-bit immediate; only the low byte is used.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded code below, so use the AVX bracket

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        ; NOTE(review): 6 is presumably the size of one stub (instruction + ret).
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4610
4611
4612;
4613; Shifts with evil 8-bit immediates.
4614;
4615
;;
; Template for the MMX shifts by an 8-bit immediate.
;
; One stub per immediate value is generated (.imm0 thru .imm255); the stub
; matching the runtime immediate is invoked via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the 64-bit operand being shifted (input/output).
; @param    A1      The 8-bit shift count; only the low byte is used.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]               ; mm0 = value to shift
        ; NOTE(review): 5 is presumably the size of one stub (instruction + ret).
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 5
        movq    [A0], mm0               ; write the shifted value back

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS

        ; The 256 immediate specific stubs.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4648
4649
;;
; Template for the SSE shifts by an 8-bit immediate.
;
; One stub per immediate value is generated (.imm0 thru .imm255); the stub
; matching the runtime immediate is invoked via IEMIMPL_CALL_JUMP_TABLE_TARGET.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the 128-bit operand being shifted (input/output).
; @param    A1      The 8-bit shift count; only the low byte is used.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; xmm0 = value to shift
        ; NOTE(review): 6 is presumably the size of one stub (instruction + ret).
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A1, 6
        movdqu  [A0], xmm0              ; write the shifted value back

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS

        ; The 256 immediate specific stubs.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4684
4685
4686;
4687; Move byte mask.
4688;
4689
;;
; PMOVMSKB, MMX flavour.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 64-bit MMX source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; mm1 = source
        pmovmskb T0, mm1                ; T0 = byte mask
        mov     [A0], T0                ; store the natural register width ...
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; ... and zero the upper dword on 32-bit hosts
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4703
;;
; PMOVMSKB, SSE flavour.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 128-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; xmm1 = source
        pmovmskb T0, xmm1               ; T0 = byte mask
        mov     [A0], T0                ; store the natural register width ...
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; ... and zero the upper dword on 32-bit hosts
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4717
;;
; VPMOVMSKB, 256-bit AVX flavour.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 256-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]              ; ymm1 = source
        vpmovmskb T0, ymm1              ; T0 = byte mask
        mov     [A0], T0                ; store the natural register width ...
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; ... and zero the upper dword on 32-bit hosts
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4731
4732
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA); not
;                   referenced by the code below.
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
; @todo r=aeichner Not used right now
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4772
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpshufb
IEMIMPL_MEDIA_OPT_F3 vpand
IEMIMPL_MEDIA_OPT_F3 vpminub
IEMIMPL_MEDIA_OPT_F3 vpminuw
IEMIMPL_MEDIA_OPT_F3 vpminud
IEMIMPL_MEDIA_OPT_F3 vpminsb
IEMIMPL_MEDIA_OPT_F3 vpminsw
IEMIMPL_MEDIA_OPT_F3 vpminsd
IEMIMPL_MEDIA_OPT_F3 vpmaxub
IEMIMPL_MEDIA_OPT_F3 vpmaxuw
IEMIMPL_MEDIA_OPT_F3 vpmaxud
IEMIMPL_MEDIA_OPT_F3 vpmaxsb
IEMIMPL_MEDIA_OPT_F3 vpmaxsw
IEMIMPL_MEDIA_OPT_F3 vpmaxsd
IEMIMPL_MEDIA_OPT_F3 vpandn
IEMIMPL_MEDIA_OPT_F3 vpor
IEMIMPL_MEDIA_OPT_F3 vpxor
IEMIMPL_MEDIA_OPT_F3 vpcmpeqb
IEMIMPL_MEDIA_OPT_F3 vpcmpeqw
IEMIMPL_MEDIA_OPT_F3 vpcmpeqd
IEMIMPL_MEDIA_OPT_F3 vpcmpeqq
IEMIMPL_MEDIA_OPT_F3 vpcmpgtb
IEMIMPL_MEDIA_OPT_F3 vpcmpgtw
IEMIMPL_MEDIA_OPT_F3 vpcmpgtd
IEMIMPL_MEDIA_OPT_F3 vpcmpgtq
IEMIMPL_MEDIA_OPT_F3 vpaddb
IEMIMPL_MEDIA_OPT_F3 vpaddw
IEMIMPL_MEDIA_OPT_F3 vpaddd
IEMIMPL_MEDIA_OPT_F3 vpaddq
IEMIMPL_MEDIA_OPT_F3 vpsubb
IEMIMPL_MEDIA_OPT_F3 vpsubw
IEMIMPL_MEDIA_OPT_F3 vpsubd
IEMIMPL_MEDIA_OPT_F3 vpsubq
IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
IEMIMPL_MEDIA_OPT_F3 vpermilps
IEMIMPL_MEDIA_OPT_F3 vpermilpd
IEMIMPL_MEDIA_OPT_F3 vpmaddwd
IEMIMPL_MEDIA_OPT_F3 vpsrlvd
IEMIMPL_MEDIA_OPT_F3 vpsrlvq
IEMIMPL_MEDIA_OPT_F3 vpsravd
IEMIMPL_MEDIA_OPT_F3 vpsllvd
IEMIMPL_MEDIA_OPT_F3 vpsllvq
4888
;;
; Media instruction working on one full sized source register, one full sized destination
; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
; of either 16, 32, or 64, it acts like the max shift size)
;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker; the shift count operand
; is loaded as 128 bits in both.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source operand, the 128-bit shift count (input).
;
%macro IEMIMPL_SHIFT_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu xmm1, [A2]              ; the count stays XMM sized
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
4938
4939
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;                   The _u256 worker is only emitted when this is 1.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4982
4983
;
; The SSE 4.2 crc32
;
; Each worker loads the 32-bit value at [A0], feeds the source operand into it
; with crc32, and stores the 32-bit result back to [A0].
;
; @param    A0      Pointer to the 32-bit destination (input/output).
; @param    A1      The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_8             ; accumulate one byte
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4999
;;
; CRC32 with a 16-bit source operand.
;
; @param    A0      Pointer to the 32-bit CRC value (input/output).
; @param    A1      The 16-bit source operand (low word of the register).
;
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_16            ; accumulate one word
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
5009
;;
; CRC32 with a 32-bit source operand.
;
; @param    A0      Pointer to the 32-bit CRC value (input/output).
; @param    A1      The 32-bit source operand (low dword of the register).
;
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value
        crc32   T0_32, A1_32            ; accumulate one dword
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
5019
%ifdef RT_ARCH_AMD64
;;
; CRC32 with a 64-bit source operand (only built for AMD64).
;
; @param    A0      Pointer to the 32-bit CRC value (input/output).
; @param    A1      The 64-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; T0 = current CRC value (32-bit load)
        crc32   T0, A1                  ; 64-bit operand form
        mov     [A0], T0_32             ; only the low 32 bits are stored back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
5031
5032
;
; PTEST (SSE 4.1)
;
; Runs the test on the two operands and hands the resulting flags to
; IEM_SAVE_FLAGS_OLD for merging into the EFLAGS value A2 points to; nothing
; is written to the operands.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; xmm0 = first operand
        movdqu  xmm1, [A1]              ; xmm1 = second operand
        ptest   xmm0, xmm1
        ; NOTE(review): ZF|CF presumably are the flags taken over from the host and
        ; OF|AF|PF|SF the ones cleared - confirm against IEM_SAVE_FLAGS_OLD.
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
5052
;;
; VPTEST, 256-bit AVX flavour.
;
; Runs the test on the two operands and hands the resulting flags to
; IEM_SAVE_FLAGS_OLD for merging into the EFLAGS value A2 points to.
;
; @param    A0      Pointer to the first 256-bit source operand (read only).
; @param    A1      Pointer to the second 256-bit source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded code below, so use the AVX bracket

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
5065
5066
;;
; Template for the vtestp{s,d} instructions.
;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker.  Each runs the test on
; the two operands and hands the resulting flags to IEM_SAVE_FLAGS_OLD for
; merging into the EFLAGS value A2 points to.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
%macro IEMIMPL_VTESTP_S_D 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A0]              ; xmm0 = first operand
        vmovdqu xmm1, [A1]              ; xmm1 = second operand
        %1      xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A0]              ; ymm0 = first operand
        vmovdqu ymm1, [A1]              ; ymm1 = second operand
        %1      ymm0, ymm1
        IEM_SAVE_FLAGS_OLD A2, X86_EFL_ZF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_AF | X86_EFL_PF | X86_EFL_SF

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_VTESTP_S_D vtestps
IEMIMPL_VTESTP_S_D vtestpd
5105
5106
;;
; Template for the [v]pmov{s,z}x* instructions
;
; Emits three workers: the SSE one (iemAImpl_<insn>_u128) plus the 128-bit and
; 256-bit AVX ones (iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256).
;
; @param    1       The instruction (without the 'v' prefix).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      For the two _u128 workers: the source operand value (input).
;                   For the _u256 worker: pointer to the 128-bit source operand,
;                   as it is dereferenced there.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; legacy SSE store: this worker must not use VEX
                                        ; encodings, the host may lack AVX

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; A1 is a pointer in this variant
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
5166
5167
;;
; Sets up the host MXCSR for emulating an SSE/AVX instruction for the guest.
;
; The current MXCSR is saved in a 4 byte stack slot that remains allocated when
; the macro ends (SSE_AVX_ST_MXCSR reloads from it and releases it).  The value
; loaded is the guest's FZ, rounding control and DAZ bits with
; X86_MXCSR_XCPT_MASK or'ed in.
;
; @uses     T0, and 4 bytes of stack holding the original MXCSR value.
; @param    1       Expression giving the register holding the guest's MXCSR.
;
%macro SSE_AVX_LD_MXCSR 1
        ; Slot #1: the original MXCSR.  Not released here.
        sub     xSP, 4
        stmxcsr [xSP]

        ; T0 = the guest controlled bits we honour + the exception mask bits.
        mov     T0_32, %1
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; Slot #2: temporary, as ldmxcsr only takes a memory operand.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5187
5188
;;
; Produces the guest's new MXCSR value and reloads the MXCSR from the stack
; slot at the top of the stack (the one SSE_AVX_LD_MXCSR left allocated).
;
; The returned value is the X86_MXCSR_XCPT_FLAGS bits of the current MXCSR
; or'ed into the original guest value.
;
; @uses     4 bytes of temporary stack, plus whatever register parameter 1 names.
; @param    1       Expression giving the register to return the new guest's MXCSR value.
; @param    2       Expression giving the register holding original guest's MXCSR value.
;
; @note     Restores the stack pointer, i.e. releases the saved MXCSR slot.
;
%macro SSE_AVX_ST_MXCSR 2
        ; Read the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     %1, [xSP]
        add     xSP, 4

        ; Keep only the exception status flags and merge them into the guest value.
        and     %1, X86_MXCSR_XCPT_FLAGS
        or      %1, %2

        ; xSP now points at the saved MXCSR again: reload it and free the slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5210
5211
;;
; Floating point instruction working on two full sized registers.
;
; Always emits the SSE worker (iemAImpl_<insn>_u128).  Depending on parameter 2
; it additionally emits iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256.
;
; @param    1       The instruction
; @param    2       Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @returns  R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second media register size operand (input).
;
; NOTE(review): the workers declare 12 argument bytes although they take four
; arguments and the sibling FP templates declare 16 - confirm whether that
; matters for the decorated symbol names.
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
; AVX workers, three operand form.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
; AVX workers, two operand form.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1], ymm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps,    3
IEMIMPL_FP_F2 addpd,    3
IEMIMPL_FP_F2 mulps,    3
IEMIMPL_FP_F2 mulpd,    3
IEMIMPL_FP_F2 subps,    3
IEMIMPL_FP_F2 subpd,    3
IEMIMPL_FP_F2 minps,    3
IEMIMPL_FP_F2 minpd,    3
IEMIMPL_FP_F2 divps,    3
IEMIMPL_FP_F2 divpd,    3
IEMIMPL_FP_F2 maxps,    3
IEMIMPL_FP_F2 maxpd,    3
IEMIMPL_FP_F2 haddps,   3
IEMIMPL_FP_F2 haddpd,   3
IEMIMPL_FP_F2 hsubps,   3
IEMIMPL_FP_F2 hsubpd,   3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3
5321
5322
;;
; Unary operations instantiated through the binary IEMIMPL_FP_F2 template.
;
; They are treated as binary for now to keep things simple, so that the output
; result is always in sync with the register the result might get written to.
; A flag of 2 selects the two operand AVX form, 0 disables the AVX workers.
;
IEMIMPL_FP_F2 sqrtps,    2
IEMIMPL_FP_F2 rsqrtps,   2
IEMIMPL_FP_F2 sqrtpd,    2
IEMIMPL_FP_F2 rcpps,     2
IEMIMPL_FP_F2 cvtdq2ps,  2
IEMIMPL_FP_F2 cvtps2dq,  2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd,  0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq,  0 ; @todo AVX variants due to register size differences missing right now
5338
5339
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Emits the SSE worker (iemAImpl_<insn>_u128_r32) and the three operand AVX
; worker (iemAImpl_v<insn>_u128_r32).
;
; @param    1       The instruction
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; 32-bit load of the scalar operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]              ; 32-bit load of the scalar operand
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE            ; must pair with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
5393
5394
;;
; Template for scalar floating point instructions taking a full sized register
; and a double precision operand.
;
; Emits the SSE worker (iemAImpl_<insn>_u128_r64) and the three operand AVX
; worker (iemAImpl_v<insn>_u128_r64).
;
; @param    1       The instruction mnemonic.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0      The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
; SSE worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]              ; xmm0 = full sized first operand
        movq    xmm1, [A3]              ; xmm1 = 64-bit scalar operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; hand back the result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

; AVX worker.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]              ; xmm0 = full sized first operand
        vmovq   xmm1, [A3]              ; xmm1 = 64-bit scalar operand
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0              ; hand back the result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5446
5447
;;
; Template for the cvtpd2ps/cvtps2pd style conversions.
;
; Emits the SSE worker (iemAImpl_<insn>_u128) and two AVX workers
; (iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256).
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Where to return the result.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
; SSE worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 128-bit AVX worker.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

; 256-bit AVX worker; source and result widths differ, see parameter 2.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; narrowing: 256-bit source, 128-bit result
 %else
        v %+ %1 ymm0, xmm1              ; widening: 128-bit source, 256-bit result
 %endif
        vmovdqu [A1], ymm0              ; always stores 256 bits

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
;IEMIMPL_CVT_F2 cvtps2pd, 1 - inefficient.
5513
;;
; cvtps2pd, SSE 128-bit variant, written out by hand rather than generated
; from IEMIMPL_CVT_F2.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Where to return the 128-bit result.
; @param    A2      Pointer to the source operand; the instruction reads it
;                   straight from memory.
;
BEGINPROC_FASTCALL iemAImpl_cvtps2pd_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtps2pd xmm0, [A2]             ; convert directly from the memory operand
        movdqu  [A1], xmm0              ; hand back the full 128-bit result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtps2pd_u128
5526
5527
5528
;;
; shufps with an 8-bit immediate.
;
; The immediate selects one of 256 stubs following the procedure body; each
; stub executes shufps with that immediate hard coded and returns.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, [A0]              ; destination, also an input
        movzx   A2, A2_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stubs, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; presumably pads the stub to the size passed above
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_shufps_u128
5559
5560
;;
; shufpd with an 8-bit immediate.
;
; The immediate selects one of 256 stubs following the procedure body; each
; stub executes shufpd with that immediate hard coded and returns.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, [A0]              ; destination, also an input
        movzx   A2, A2_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stubs, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_shufpd_u128
5590
5591
;;
; Template for vshufp{s,d} with an 8-bit immediate.
;
; Emits a 128-bit and a 256-bit worker.  The immediate picks one of 256 stubs
; per worker, each executing the instruction with that immediate hard coded.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm1, [A2]              ; second source
        movdqu  xmm0, [A1]              ; first source
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A2]              ; second source
        vmovdqu ymm0, [A1]              ; first source
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5652
5653
;;
; One of the [p]blendv{b,ps,pd} variants
;
; The mask is loaded into xmm0 because these instructions take it from there
; implicitly.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; pairs with IEMIMPL_SSE_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5682
5683
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits a 128-bit and a 256-bit worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; mask, passed as an explicit fourth operand
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]              ; mask, passed as an explicit fourth operand
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; pairs with IEMIMPL_AVX_PROLOGUE above (was a second PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5726
5727
;;
; palignr mm1, mm2/m64 instruction.
;
; The immediate picks one of 256 stubs following the procedure body; each stub
; executes palignr with that immediate hard coded and returns.
;
; @param    A0      Pointer to the first media register sized operand (read as
;                   input, overwritten with the result).
; @param    A1      The second register sized operand, passed by value (input).
; @param    A2      The 8-bit immediate.
;
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, A1                 ; the second operand is the value itself, not a pointer
        movq    mm0, [A0]
        movzx   A2, A2_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stubs, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_palignr_u64
5756
5757
;;
; Template for SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; The immediate picks one of 256 stubs following the procedure body; each stub
; executes the instruction with that immediate hard coded and returns.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, [A0]              ; destination, also an input
        movzx   A2, A2_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 8
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; presumably pads the stub to the size passed above
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5802
5803
;;
; Template for AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; The immediate picks one of 256 stubs per worker, each executing the
; instruction with that immediate hard coded.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param    3       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm1, [A2]              ; second source
        movdqu  xmm0, [A1]              ; first source
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; presumably pads the stub to the size passed above
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A2]              ; second source
        vmovdqu ymm0, [A1]              ; first source
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; presumably pads the stub to the size passed above
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw,   1, 1
5881
5882
;;
; Template for AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, imm8.
; The size of one dispatch stub is not fixed; it is given by parameter 4.
;
; The immediate picks one of 256 stubs per worker, each executing the
; instruction with that immediate hard coded.
;
; NOTE(review): only A0..A2 are used, yet the 4-argument prologue/epilogue is
;               emitted - confirm against the C prototypes.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param    3       Whether the instruction has a 256-bit variant (1) or not (0).
; @param    4       The number of bytes taken up by a single instance of the instruction.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP 4
 %if %2 == 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm1, [A1]              ; the only source operand
        movzx   A2, A2_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        movdqu  [A0], xmm0              ; xmm0 is written by the stub

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]              ; the only source operand
        movzx   A2, A2_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, %4
        vmovdqu [A0], ymm0              ; ymm0 is written by the stub

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilps, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpermilpd, 1, 1, 8
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpslldq,   1, 1, 7
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP vpsrldq,   1, 1, 7
5953
5954
;;
; Source operand bundles handed to the pcmp{i,e}str{i,m} workers.
; Need to move this as well somewhere better?
;
struc IEMPCMPISTRXSRC
    .uSrc1      resq 2                  ; first 128-bit source
    .uSrc2      resq 2                  ; second 128-bit source
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1      resq 2                  ; first 128-bit source
    .uSrc2      resq 2                  ; second 128-bit source
    .u64Rax     resq 1                  ; value loaded into xAX before the instruction
    .u64Rdx     resq 1                  ; value loaded into xDX before the instruction
endstruc
5969
;;
; The pcmpistri instruction.
;
; The immediate picks one of 256 stubs following the procedure body; each stub
; executes pcmpistri with that immediate hard coded and returns.
;
; @return   R0_32   The new ECX value.
; @param    A0      Pointer to the EFLAGS register.
; @param    A1      Pointer to the first operand (input).
; @param    A2      Pointer to the second operand (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        movdqu  xmm1, [A2]
        movdqu  xmm0, [A1]
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8

        ; Capture the flags first, then hand the index in ecx back as the return value.
        IEM_SAVE_FLAGS_OLD T2, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     R0_32, ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stubs, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3                            ; presumably pads the stub to the size passed above
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpistri_u128
6005
;;
; The pcmpestri instruction.
;
; The explicit lengths are loaded into xAX and xDX from the source structure
; just before calling the stub selected by the immediate.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movzx   A3, A3_8                ; only the low byte holds the immediate

        ; Order matters from here on: compute the target, then load xAX/xDX.
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx] ; last use of A2, which may itself be xDX
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; A1 is valid again before it is used below
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        mov     [T2], ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stubs, one per immediate value.
        ; NOTE(review): the Intel SDM requires a REX prefix to immediately precede
        ;               the opcode/escape byte; here 0x48 is emitted in front of the
        ;               instruction, i.e. before its mandatory 0x66 prefix - confirm
        ;               that REX.W actually takes effect.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0x48                         ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpestri_u128
6046
;;
; The pcmpistrm instruction template.
;
; The sources are kept in xmm1/xmm2; xmm0 is what gets stored to A0 afterwards.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8

        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stubs, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3                            ; presumably pads the stub to the size passed above
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpistrm_u128
6080
;;
; The pcmpestrm instruction template.
;
; The explicit lengths are loaded into xAX and xDX from the source structure
; just before calling the stub selected by the immediate.  The sources are
; kept in xmm1/xmm2; xmm0 is what gets stored to A0 afterwards.
;
; @param    A0      Pointer to the XMM0 register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movzx   A3, A3_8                ; only the low byte holds the immediate

        ; Order matters from here on: compute the target, then load xAX/xDX.
        IEMIMPL_JUMP_TABLE_TARGET T1, A3, 8
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx] ; last use of A2, which may itself be xDX
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; A1 is valid again before it is used below
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF, 0, X86_EFL_AF | X86_EFL_PF
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stubs, one per immediate value.
        ; NOTE(review): the Intel SDM requires a REX prefix to immediately precede
        ;               the opcode/escape byte; here 0x48 is emitted in front of the
        ;               instruction, i.e. before its mandatory 0x66 prefix - confirm
        ;               that REX.W actually takes effect.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db 0x48                         ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_pcmpestrm_u128
6120
6121
;;
; movmskp{s,d} instruction template.
;
; Emits the SSE 128-bit worker plus the AVX 128-bit and 256-bit workers.  Each
; one writes a single byte to the location A0 points to.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
; SSE, 128-bit.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     [A0], T0_8              ; byte sized store

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX, 128-bit.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     [A0], T0_8              ; byte sized store

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

; AVX, 256-bit.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     [A0], T0_8              ; byte sized store

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6171
6172
;;
; cvttsd2si instruction - 32-bit variant.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_3_ARGS                 ; three arguments (A0_32, A1, A2), same as the other cvt*2si workers
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvttsd2si T0_32, [A2]
        mov     dword [A1], T0_32

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
6193
;;
; cvttsd2si worker producing a 64-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 64 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvttsd2si T0, [A2]              ; convert straight from memory
        mov     [A1], T0                ; store size follows the 64-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
6214
6215
;;
; cvtsd2si worker producing a 32-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 32 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsd2si T0_32, [A2]            ; convert straight from memory
        mov     [A1], T0_32             ; store size follows the 32-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
6236
;;
; cvtsd2si worker producing a 64-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 64 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsd2si T0, [A2]               ; convert straight from memory
        mov     [A1], T0                ; store size follows the 64-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
6257
6258
;;
; cvttss2si worker producing a 32-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 32 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvttss2si T0_32, [A2]           ; convert straight from memory
        mov     [A1], T0_32             ; store size follows the 32-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
6279
;;
; cvttss2si worker producing a 64-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 64 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvttss2si T0, [A2]              ; convert straight from memory
        mov     [A1], T0                ; store size follows the 64-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
6300
6301
;;
; cvtss2si worker producing a 32-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 32 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtss2si T0_32, [A2]            ; convert straight from memory
        mov     [A1], T0_32             ; store size follows the 32-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
6322
;;
; cvtss2si worker producing a 64-bit integer result.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 64 bits written).
; @param    A2      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtss2si T0, [A2]               ; convert straight from memory
        mov     [A1], T0                ; store size follows the 64-bit register

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6343
6344
;;
; cvtsi2ss worker taking a 32-bit integer source.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 32 bits written).
; @param    A2      Pointer to the second operand (input, read as a dword).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2ss xmm0, dword [A2]       ; explicit size selects the 32-bit source form
        movd    [A1], xmm0              ; only the low dword is the result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
6365
;;
; cvtsi2ss worker taking a 64-bit integer source.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 32 bits written).
; @param    A2      Pointer to the second operand (input, read as a qword).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2ss xmm0, qword [A2]       ; explicit size selects the 64-bit source form
        movd    [A1], xmm0              ; only the low dword is the result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6386
6387
;;
; cvtsi2sd worker taking a 32-bit integer source.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 64 bits written).
; @param    A2      Pointer to the second operand (input, read as a dword).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2sd xmm0, dword [A2]       ; explicit size selects the 32-bit source form
        movq    [A1], xmm0              ; only the low qword is the result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6408
;;
; cvtsi2sd worker taking a 64-bit integer source.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use.
; @param    A1      Pointer to the result operand (output, 64 bits written).
; @param    A2      Pointer to the second operand (input, read as a qword).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        cvtsi2sd xmm0, qword [A2]       ; explicit size selects the 64-bit source form
        movq    [A1], xmm0              ; only the low qword is the result

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6429
6430
;;
; UCOMISS (SSE)
;
; Both sources arrive by value in 32-bit argument registers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm1, A3_32
        movd    xmm0, A2_32
        ucomiss xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6454
;;
; VUCOMISS (AVX)
;
; NOTE(review): the SSE prologue/epilogue macros are used although the compare
;               is VEX encoded - confirm that this is intended.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm0, A2_32
        movd    xmm1, A3_32
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS                 ; must pair with PROLOGUE_4_ARGS (was EPILOGUE_3_ARGS)
ENDPROC iemAImpl_vucomiss_u128
6469
6470
;;
; UCOMISD (SSE)
;
; Both sources arrive by value in argument registers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3
        movq    xmm0, A2
        ucomisd xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6494
;;
; VUCOMISD (AVX)
;
; NOTE(review): the SSE prologue/epilogue macros are used although the compare
;               is VEX encoded - confirm that this is intended.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3
        movq    xmm0, A2
        vucomisd xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6509
;;
; COMISS (SSE)
;
; Both sources arrive by value in 32-bit argument registers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm1, A3_32
        movd    xmm0, A2_32
        comiss  xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6533
;;
; VCOMISS (AVX)
;
; NOTE(review): the SSE prologue/epilogue macros are used although the compare
;               is VEX encoded - confirm that this is intended.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2_32   The first source operand.
; @param    A3_32   The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movd    xmm1, A3_32
        movd    xmm0, A2_32
        vcomiss xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6548
6549
;;
; COMISD (SSE)
;
; Both sources arrive by value in argument registers.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3
        movq    xmm0, A2
        comisd  xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6573
;;
; VCOMISD (AVX)
;
; NOTE(review): the SSE prologue/epilogue macros are used although the compare
;               is VEX encoded - confirm that this is intended.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      The first source operand.
; @param    A3      The second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movq    xmm1, A3
        movq    xmm0, A2
        vcomisd xmm0, xmm1
        ; Capture the flags right after the compare.
        IEM_SAVE_FLAGS_OLD A1, X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6588
6589
;;
; Pair of 128-bit source operands handed to the cmpps style workers.
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1      resq 2                  ; first 128-bit source
    .uSrc2      resq 2                  ; second 128-bit source
endstruc
6597
6598
;;
; CMPPS (SSE)
;
; The immediate picks one of 256 stubs following the procedure body; each stub
; executes cmpps with that immediate hard coded and returns.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 5
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stubs, one per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_cmpps_u128
6632
;;
; Template for SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate picks one of 256 stubs following the procedure body; each stub
; executes the instruction with that immediate hard coded and returns.
;
; @param    1       The instruction name.
;
; @return   R0_32   The new MXCSR value of the guest.
; @param    A0_32   The guest's MXCSR register value to use (input).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_AVX_LD_MXCSR A0_32

        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movzx   A3, A3_8                ; only the low byte holds the immediate
        IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 6 ; presumably the 5-byte instruction plus the ret
        movdqu  [A1], xmm0

        SSE_AVX_ST_MXCSR R0_32, A0_32
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6677
6678;;
6679; SSE instructions with 8-bit immediates of the form
6680; xxx xmm1, xmm2, imm8.
6681; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6682; register.
6683;
; Generates iemAImpl_<insn>_u128 containing one '<insn> xmm0, xmm1, imm8' + 'ret' +
; 'int3' stub per immediate value (.imm0 thru .imm255); the stub selected by A3 is
; called.
;
6684; @param 1 The instruction name.
6685;
6686; @return R0_32 The new MXCSR value of the guest.
6687; @param A0_32 The guest's MXCSR register value to use (input).
6688; @param A1 Pointer to the first media register size operand (output).
6689; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6690; @param A3 The 8-bit immediate (input).
6691;
6692%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6693BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6694 PROLOGUE_4_ARGS
6695 IEMIMPL_SSE_PROLOGUE
6696 SSE_AVX_LD_MXCSR A0_32
6697
6698 movzx A3, A3_8 ; must clear top bits (A3 is used as the stub index below)
6699 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1] ; xmm0 = first source; also receives the result.
6700 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2] ; xmm1 = second source.
6701 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A3, 8 ; presumably calls stub number A3 with T1 as scratch; 8 = 6 byte instruction + ret + int3 - confirm against the macro.
6702 movdqu [A1], xmm0 ; Store the result.
6703
6704 SSE_AVX_ST_MXCSR R0_32, A0_32
6705 IEMIMPL_SSE_EPILOGUE
6706 EPILOGUE_4_ARGS
; Stub table, only reached via the call above.
6707 %assign bImm 0
6708 %rep 256
6709.imm %+ bImm:
6710 IBT_ENDBRxx_WITHOUT_NOTRACK ; presumably an indirect branch landing pad in IBT builds - confirm against the macro.
6711 %1 xmm0, xmm1, bImm
6712 ret
6713 int3 ; Never executed (follows the ret); presumably pads the stub to the 8 byte stride.
6714 %assign bImm bImm + 1
6715 %endrep
6716.immEnd:
6717ENDPROC iemAImpl_ %+ %1 %+ _u128
6718%endmacro
6719
; Instantiations: iemAImpl_{roundps,roundpd,roundss,roundsd,dpps,dppd}_u128.
6720IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6721IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6722IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6723IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6724IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6725IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6724IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6725IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6726
6727
6728;;
6729; SSE instructions of the form
6730; xxx mm, xmm.
6731; and we need to load and save the MXCSR register.
6732;
; Generates iemAImpl_<insn>_u128, which reads 128 bits from [A2], executes the
; instruction into mm0 and writes the 64-bit result to [A1].
;
6733; @param 1 The instruction name.
6734;
6735; @return R0_32 The new MXCSR value of the guest.
6736; @param A0_32 The guest's MXCSR register value to use (input).
6737; @param A1 Pointer to the first MMX register sized operand (output).
6738; @param A2 Pointer to the media register sized operand (input).
6739;
6740%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6741BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6742 PROLOGUE_3_ARGS
6743 IEMIMPL_SSE_PROLOGUE
6744 SSE_AVX_LD_MXCSR A0_32
6745
6746 movdqu xmm0, [A2] ; xmm0 = 128-bit source operand.
6747 %1 mm0, xmm0 ; NOTE(review): writes an MMX register; whether any x87/MMX state handling is needed is up to the prologue/epilogue macros - confirm.
6748 movq [A1], mm0 ; Store the 64-bit result.
6749
6750 SSE_AVX_ST_MXCSR R0_32, A0_32
6751 IEMIMPL_SSE_EPILOGUE
6752 EPILOGUE_3_ARGS
6753ENDPROC iemAImpl_ %+ %1 %+ _u128
6754%endmacro
6755
; Instantiations: iemAImpl_cvtpd2pi_u128 and iemAImpl_cvttpd2pi_u128.
6756IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6757IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6758
6759;;
6760; SSE instructions of the form
6761; xxx xmm, mm/m64.
6762; and we need to load and save the MXCSR register.
6763;
; Generates iemAImpl_<insn>_u128, which loads the current destination value from
; [A1], executes the instruction with the 64-bit source in mm0 and writes the
; 128-bit result back to [A1].
;
6764; @param 1 The instruction name.
6765;
6766; @return R0_32 The new MXCSR value of the guest.
6767; @param A0_32 The guest's MXCSR register value to use (input).
6768; @param A1 Pointer to the first media register sized operand (input/output).
6769; @param A2 The 64bit source value from a MMX media register (input)
6770;
6771%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6772BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6773 PROLOGUE_3_ARGS
6774 IEMIMPL_SSE_PROLOGUE
6775 SSE_AVX_LD_MXCSR A0_32
6776
6777 movdqu xmm0, [A1] ; Load the destination first, so bits the instruction leaves alone are written back unchanged.
6778 movq mm0, A2 ; mm0 = 64-bit source value (passed by value in a register).
6779 %1 xmm0, mm0
6780 movdqu [A1], xmm0 ; Write the full 128-bit destination back.
6781
6782 SSE_AVX_ST_MXCSR R0_32, A0_32
6783 IEMIMPL_SSE_EPILOGUE
6784 EPILOGUE_3_ARGS
6785ENDPROC iemAImpl_ %+ %1 %+ _u128
6786%endmacro
6787
; Instantiations: iemAImpl_cvtpi2ps_u128 and iemAImpl_cvtpi2pd_u128.
6788IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6789IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6790
6791;;
6792; SSE instructions of the form
6793; xxx mm, xmm/m64.
6794; and we need to load and save the MXCSR register.
6795;
; Generates iemAImpl_<insn>_u128, which places the 64-bit source value in the low
; half of xmm0, executes the instruction into mm0 and writes the 64-bit result
; to [A1].
;
6796; @param 1 The instruction name.
6797;
6798; @return R0_32 The new MXCSR value of the guest.
6799; @param A0_32 The guest's MXCSR register value to use (input).
6800; @param A1 Pointer to the first MMX media register sized operand (output).
6801; @param A2 The 64bit source value (input).
6802;
6803%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6804BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6805 PROLOGUE_3_ARGS
6806 IEMIMPL_SSE_PROLOGUE
6807 SSE_AVX_LD_MXCSR A0_32
6808
6809 movq xmm0, A2 ; xmm0[63:0] = source value (passed by value in a register).
6810 %1 mm0, xmm0
6811 movq [A1], mm0 ; Store the 64-bit result.
6812
6813 SSE_AVX_ST_MXCSR R0_32, A0_32
6814 IEMIMPL_SSE_EPILOGUE
6815 EPILOGUE_3_ARGS
6816ENDPROC iemAImpl_ %+ %1 %+ _u128
6817%endmacro
6818
; Instantiations: iemAImpl_cvtps2pi_u128 and iemAImpl_cvttps2pi_u128.
6819IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6820IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6821
6822;;
6823; All forms of RDRAND and RDSEED
6824;
; Generates iemAImpl_<insn>_u<bits>, which executes the instruction on the given
; accumulator register, stores that register to [A0] and records the flags.
;
; @param 1 The instruction name (rdrand or rdseed).
; @param 2 The register the instruction writes (ax, eax or rax).
; @param 3 The operand width in bits; only used for the function name suffix.
;
6825; @param A0 Pointer to the destination operand.
6826; @param A1 Pointer to the EFLAGS value (input/output).
6827;
6828%macro IEMIMPL_RDRAND_RDSEED 3
6829BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6830 PROLOGUE_2_ARGS
6831
6832 %1 %2
6833 mov [A0], %2 ; Stored unconditionally; mov leaves the flags from the instruction above intact.
6834 IEM_SAVE_FLAGS_OLD A1, X86_EFL_CF, 0, X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF ; presumably: CF is taken over, the other five status flags are cleared - confirm against the macro.
6835
6836 EPILOGUE_2_ARGS
6837ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6838%endmacro
6839
; Instantiations: iemAImpl_rdrand_u16/u32/u64 and iemAImpl_rdseed_u16/u32/u64.
6840IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6841IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6842IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6843IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6844IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6845IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6846
6847
6848;;
6849; sha1rnds4 xmm1, xmm2, imm8.
6850;
6851; One 'sha1rnds4 xmm0, xmm1, imm8' + 'ret' stub exists per immediate value (.imm0 thru .imm255); the one selected by A2 is called.
6852; MXCSR is neither loaded nor saved here.
6853; @param A0 Pointer to the first media register size operand (input/output).
6854; @param A1 Pointer to the second source media register size operand (input).
6855; @param A2 The 8-bit immediate
6856;
6857BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6858 PROLOGUE_3_ARGS
6859 IEMIMPL_SSE_PROLOGUE
6860
6861 movzx A2, A2_8 ; must clear top bits (A2 is used as the stub index below)
6862 movdqu xmm0, [A0] ; xmm0 = first operand; also receives the result.
6863 movdqu xmm1, [A1] ; xmm1 = second operand.
6864 IEMIMPL_CALL_JUMP_TABLE_TARGET T1, A2, 6 ; presumably calls stub number A2 with T1 as scratch; 6 = 5 byte sha1rnds4 encoding + 1 byte ret - confirm against the macro.
6865 movdqu [A0], xmm0 ; Write the result back to the first operand.
6866
6867 IEMIMPL_SSE_EPILOGUE
6868 EPILOGUE_3_ARGS
; Stub table, only reached via the call above.
6869 %assign bImm 0
6870 %rep 256
6871.imm %+ bImm:
6872 IBT_ENDBRxx_WITHOUT_NOTRACK ; presumably an indirect branch landing pad in IBT builds - confirm against the macro.
6873 sha1rnds4 xmm0, xmm1, bImm
6874 ret
6875 %assign bImm bImm + 1
6876 %endrep
6877.immEnd:
6878ENDPROC iemAImpl_sha1rnds4_u128
6879
6880
6881;;
6882; sha256rnds2 xmm1, xmm2, <XMM0>.
6883;
6884; The instruction takes XMM0 as an implicit third operand, so xmm0 is loaded from [A2] and the two explicit operands use xmm1 and xmm2.
6885;
6886; @param A0 Pointer to the first media register size operand (input/output).
6887; @param A1 Pointer to the second source media register size operand (input).
6888; @param A2 Pointer to the implicit XMM0 constants (input).
6889;
6890BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6891 PROLOGUE_3_ARGS
6892 IEMIMPL_SSE_PROLOGUE
6893
6894 movdqu xmm0, [A2] ; Implicit XMM0 operand.
6895 movdqu xmm1, [A0] ; First operand; also receives the result.
6896 movdqu xmm2, [A1] ; Second operand.
6897 sha256rnds2 xmm1, xmm2
6898 movdqu [A0], xmm1 ; Write the result back to the first operand.
6899
6900 IEMIMPL_SSE_EPILOGUE
6901 EPILOGUE_3_ARGS
6902ENDPROC iemAImpl_sha256rnds2_u128
6903
6904
6905;;
6906; 32-bit forms of ADCX and ADOX
6907;
; Generates iemAImpl_<insn>_u32, which adds the memory operand at [A1] to A2_32
; using the instruction and stores the sum back to [A1].
;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit passed to the flag load/save macros (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
6908; @returns Updated EFLAGS.
6909; @param A0 Incoming EFLAGS value (input).
6910; @param A1 Pointer to the destination operand (input/output).
6911; @param A2 32-bit source operand 1 (input).
6912;
6913%macro IEMIMPL_ADX_32 2
6914BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6915 PROLOGUE_4_ARGS ; NOTE(review): three arguments are documented above but the 4-argument prologue/epilogue is used - confirm.
6916
6917 IEM_LOAD_FLAGS A0_32, %2, 0 ; presumably puts flag %2 from the incoming EFLAGS into the host flags - confirm against the macro.
6918 %1 A2_32, [A1] ; A2_32 = A2_32 + [A1] + flag
6919 mov [A1], A2_32 ; Store the sum; mov does not alter the flags.
6920 IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0 ; presumably merges the resulting flag %2 into the EFLAGS return value - confirm against the macro.
6921
6922 EPILOGUE_4_ARGS
6923ENDPROC iemAImpl_ %+ %1 %+ _u32
6924%endmacro
6925
6926;;
6927; 64-bit forms of ADCX and ADOX
6928;
; Generates iemAImpl_<insn>_u64, which adds the memory operand at [A1] to A2
; using the instruction and stores the sum back to [A1].
;
; @param 1 The instruction name (adcx or adox).
; @param 2 The EFLAGS bit passed to the flag load/save macros (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
6929; @returns Updated EFLAGS.
6930; @param A0 Incoming EFLAGS value (input).
6931; @param A1 Pointer to the destination operand (input/output).
6932; @param A2 64-bit source operand 1 (input).
6933;
6934%macro IEMIMPL_ADX_64 2
6935BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6936 PROLOGUE_4_ARGS ; NOTE(review): three arguments are documented above but the 4-argument prologue/epilogue is used - confirm.
6937
6938 IEM_LOAD_FLAGS A0_32, %2, 0 ; presumably puts flag %2 from the incoming EFLAGS into the host flags - confirm against the macro.
6939 %1 A2, [A1] ; A2 = A2 + [A1] + flag
6940 mov [A1], A2 ; Store the sum; mov does not alter the flags.
6941 IEM_SAVE_FLAGS_RETVAL A0_32, %2, 0, 0 ; presumably merges the resulting flag %2 into the EFLAGS return value - confirm against the macro.
6942
6943 EPILOGUE_4_ARGS
6944ENDPROC iemAImpl_ %+ %1 %+ _u64
6945%endmacro
6946
; Instantiations: adcx works on CF, adox on OF; each gets a 32-bit and a 64-bit worker.
6947IEMIMPL_ADX_32 adcx, X86_EFL_CF
6948IEMIMPL_ADX_64 adcx, X86_EFL_CF
6949
6950IEMIMPL_ADX_32 adox, X86_EFL_OF
6951IEMIMPL_ADX_64 adox, X86_EFL_OF
6952
6952
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette