VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 102882

Last change on this file since 102882 was 102882, checked in by vboxsync, 10 months ago

VMM/IEM: Skip some expensive flag loading in IEMAllAImpl.asm where possible. bugref:10371

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 192.9 KB
Line 
1; $Id: IEMAllAImpl.asm 102882 2024-01-16 00:37:36Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; Return from a fastcall function.
;
; Emits 'ret %1' on 32-bit x86 Windows only; every other configuration gets
; a plain 'ret' and the argument is ignored.
;
; @param        1       The byte count to pass to RET on 32-bit x86 Windows.
;
%macro RET_FASTCALL 1
 %ifdef RT_ARCH_X86
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %else
        ret
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Expands to NAME(a_Name) in every configuration except 32-bit x86 Windows,
; where the symbol is pasted together from a_Prefix, a_Name, '@' and a_cbArgs.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Declares the NAME_FASTCALL symbol through GLOBALNAME_RAW as a hidden
; function and follows it with IBT_ENDBRxx.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW  NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_n_ARGS / EPILOGUE_n_ARGS[_EX] bracket a function taking n arguments.
; A0..A3 name the argument registers and T0..T2 the temporary registers, each
; with _32, _16 and (where the register has one) _8 sub-register aliases.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: The prologues are empty and every epilogue is a plain RET.  The byte
 ; count given to the _EX variants is accepted and ignored.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; The byte count is optional here: it lets this macro be invoked like the
 ; 32-bit variant and the other EPILOGUE_n_ARGS_EX macros, while invocations
 ; without an argument keep assembling.
 %macro EPILOGUE_1_ARGS_EX 0-1
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 ; Argument registers when ASM_CALL64_GCC is defined.
 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 ; Argument registers when ASM_CALL64_MSC is defined.
 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Temporary registers (same for both 64-bit conventions).
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10         ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ;
 ; x86: A0 and A1 are ECX and EDX.  The 3 and 4 argument prologues load A2
 ; (EBX) and A3 (ESI) from the stack after saving those registers, and every
 ; prologue saves EDI, which serves as T1.  The epilogues restore the saved
 ; registers and return with 'ret <bytes>'.
 ;
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]              ; skip the saved ebx and the return address
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0]         ; 12 bytes of saved registers + the return address
        mov     esi, [esp + 12 + 4 + 4]
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 ; No A3_8 is defined for x86.
 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 ; No T1_8 and no T2 are defined for x86.
 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
273
274
;;
; Load the relevant flags from [%1], but only when there are undefined flags
; (%3) or when loading is forced (%4).
;
; Expands to nothing when both %3 and %4 are zero; otherwise the expansion is
; that of IEM_LOAD_FLAGS with the first three parameters.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
; @param        4       Force loading the flags.  Optional, defaults to 1.
;
%macro IEM_MAYBE_LOAD_FLAGS 3-4 1
 %if (%3 + %4) != 0
        IEM_LOAD_FLAGS %1, %2, %3
 %endif
%endmacro
295
;;
; Load the relevant flags from [%1].
;
; The current CPU flags are pushed, the bits in (%2 | %3) are replaced by the
; corresponding bits of the dword at [%1], and the result is popped back into
; EFLAGS.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of flags to load.
; @param        3       The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                                   ; current flags -> [xSP]
        mov     T0_32, [%1]                     ; T0 = guest flags
        and     dword [xSP], ~(%2 | %3)         ; drop the bits to be replaced from the pushed copy
        and     T0_32, (%2 | %3)                ; keep only those bits of the guest value
        or      [xSP], T0                       ; merge the two
        popf                                    ; and make the mix the current flags
%endmacro
313
;;
; Update the flags.
;
; Replaces the bits in (%2 | %3) of the dword at [%1] with the current CPU
; flag values.  Expands to nothing when (%2 | %3) is zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; T0 = stored flags (MOV does not touch the CPU flags)
        pushf
        pop     T1                              ; T1 = CPU flags
        and     T1_32, (%2 | %3)                ; the bits to take from the CPU
        and     T0_32, ~(%2 | %3)               ; the bits to keep from the stored value
        or      T0_32, T1_32
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
333
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; The bits in %2 are copied from the CPU flags into the dword at [%1], the
; bits in %3 are cleared and finally the bits in %4 are set.  Expands to
; nothing when all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        mov     T0_32, [%1]                     ; T0 = stored flags (MOV does not touch the CPU flags)
        pushf
        pop     T1                              ; T1 = CPU flags
        and     T1_32, (%2)                     ; the bits to take from the CPU
        and     T0_32, ~(%2 | %3)               ; drop the replaced and the always-cleared bits
        or      T0_32, T1_32
  %if (%4) != 0
        or      T0_32, %4                       ; the always-set bits
  %endif
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
357
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; SF is taken from bit (%5 - 1) of %4 and PF from the g_afParity entry indexed
; by the low byte of %6.
;
; @remarks      Clobbers T0, T1, stack, %6, EFLAGS.  On AMD64 T2 is used as well.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                              ; T2 = CPU flags
 %else
        push    T0                              ; no T2 here: park T0 and use it for the CPU flags
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]                     ; T1 = stored flags
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; drop modified, always-cleared and the two calculated flags
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                     ; the bits to take from the CPU
        or      T1_32, T2_32
 %else
        and     T0_32, (%2)                     ; the bits to take from the CPU
        or      T1_32, T0_32
        pop     T0                              ; T0 back to what it was
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1
        jnc     %%sf_done
        or      T1_32, X86_EFL_SF
 %%sf_done:

        ; Parity last (this reduces %6 to its low byte).
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                     ; write back
%endmacro
411
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Only the dword at [%1] is read and rewritten; no CPU flags are captured.
; Expands to nothing when both masks are zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; T0 = stored flags
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; the always-cleared bits go first ...
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; ... then the always-set ones
  %endif
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
432
;;
; Calculates the new EFLAGS using fixed clear and set bit masks, with PF taken
; from the g_afParity entry indexed by the low byte of %4.
;
; @remarks      Clobbers T0, %4, EFLAGS.  On AMD64 T2 is used as well.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; T0 = stored flags
        and     T0_32, ~(%2 | X86_EFL_PF)       ; PF and the always-cleared bits go
 %if (%3) != 0
        or      T0_32, %3                       ; the always-set bits
 %endif
        and     %4, 0xff                        ; table index = low byte
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; write back
%endmacro
457
458
;;
; Checks that the size expression %1 matches %2 adjusted according to
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
;
; Two 16-bit data words are emitted.  With that define present, 256*4 bytes
; are added to the expected size %2.
;
; @param        1       The jump array size assembly expression.
; @param        2       The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
;
%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        dw      (0xffff - %2 - 256*4) + %1      ; will cause warning if entries are too big.
        dw      (0xffff + %2 + 256*4) - %1      ; will cause warning if entries are too small.
 %else
        dw      (0xffff - %2) + %1              ; will cause warning if entries are too big.
        dw      (0xffff + %2) - %1              ; will cause warning if entries are too small.
 %endif
%endmacro
474
475
476;*********************************************************************************************************************************
477;* External Symbols *
478;*********************************************************************************************************************************
479extern NAME(g_afParity)
480
481
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; Every generated function has the same shape: (maybe) load the guest flags,
; run the instruction on [A0] with A1 as source, save the resulting flags.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
; @param        5       Force flag loading (ADC, SBB).
;
%macro IEMIMPL_BIN_OP 5
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC             iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?
;
; LOCK prefixed variants.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4, %5
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC             iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr, lock, modified-flags,                                                                 undefined-flags, force loading flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          1
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          1
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF, 0
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF, 0
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF, 0
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,          0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF, 0
582
583
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; The result is produced in T0 and then stored to [A0].
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                 instr, modified-flags,                                          undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),    (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF),                 (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF),    (X86_EFL_AF | X86_EFL_PF)
624
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; These are three argument functions, so the three argument prologue and
; epilogue are used.  On 32-bit x86 the four argument pair would read a stack
; slot that was never passed and return with 'ret 8' although only A2 (4 bytes)
; lives on the stack.  On AMD64 both pairs expand to the same code.
;
; The instruction only writes its first operand, so T0 is not preloaded from
; [A0] before executing it.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,  modified-flags,                                       undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
666
667
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  No eflags pointer is taken.
;
; The fallback variants use the legacy instruction %2, which takes its count
; in CL:
;   - ASM_CALL64_GCC: A0 is not xCX, so the count is simply copied to CL.
;   - Otherwise:      A0 is xCX, so A0 and A2 are exchanged; the count ends up
;                     in xCX and the destination pointer in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                        ; store the full 64-bit result
   %else
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer
        %2      A1, cl
        mov     [A2], A1                        ; A0 holds the count now, so the store must go via A2
   %endif
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                       instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
739
740
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 points to the destination, A1 is the value to rotate and A2 the count.
; ROR takes the count in CL, hence the per-convention shuffling below.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                        ; count -> CL
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC            iemAImpl_rorx_u32
758
%ifdef RT_ARCH_AMD64
;
; 64-bit RORX fallback: A0 points to the destination, A1 is the value to
; rotate and A2 the count, which ROR wants in CL.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                        ; count -> CL
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC            iemAImpl_rorx_u64
%endif ; RT_ARCH_AMD64
774
775
;
; MULX
;
; A0 and A1 point to the two destinations (high and low part respectively),
; A2 is the first source, which MULX takes implicitly from EDX, and A3 the
; second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32             ; T0 = high, T1 = low
        mov     [A1], T1_32                     ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2                          ; A2 now holds the low destination pointer
        mulx    T0_32, T1_32, A3_32             ; T0 = high, T1 = low
        mov     [A2], T1_32                     ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC            iemAImpl_mulx_u32
795
796
;
; MULX fallback using the legacy MUL instruction (EDX:EAX = EAX * operand).
;
; A0 and A1 point to the two destinations (high and low part respectively),
; A2 is the first source and A3 the second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2                          ; A1 (EDX) = uSrc1, A2 = low destination pointer
        mov     eax, A3_32
        mul     A1_32                           ; multiply by uSrc1; A2 holds a pointer after the exchange
        mov     [A2], eax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC            iemAImpl_mulx_u32_fallback
815
%ifdef RT_ARCH_AMD64
;
; 64-bit MULX.  A0 and A1 point to the two destinations (high and low part
; respectively), A2 is the first source, which MULX takes implicitly from RDX,
; and A3 the second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3                      ; T0 = high, T1 = low
        mov     [A1], T1                        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2                          ; A2 now holds the low destination pointer
        mulx    T0, T1, A3                      ; T0 = high, T1 = low
        mov     [A2], T1                        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC            iemAImpl_mulx_u64


;
; 64-bit MULX fallback using the legacy MUL instruction (RDX:RAX = RAX * operand).
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2                          ; A1 (RDX) = uSrc1, A2 = low destination pointer
        mov     rax, A3
        mul     A1                              ; multiply by uSrc1; A2 holds a pointer after the exchange
        mov     [A2], rax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC            iemAImpl_mulx_u64_fallback

%endif
855
856
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; Every generated function loads the guest flags (IEM_MAYBE_LOAD_FLAGS is used
; with its default of forced loading), runs the instruction on [A0] and saves
; the resulting flags.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC             iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?
;
; LOCK prefixed variants.
;
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL  iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC             iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
934
935;;
936; Macro for implementing a bit search operator.
937;
938; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
939; system where the 64-bit accesses requires hand coding.
940;
941; All the functions takes a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; In the ZF case the destination register is 'undefined', however it seems that
945; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
946; AMD and Intel and accoridng to https://www.sandpile.org/x86/flags.htm between
947; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
948; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
949;
950; @param 1 The instruction mnemonic.
951; @param 2 The modified flags.
952; @param 3 The undefined flags.
953; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
954;
955%macro IEMIMPL_BIT_OP2 4
956BEGINCODE
957BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
958 PROLOGUE_3_ARGS
959 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
960 %1 T0_16, A1_16
961%if %4 != 0
962 jz .unchanged_dst
963%endif
964 mov [A0], T0_16
965.unchanged_dst:
966 IEM_SAVE_FLAGS A2, %2, %3
967 EPILOGUE_3_ARGS
968ENDPROC iemAImpl_ %+ %1 %+ _u16
969
970BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
971 PROLOGUE_3_ARGS
972 %1 T1_16, A1_16
973%if %4 != 0
974 jz .unchanged_dst
975%endif
976 mov [A0], T1_16
977 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
978 EPILOGUE_3_ARGS
979.unchanged_dst:
980 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
981 EPILOGUE_3_ARGS
982ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
983
984BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
985 PROLOGUE_3_ARGS
986 %1 T0_16, A1_16
987%if %4 != 0
988 jz .unchanged_dst
989%endif
990 mov [A0], T0_16
991.unchanged_dst:
992 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
993 EPILOGUE_3_ARGS
994ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
995
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1000 %1 T0_32, A1_32
1001%if %4 != 0
1002 jz .unchanged_dst
1003%endif
1004 mov [A0], T0_32
1005.unchanged_dst:
1006 IEM_SAVE_FLAGS A2, %2, %3
1007 EPILOGUE_3_ARGS
1008ENDPROC iemAImpl_ %+ %1 %+ _u32
1009
1010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1011 PROLOGUE_3_ARGS
1012 %1 T1_32, A1_32
1013%if %4 != 0
1014 jz .unchanged_dst
1015%endif
1016 mov [A0], T1_32
1017 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1018 EPILOGUE_3_ARGS
1019.unchanged_dst:
1020 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1021 EPILOGUE_3_ARGS
1022ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1023
1024BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1025 PROLOGUE_3_ARGS
1026 %1 T0_32, A1_32
1027%if %4 != 0
1028 jz .unchanged_dst
1029%endif
1030 mov [A0], T0_32
1031.unchanged_dst:
1032 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1033 EPILOGUE_3_ARGS
1034ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1035
1036
1037 %ifdef RT_ARCH_AMD64
1038
1039BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1040 PROLOGUE_3_ARGS
1041 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1042 %1 T0, A1
1043%if %4 != 0
1044 jz .unchanged_dst
1045%endif
1046 mov [A0], T0
1047.unchanged_dst:
1048 IEM_SAVE_FLAGS A2, %2, %3
1049 EPILOGUE_3_ARGS_EX 8
1050ENDPROC iemAImpl_ %+ %1 %+ _u64
1051
1052BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1053 PROLOGUE_3_ARGS
1054 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1055 %1 T1, A1
1056%if %4 != 0
1057 jz .unchanged_dst
1058%endif
1059 mov [A0], T1
1060 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1061 EPILOGUE_3_ARGS
1062.unchanged_dst:
1063 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1064 EPILOGUE_3_ARGS
1065ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1066
; AMD EFLAGS flavour, 64-bit operand: A0 = destination pointer, A1 = source
; value, A2 = EFLAGS pointer.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .skip_store             ; macro param 4 set: no store when the instruction left ZF=1
%endif
        mov     [A0], T0
.skip_store:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1078
1079 %endif ; RT_ARCH_AMD64
1080%endmacro
1081
; Instantiations: mnemonic, modified flags, undefined flags, and whether the
; destination store is skipped when the instruction leaves ZF set.
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1086
1087
;;
; Macro for implementing POPCNT.
;
; Generates the 16-bit, 32-bit and 64-bit workers.  The 64-bit worker is only
; emitted when RT_ARCH_AMD64 is defined; on 32-bit systems the 64-bit access
; requires hand coding.
;
; All the functions take a pointer to the destination operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_16, A1_16
        mov     [A0], T0_16             ; result is always written back
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0_32, A1_32
        mov     [A0], T0_32             ; result is always written back
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
        %1      T0, A1
        mov     [A0], T0                ; result is always written back
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1137
1138
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Each worker multiplies A1 by the value at [A0], writes the product back to
; [A0] and updates the EFLAGS value that A2 points to.
;
; @param        1       EFLAGS that are modified.
; @param        2       Undefined EFLAGS.
; @param        3       Function suffix.
; @param        4       EFLAGS variation: 1 (intel) stores the flags through
;                       IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF; any other value
;                       (0 native, 2 AMD) stores them through IEM_SAVE_FLAGS.
;
BEGINCODE
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS                       A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS                       A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS                       A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1194
1195
1196;
1197; XCHG for memory operands. This implies locking. No flag changes.
1198;
1199; Each function takes two arguments, first the pointer to the memory,
1200; then the pointer to the register. They all return void.
1201;
1202BEGINCODE
; Swaps the byte at [A0] (memory operand) with the byte at [A1] (register
; value) using a single xchg against memory.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        xchg    [A0], T0_8              ; memory <- register value, T0 <- old memory value
        mov     [A1], T0_8              ; register value <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1210
; Swaps the word at [A0] (memory operand) with the word at [A1] (register
; value) using a single xchg against memory.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = register value
        xchg    [A0], T0_16             ; memory <- register value, T0 <- old memory value
        mov     [A1], T0_16             ; register value <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1218
; Swaps the dword at [A0] (memory operand) with the dword at [A1] (register
; value) using a single xchg against memory.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = register value
        xchg    [A0], T0_32             ; memory <- register value, T0 <- old memory value
        mov     [A1], T0_32             ; register value <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1226
1227%ifdef RT_ARCH_AMD64
; Swaps the qword at [A0] (memory operand) with the qword at [A1] (register
; value) using a single xchg against memory.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = register value
        xchg    [A0], T0                ; memory <- register value, T0 <- old memory value
        mov     [A1], T0                ; register value <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
1235%endif
1236
1237; Unlocked variants for fDisregardLock mode.
1238
; Swaps the byte at [A0] with the byte at [A1] using plain loads and stores
; (no xchg against memory).
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        mov     T1_8, [A0]              ; T1 = memory value
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1247
; Swaps the word at [A0] with the word at [A1] using plain loads and stores
; (no xchg against memory).
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = register value
        mov     T1_16, [A0]             ; T1 = memory value
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1256
; Swaps the dword at [A0] with the dword at [A1] using plain loads and stores
; (no xchg against memory).
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = register value
        mov     T1_32, [A0]             ; T1 = memory value
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1265
1266%ifdef RT_ARCH_AMD64
; Swaps the qword at [A0] with the qword at [A1] using plain loads and stores
; (no xchg against memory).
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = register value
        mov     T1, [A0]                ; T1 = memory value
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
1275%endif
1276
1277
1278;
1279; XADD for memory operands.
1280;
1281; Each function takes three arguments, first the pointer to the
1282; memory/register, then the pointer to the register, and finally a pointer to
1283; eflags. They all return void.
1284;
1285BEGINCODE
; 8-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]              ; T0 = addend
        xadd    [A0], T0_8              ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0_8              ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1295
; 16-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to
; the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]             ; T0 = addend
        xadd    [A0], T0_16             ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0_16             ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1305
; 32-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to
; the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]             ; T0 = addend
        xadd    [A0], T0_32             ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0_32             ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1315
1316%ifdef RT_ARCH_AMD64
; 64-bit XADD: A0 = pointer to the memory/register operand, A1 = pointer to
; the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]                ; T0 = addend
        xadd    [A0], T0                ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0                ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
1326%endif ; RT_ARCH_AMD64
1327
; 8-bit LOCK XADD: A0 = pointer to the memory operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]              ; T0 = addend
        lock xadd [A0], T0_8            ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0_8              ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1337
; 16-bit LOCK XADD: A0 = pointer to the memory operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]             ; T0 = addend
        lock xadd [A0], T0_16           ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0_16             ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1347
; 32-bit LOCK XADD: A0 = pointer to the memory operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]             ; T0 = addend
        lock xadd [A0], T0_32           ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0_32             ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1357
1358%ifdef RT_ARCH_AMD64
; 64-bit LOCK XADD: A0 = pointer to the memory operand, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]                ; T0 = addend
        lock xadd [A0], T0              ; [A0] += T0, T0 = previous [A0]
        mov     [A1], T0                ; return the previous destination value
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
1368%endif ; RT_ARCH_AMD64
1369
1370
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; Arguments arrive in rcx, rdx, r8 and r9.  rcx and rdx are operands of
        ; the instruction itself, so the pointers they carry are moved first.
        push    rbx                     ; ebx receives the low half of the new value

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        cmpxchg8b [r10]

        mov     [r11], eax              ; write edx:eax back before T1 (r11) is clobbered
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments arrive in rdi, rsi, rdx and rcx.  rcx and rdx are operands
        ; of the instruction itself, so the pointers they carry are moved first.
        push    rbx                     ; ebx receives the low half of the new value

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: the first two arguments are in ecx and edx, the other
        ; two are on the stack above the four registers saved here.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1460
; LOCK-prefixed twin of iemAImpl_cmpxchg8b; same arguments
; (pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags) and same register juggling.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; Arguments arrive in rcx, rdx, r8 and r9.
        push    rbx                     ; ebx receives the low half of the new value

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write edx:eax back before T1 (r11) is clobbered
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments arrive in rdi, rsi, rdx and rcx.
        push    rbx                     ; ebx receives the low half of the new value

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: the first two arguments are in ecx and edx, the other
        ; two are on the stack above the four registers saved here.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1535
1536%ifdef RT_ARCH_AMD64
1537
;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; Arguments arrive in rcx, rdx, r8 and r9.  rcx and rdx are operands of
        ; the instruction itself, so the pointers they carry are moved first.
        push    rbx                     ; rbx receives the low half of the new value

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax              ; write rdx:rax back before T1 (r11) is clobbered
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments arrive in rdi, rsi, rdx and rcx.
        push    rbx                     ; rbx receives the low half of the new value

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1597
; LOCK-prefixed twin of iemAImpl_cmpxchg16b; same arguments
; (pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags) and same register juggling.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        ; Arguments arrive in rcx, rdx, r8 and r9.
        push    rbx                     ; rbx receives the low half of the new value

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write rdx:rax back before T1 (r11) is clobbered
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments arrive in rdi, rsi, rdx and rcx.
        push    rbx                     ; rbx receives the low half of the new value

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked
1642
1643%endif ; RT_ARCH_AMD64
1644
1645
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; The accumulator is passed by pointer (A1): it is loaded before the
; instruction and written back afterwards.  On 32-bit hosts the 64-bit worker
; also receives the register operand by pointer.
;
; @param        1       Instruction prefix (empty or lock).
; @param        2       Function name suffix (empty or _locked).
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        ; Note! The lock prefix is used for both the plain and the locked worker.
        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=0 means the compare failed and edx:eax now holds the destination
        ; value while [esi] still holds the caller's accumulator.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: produce the flags of comparing identical values (ZF=1).
.store:
        mov     [esi], eax              ; mov leaves the flags set up above untouched
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ;; @todo This only approximates a 64-bit compare: ZF and CF come out
        ;;       right, the remaining flags reflect a single 32-bit half.
        cmp     [esi + 4], edx          ; accumulator.hi - destination.hi
        jne     .store
        cmp     [esi], eax              ; high halves equal: the low halves decide
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1744
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
        ; --- 8-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

        ; --- 16-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

        ; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

        ; --- 64-bit (64-bit hosts only) ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        %1      qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
        lock %1 qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1833
1834
1835;
1836; BSWAP. No flag changes.
1837;
1838; Each function takes one argument, pointer to the value to bswap
1839; (input/output). They all return void.
1840;
; BSWAP with a 16-bit operand size: A0 points to the value, which is read and
; written back as a full dword.  The operand-size prefix is emitted by hand in
; front of the 32-bit bswap encoding.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db      66h                     ; operand-size prefix for the instruction below
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1849
; Byte-swaps the dword that A0 points to, in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1857
; Byte-swaps the qword that A0 points to, in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: swap the bytes of each half, then swap the halves.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; low dword
        mov     T1, [A0 + 4]            ; high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the high dword
        mov     [A0], T1                ; swapped high dword becomes the low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1876
1877
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; The shift count has to end up in cl.  With ASM_CALL64_GCC it is copied
; there from A1; otherwise A0 and A1 are exchanged, which leaves the count in
; cl and the destination pointer in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Force load flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

;; @todo some questions wrt flags when the shift count is high according to intel docs...
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1966
1967
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The functions take the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; The shift count has to end up in cl.  With ASM_CALL64_GCC, A2 and A3 are
; exchanged around the instruction; otherwise A0 and A2 are exchanged, which
; leaves the count in cl and the destination pointer in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore the eflags pointer in A3
 %else
        xchg    A0, A2                  ; count -> cl, destination pointer -> A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; restore the eflags pointer in A3
 %else
        xchg    A0, A2                  ; count -> cl, destination pointer -> A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl
        %1      [A0], A1, cl
        xchg    A3, A2                  ; restore the eflags pointer in A3
  %else
        xchg    A0, A2                  ; count -> cl, destination pointer -> A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2038
2039
;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer: a pointer
; to AX in A0, the operand in A1 and a pointer to eflags in A2.  The other
; functions take a pointer to rAX in A0, a pointer to rDX in A1, the operand
; in A2 and a pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; Unless ASM_CALL64_GCC is defined, the rDX pointer in A1 is copied to T1
; before the instruction and the high half is stored through T1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Name suffix.
; @param        5       EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;                       Only 1 selects IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF;
;                       the other values store the flags via IEM_SAVE_FLAGS.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; the 16-bit product goes back through the AX pointer
 %if %5 != 1
        IEM_SAVE_FLAGS                       A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                       A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                       A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS                       A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2157
2158
2159BEGINCODE
;;
; Worker function for negating the number held in T1:T0 (32-bit halves).
;
; Works by pushing two zero slots, swapping them with the register halves and
; then computing 0 - value with sub/sbb, so no register besides T0 and T1 is
; touched.  EFLAGS are modified by the arithmetic.
;
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, slot = original low half
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, slot = original high half
        sub     T0_32, [xSP]            ; low  = 0 - low
        sbb     T1_32, [xSP + xCB]      ; high = 0 - high - borrow
        add     xSP, xCB*2              ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2173
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating (two's complement) the 128-bit number held in
; the 64-bit register pair T1:T0 (T1 = high qword, T0 = low qword).
;
; Done entirely in registers: no stack traffic and no implicitly locked
; xchg-with-memory operations.
;
; @uses     T0 and T1 (in/out), EFLAGS.
;
BEGINPROC iemAImpl_negate_T0_T1_u64
        neg     T0                      ; low  = -low, CF = (low != 0)
        adc     T1, 0                   ; propagate the borrow into the high half...
        neg     T1                      ; ...high = -(high + CF)
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2189
2190
;;
; Macro for implementing division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  It gets
; a pointer to AX in A0, the divisor in A1 and a pointer to eflags in A2.
; The other functions take a pointer to rAX in A0, rDX in A1, the operand in
; A2 and a pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller (division by zero or quotient overflow).  Both
; conditions are checked up front, before the divide instruction is executed;
; on the -1 path the AX/DX buffers and the eflags are left unmodified.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
; @param        5       Function suffix.
; @param        6       EFLAGS variation: 0 for native, 1 for intel (ignored),
;                       2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; high byte of the dividend >= divisor -> overflow
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0                      ; (pop leaves the flags from the cmp intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; high word of the dividend >= divisor -> overflow
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend (DX:AX combined)
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0                      ; (pop leaves the flags from the cmp intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor to T1 before dx is loaded (A2 presumably aliases xDX here)
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; DX pointer to T1 before dx is loaded (A1 presumably aliases xDX here)
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; high dword of the dividend >= divisor -> overflow
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0                      ; (pop leaves the flags from the cmp intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the original (possibly negative) divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor to T1 before edx is loaded (A2 presumably aliases xDX here)
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; DX pointer to T1 before edx is loaded (A1 presumably aliases xDX here)
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the push done by the signed overflow check
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero

        ; Overflow check, same scheme as the narrower variants.
 %if %4 == 0
        cmp     [A1], A2                ; high qword of the dividend >= divisor -> overflow
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it.
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0                      ; (pop leaves the flags from the cmp intact)
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff  ; (no 64-bit immediate form of AND, so go via T1)
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the original (possibly negative) divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor to T1 before rdx is loaded (A2 presumably aliases rdx here)
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; DX pointer to T1 before rdx is loaded (A1 presumably aliases rdx here)
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the push done by the signed overflow check
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 (caller raises the divide error)
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro
2531
; Instantiate the DIV (unsigned) and IDIV (signed) workers in the native,
; Intel and AMD EFLAGS flavours.
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, ,       0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, ,       0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
2538
2539
;;
; Macro generating a worker that executes a single memory fence instruction.
;
; The generated function takes no arguments and returns nothing; it is named
; iemAImpl_<instruction>.
;
; @param        1       The fence instruction mnemonic.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2558
;;
; Memory fence alternative for hosts without SSE2 (no lfence/sfence/mfence).
;
; Takes no arguments and returns nothing.  xAX is pushed, exchanged with its
; own copy on the stack and the slot is released again, so xAX is unchanged.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]              ; xchg with a memory operand is implicitly locked
        add     xSP, xCB                ; drop the scratch slot
        ret
ENDPROC iemAImpl_alt_mem_fence
2568
2569
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The current FPU environment is stored at [xSP], patched and loaded back, so
; the caller must have reserved room for the environment image there.
;
; @uses     28 bytes of stack at [xSP] (32-bit FNSTENV image, X86FSTENV32P -
;           callers in this file reserve 20h bytes).  T0, T1.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK                   ; guest condition code bits only
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK                 ; keep only the actual TOP field
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2595
2596
;;
; Prepares the FPU for the instruction being emulated: loads parts of the
; guest's control word and status word, and additionally marks the top
; register as empty in the tag word when the guest's ST0 is empty.
;
; ASSUMES actual TOP=7
;
; The current FPU environment is stored at [xSP], patched and loaded back.
;
; @uses     Stack at [xSP] for the FNSTENV image (X86FSTENV32P).  T0, T1.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK                ; T1 = guest TOP index
        bt      [%1 + X86FXSTATE.FTW], T1_16            ; Empty if FTW bit is clear.  Fixed register order.
        jc      %%top_reg_in_use
        or      word [xSP + X86FSTENV32P.FTW], 0c000h   ; TOP=7, so set TAG(7)=3
%%top_reg_in_use:

        fldenv  [xSP]
%endmacro
2634
2635
;;
; FPU result: an 80-bit value followed by the resulting FSW.
; (Need to move this somewhere better?)
;
struc IEMFPURESULT
    .r80Result  resb 10                 ; 80-bit result value
    .FSW        resw 1                  ; output status word
endstruc
2643
2644
;;
; FPU result with two 80-bit values and the resulting FSW in between.
; (Need to move this somewhere better?)
;
struc IEMFPURESULTTWO
    .r80Result1 resb 10                 ; first 80-bit result value
    .FSW        resw 1                  ; output status word
    .r80Result2 resb 10                 ; second 80-bit result value
endstruc
2653
2654
2655;
2656;---------------------- 16-bit signed integer operations ----------------------
2657;
2658
2659
;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fild).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2683
2684
;;
; Stores an 80-bit floating point value (register) as a 16-bit signed integer
; in memory (fistp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to convert -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2708
2709
;;
; Stores an 80-bit floating point value (register) as a 16-bit signed integer
; in memory, using truncation (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to convert -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2734
2735
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 16-bit signed integer memory operand, returning
; both the result value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2772
2773
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 16-bit signed integer memory operand, returning
; only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2804
2805
2806
2807;
2808;---------------------- 32-bit signed integer operations ----------------------
2809;
2810
2811
;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fild).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2835
2836
;;
; Stores an 80-bit floating point value (register) as a 32-bit signed integer
; in memory (fistp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to convert -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2860
2861
;;
; Stores an 80-bit floating point value (register) as a 32-bit signed integer
; in memory, using truncation (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to convert -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2886
2887
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 32-bit signed integer memory operand, returning
; both the result value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2924
2925
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 32-bit signed integer memory operand, returning
; only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2956
2957
2958
2959;
2960;---------------------- 64-bit signed integer operations ----------------------
2961;
2962
2963
;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fild).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2987
2988
;;
; Stores an 80-bit floating point value (register) as a 64-bit signed integer
; in memory (fistp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to convert -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3012
3013
;;
; Stores an 80-bit floating point value (register) as a 64-bit signed integer
; in memory, using truncation (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to convert -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3038
3039
3040
3041;
3042;---------------------- 32-bit floating point operations ----------------------
3043;
3044
;;
; Loads a 32-bit floating point value and converts it to an 80-bit one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3068
3069
;;
; Stores an 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to store -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3093
3094
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 32-bit floating point memory operand, returning
; both the result value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3131
3132
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 32-bit floating point memory operand, returning
; only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3163
3164
3165
3166;
3167;---------------------- 64-bit floating point operations ----------------------
3168;
3169
;;
; Loads a 64-bit floating point value and converts it to an 80-bit one
; (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3193
3194
;;
; Stores an 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to store -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3218
3219
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 64-bit floating point memory operand, returning
; both the result value and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3256
;;
; Generates a worker for an FPU instruction operating on one 80-bit value
; (loaded as ST0) and one 64-bit floating point memory operand, returning
; only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3287
3288
3289
3290;
3291;---------------------- 80-bit floating point operations ----------------------
3292;
3293
;;
; Loads an 80-bit floating point register value from memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3317
3318
;;
; Stores an 80-bit floating point register to memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to store -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3342
3343
;;
; Loads an 80-bit floating point register value from an 80-bit BCD value in
; memory (fbld).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3367
3368
;;
; Stores an 80-bit floating point register to memory as BCD (fbstp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; value to store -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3392
3393
;;
; Generates a worker for an FPU instruction operating on two 80-bit floating
; point values, returning both the result value (taken from ST0) and the FSW.
;
; The second value is loaded first so that it ends up in ST1 and the first
; value in ST0.
;
; @param    1       The instruction
; @param    2       The instruction operands, brace enclosed (empty for
;                   instructions taking no explicit operands).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for the FPU environment image

        fninit
        fld     tword [A3]              ; becomes ST1 after the next load
        fld     tword [A2]              ; ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; drop pending exceptions before storing the result
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3434
3435
3436;;
3437; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3438; storing the result in ST1 and popping the stack.
3439;
3440; @param 1 The instruction
3441;
3442; @param A0 FPU context (fxsave).
3443; @param A1 Pointer to a IEMFPURESULT for the output.
3444; @param A2 Pointer to the first 80-bit value (ST1).
3445; @param A3 Pointer to the second 80-bit value (ST0).
3446;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20                       ; scratch stack space (presumably for the FPU_LD_* macro below)

        fninit
        fld     tword [A2]                      ; pushed first -> ST1
        fld     tword [A3]                      ; pushed last  -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word first, then clear the exception flags and pop the value
        ; now on top of the stack into the output structure.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3471
3472
3473;;
3474; FPU instruction working on two 80-bit floating point values, only
3475; returning FSW.
3476;
3477; @param 1 The instruction
3478;
3479; @param A0 FPU context (fxsave).
3480; @param A1 Pointer to a uint16_t for the resulting FSW.
3481; @param A2 Pointer to the first 80-bit value.
3482; @param A3 Pointer to the second 80-bit value.
3483;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20                       ; scratch stack space (presumably for the FPU_LD_* macro below)

        fninit
        fld     tword [A3]                      ; second value -> ST1
        fld     tword [A2]                      ; first value  -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]                       ; the status word is the only output

        fninit
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3505
3506
3507;;
3508; FPU instruction working on two 80-bit floating point values,
3509; returning FSW and EFLAGS (eax).
3510;
3511; @param 1 The instruction
3512;
3513; @returns EFLAGS in EAX.
3514; @param A0 FPU context (fxsave).
3515; @param A1 Pointer to a uint16_t for the resulting FSW.
3516; @param A2 Pointer to the first 80-bit value.
3517; @param A3 Pointer to the second 80-bit value.
3518;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 0x20                       ; scratch stack space (presumably for the FPU_LD_* macro below)

        fninit
        fld     tword [A3]                      ; second value -> ST1
        fld     tword [A2]                      ; first value  -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        ; Return the status word through A1 and the host flags register,
        ; as left by the instruction above, in xAX.
        fnstsw  word [A1]
        pushf
        pop     xAX

        fninit
        add     xSP, 0x20
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3542
3543
3544;;
3545; FPU instruction working on one 80-bit floating point value.
3546;
3547; @param 1 The instruction
3548;
3549; @param A0 FPU context (fxsave).
3550; @param A1 Pointer to a IEMFPURESULT for the output.
3551; @param A2 Pointer to the 80-bit value.
3552;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20                       ; scratch stack space (presumably for the FPU_LD_* macro below)

        fninit
        fld     tword [A2]                      ; the single operand goes into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word first, then clear the exception flags and pop the result.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3580
3581
3582;;
3583; FPU instruction working on one 80-bit floating point value, only
3584; returning FSW.
3585;
3586; @param 1 The instruction
3587; @param 2 Non-zero to also restore FTW.
3588;
3589; @param A0 FPU context (fxsave).
3590; @param A1 Pointer to a uint16_t for the resulting FSW.
3591; @param A2 Pointer to the 80-bit value.
3592;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20                       ; scratch stack space (presumably for the FPU_LD_* macros below)

        fninit
        fld     tword [A2]                      ; the value to examine goes into ST0
        ; Macro parameter 2 picks the state loader: zero -> FCW + safe FSW
        ; only, non-zero -> the variant that additionally deals with FTW.
%if %2 == 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%endif
        %1

        fnstsw  word [A1]                       ; the status word is the only output

        fninit
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3617
3618
3619
3620;;
3621; FPU instruction loading a 80-bit floating point constant.
3622;
3623; @param 1 The instruction
3624;
3625; @param A0 FPU context (fxsave).
3626; @param A1 Pointer to a IEMFPURESULT for the output.
3627;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                        ; scratch stack space (presumably for the FPU_LD_* macro below)

        ; Apply the guest FPU settings from A0 and let the instruction push
        ; its constant onto the (empty) register stack.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word first, then clear the exception flags and pop the
        ; constant into the output structure.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1                         ; no trailing %+ here: there is nothing left to paste on
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3654
3655
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20                       ; scratch stack space (presumably for the FPU_LD_* macro below)

        fninit
        fld     tword [A2]                      ; the single input goes into ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word first. The value on top of the stack is stored as
        ; result 2, the one below it as result 1; the exception flags are
        ; cleared ahead of each store.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3690
3691
3692
3693
3694;---------------------- SSE and MMX Operations ----------------------
3695
;
; Per-unit prologue/epilogue hooks for the media helpers.  All six are
; currently empty placeholders and expand to nothing.
;

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3713
3714
3715;;
3716; Media instruction working on two full sized registers.
3717;
3718; @param 1 The instruction
3719; @param 2 Whether there is an MMX variant (1) or not (0).
3720;
3721; @param A0 FPU context (fxsave).
3722; @param A1 Pointer to the first media register size operand (input/output).
3723; @param A2 Pointer to the second media register size operand (input).
3724;
%macro IEMIMPL_MEDIA_F2 2
 ; The 64-bit MMX flavour is only emitted when macro parameter 2 is non-zero.
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]                       ; first operand, also the destination
        movq    mm1, [A2]                       ; second operand
        %1      mm0, mm1
        movq    [A1], mm0                       ; write the result back over the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

 ; The 128-bit SSE flavour is always emitted; unaligned moves are used for
 ; both the loads and the store.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]                      ; first operand, also the destination
        movdqu  xmm1, [A2]                      ; second operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,    1
IEMIMPL_MEDIA_F2 pand,      1
IEMIMPL_MEDIA_F2 pandn,     1
IEMIMPL_MEDIA_F2 por,       1
IEMIMPL_MEDIA_F2 pxor,      1
IEMIMPL_MEDIA_F2 pcmpeqb,   1
IEMIMPL_MEDIA_F2 pcmpeqw,   1
IEMIMPL_MEDIA_F2 pcmpeqd,   1
IEMIMPL_MEDIA_F2 pcmpeqq,   0
IEMIMPL_MEDIA_F2 pcmpgtb,   1
IEMIMPL_MEDIA_F2 pcmpgtw,   1
IEMIMPL_MEDIA_F2 pcmpgtd,   1
IEMIMPL_MEDIA_F2 pcmpgtq,   0
IEMIMPL_MEDIA_F2 paddb,     1
IEMIMPL_MEDIA_F2 paddw,     1
IEMIMPL_MEDIA_F2 paddd,     1
IEMIMPL_MEDIA_F2 paddq,     1
IEMIMPL_MEDIA_F2 paddsb,    1
IEMIMPL_MEDIA_F2 paddsw,    1
IEMIMPL_MEDIA_F2 paddusb,   1
IEMIMPL_MEDIA_F2 paddusw,   1
IEMIMPL_MEDIA_F2 psubb,     1
IEMIMPL_MEDIA_F2 psubw,     1
IEMIMPL_MEDIA_F2 psubd,     1
IEMIMPL_MEDIA_F2 psubq,     1
IEMIMPL_MEDIA_F2 psubsb,    1
IEMIMPL_MEDIA_F2 psubsw,    1
IEMIMPL_MEDIA_F2 psubusb,   1
IEMIMPL_MEDIA_F2 psubusw,   1
IEMIMPL_MEDIA_F2 pmullw,    1
IEMIMPL_MEDIA_F2 pmulld,    0
IEMIMPL_MEDIA_F2 pmulhw,    1
IEMIMPL_MEDIA_F2 pmaddwd,   1
IEMIMPL_MEDIA_F2 pminub,    1
IEMIMPL_MEDIA_F2 pminuw,    0
IEMIMPL_MEDIA_F2 pminud,    0
IEMIMPL_MEDIA_F2 pminsb,    0
IEMIMPL_MEDIA_F2 pminsw,    1
IEMIMPL_MEDIA_F2 pminsd,    0
IEMIMPL_MEDIA_F2 pmaxub,    1
IEMIMPL_MEDIA_F2 pmaxuw,    0
IEMIMPL_MEDIA_F2 pmaxud,    0
IEMIMPL_MEDIA_F2 pmaxsb,    0
IEMIMPL_MEDIA_F2 pmaxsw,    1
IEMIMPL_MEDIA_F2 pmaxsd,    0
IEMIMPL_MEDIA_F2 pabsb,     1
IEMIMPL_MEDIA_F2 pabsw,     1
IEMIMPL_MEDIA_F2 pabsd,     1
IEMIMPL_MEDIA_F2 psignb,    1
IEMIMPL_MEDIA_F2 psignw,    1
IEMIMPL_MEDIA_F2 psignd,    1
IEMIMPL_MEDIA_F2 phaddw,    1
IEMIMPL_MEDIA_F2 phaddd,    1
IEMIMPL_MEDIA_F2 phsubw,    1
IEMIMPL_MEDIA_F2 phsubd,    1
IEMIMPL_MEDIA_F2 phaddsw,   1
IEMIMPL_MEDIA_F2 phsubsw,   1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw,  1
IEMIMPL_MEDIA_F2 pmuludq,   1
3815
3816
3817;;
3818; Media instruction working on two full sized registers, but no FXSAVE state argument.
3819;
3820; @param 1 The instruction
3821; @param 2 Whether there is an MMX variant (1) or not (0).
3822;
3823; @param A0 Pointer to the first media register size operand (input/output).
3824; @param A1 Pointer to the second media register size operand (input).
3825;
%macro IEMIMPL_MEDIA_OPT_F2 2
 ; The 64-bit MMX flavour is only emitted when macro parameter 2 is non-zero.
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]                       ; first operand, also the destination
        movq    mm1, [A1]                       ; second operand
        %1      mm0, mm1
        movq    [A0], mm0                       ; write the result back over the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

 ; The 128-bit SSE flavour is always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]                      ; first operand, also the destination
        movdqu  xmm1, [A1]                      ; second operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb,   1
IEMIMPL_MEDIA_OPT_F2 packssdw,   1
IEMIMPL_MEDIA_OPT_F2 packuswb,   1
IEMIMPL_MEDIA_OPT_F2 packusdw,   0
IEMIMPL_MEDIA_OPT_F2 psllw,      1
IEMIMPL_MEDIA_OPT_F2 pslld,      1
IEMIMPL_MEDIA_OPT_F2 psllq,      1
IEMIMPL_MEDIA_OPT_F2 psrlw,      1
IEMIMPL_MEDIA_OPT_F2 psrld,      1
IEMIMPL_MEDIA_OPT_F2 psrlq,      1
IEMIMPL_MEDIA_OPT_F2 psraw,      1
IEMIMPL_MEDIA_OPT_F2 psrad,      1
IEMIMPL_MEDIA_OPT_F2 pmulhuw,    1
IEMIMPL_MEDIA_OPT_F2 pavgb,      1
IEMIMPL_MEDIA_OPT_F2 pavgw,      1
IEMIMPL_MEDIA_OPT_F2 psadbw,     1
IEMIMPL_MEDIA_OPT_F2 pmuldq,     0
IEMIMPL_MEDIA_OPT_F2 unpcklps,   0
IEMIMPL_MEDIA_OPT_F2 unpcklpd,   0
IEMIMPL_MEDIA_OPT_F2 unpckhps,   0
IEMIMPL_MEDIA_OPT_F2 unpckhpd,   0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc,     0
IEMIMPL_MEDIA_OPT_F2 aesenc,     0
IEMIMPL_MEDIA_OPT_F2 aesdec,     0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte,  0
IEMIMPL_MEDIA_OPT_F2 sha1msg1,   0
IEMIMPL_MEDIA_OPT_F2 sha1msg2,   0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3888
3889;;
3890; Media instruction working on one full sized and one half sized register (lower half).
3891;
3892; @param 1 The instruction
3893; @param 2 1 if MMX is included, 0 if not.
3894;
3895; @param A0 Pointer to the first full sized media register operand (input/output).
3896; @param A1 Pointer to the second half sized media register operand (input).
3897;
%macro IEMIMPL_MEDIA_F1L1 2
 ; The 64-bit MMX flavour is only emitted when macro parameter 2 is non-zero.
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]                       ; first operand, also the destination
        movq    mm1, [A1]                       ; second operand; a full 64 bits are read here
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

 ; The 128-bit SSE flavour is always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]                      ; first operand, also the destination
        movdqu  xmm1, [A1]                      ; second operand; a full 128 bits are read here
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3932
3933
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; first source
        vmovdqu xmm1, [A2]                      ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]                      ; first source
        vmovdqu ymm1, [A2]                      ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3978
3979
3980;;
3981; Media instruction working on one full sized and one half sized register (high half).
3982;
3983; @param 1 The instruction
3984; @param 2 1 if MMX is included, 0 if not.
3985;
3986; @param A0 Pointer to the first full sized media register operand (input/output).
3987; @param A1 Pointer to the second full sized media register operand, where we
3988; will only use the upper half as input - but we'll load it in full.
3989;
; The high-half template has exactly the same code shape as the low-half
; one, so it simply forwards both of its arguments to IEMIMPL_MEDIA_F1L1.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half macro so the instruction list matches
; the template it is documented under (the expansion is unchanged).
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3998
3999
4000;;
4001; Media instruction working two half sized input registers (high half) and a full sized
4002; destination register (vpunpckh*).
4003;
4004; @param 1 The instruction
4005;
4006; @param A0 Pointer to the destination register (full sized, output only).
4007; @param A1 Pointer to the first full sized media source register operand, where we
4008; will only use the upper half as input - but we'll be loading it in full.
4009; @param A2 Pointer to the second full sized media source register operand, where we
4010; will only use the upper half as input - but we'll be loading it in full.
4011;
; The high-half AVX template is a plain alias: it forwards its single
; argument to IEMIMPL_MEDIA_F1L1L1, which emits the _u128 and _u256 workers.
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1    vpunpckhbw
IEMIMPL_MEDIA_F1H1H1    vpunpckhwd
IEMIMPL_MEDIA_F1H1H1    vpunpckhdq
IEMIMPL_MEDIA_F1H1H1    vpunpckhqdq
4020
4021
4022;
4023; Shufflers with evil 8-bit immediates.
4024;
4025
;
; pshufw with the shuffle order in A2: A0 = destination pointer, A1 = source
; pointer, A2 = the 8-bit immediate.  The immediate has to be encoded in the
; instruction itself, so a table with one 'pshufw + ret' stub per possible
; value is generated below and the matching stub is called.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                        ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm1                        ; paranoia! (seed the destination like the SSE/AVX shufflers do)
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]                 ; sizeof(pshufw+ret) == 9
 %else
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]                   ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 stubs; every one must have the same size for the address
        ; calculation above to work (checked at assembly time below).
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pshufw_u64
4056
4057
;
; Template for the SSE shufflers taking an 8-bit immediate: A0 = destination
; pointer, A1 = source pointer, A2 = the immediate.  One 'instruction + ret'
; stub per immediate value is generated and the matching one is called.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                        ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1                      ; paranoia!
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]                 ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; the final doubling happens in the address expression
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 equally sized stubs (size checked at assembly time below).
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4095
4096
;
; Template for the 256-bit AVX shufflers taking an 8-bit immediate:
; A0 = destination pointer, A1 = source pointer, A2 = the immediate.  One
; 'instruction + ret' stub per immediate value is generated and the matching
; one is called.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; AVX code, so use the AVX hooks rather than the SSE ones

        movzx   A2, A2_8                        ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1                      ; paranoia!
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]                 ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; the final doubling happens in the address expression
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The 256 equally sized stubs (size checked at assembly time below).
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4133
4134
4135;
4136; Shifts with evil 8-bit immediates.
4137;
4138
;
; Template for the MMX shifts by an 8-bit immediate: A0 = pointer to the
; value shifted in place, A1 = the shift count.  One 'instruction + ret'
; stub per immediate value is generated and the matching one is called.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                        ; must clear top bits
        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]                 ; sizeof(psXX+ret) == 9
 %else
        lea     T0, [A1 + A1*4]                 ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]                   ; T1 = address of the stub for this count
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS

        ; The 256 equally sized stubs (size checked at assembly time below).
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4179
4180
;
; Template for the SSE shifts by an 8-bit immediate: A0 = pointer to the
; value shifted in place, A1 = the shift count.  One 'instruction + ret'
; stub per immediate value is generated and the matching one is called.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                        ; must clear top bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]                 ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]                 ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; the final doubling happens in the address expression
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS

        ; The 256 equally sized stubs (size checked at assembly time below).
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4223
4224
4225;
4226; Move byte mask.
4227;
4228
; A0 = where to store the mask, A1 = pointer to the 64-bit MMX source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1                        ; collect the byte sign bits into T0
        mov      [A0], T0                       ; a full register width is stored
 %ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0              ; 32-bit build: explicitly zero the upper dword too
 %endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4242
; A0 = where to store the mask, A1 = pointer to the 128-bit SSE source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1                       ; collect the byte sign bits into T0
        mov      [A0], T0                       ; a full register width is stored
 %ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0              ; 32-bit build: explicitly zero the upper dword too
 %endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4256
; A0 = where to store the mask, A1 = pointer to the 256-bit AVX source.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1                      ; collect the byte sign bits into T0
        mov       [A0], T0                      ; a full register width is stored
 %ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0             ; 32-bit build: explicitly zero the upper dword too
 %endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4270
4271
4272;;
4273; Media instruction working on two full sized source registers and one destination (AVX).
4274;
4275; @param 1 The instruction
4276;
4277; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4278; @param A1 Pointer to the destination media register size operand (output).
4279; @param A2 Pointer to the first source media register size operand (input).
4280; @param A3 Pointer to the second source media register size operand (input).
4281;
%macro IEMIMPL_MEDIA_F3 1
; 128-bit variant.  A0 (the state pointer) is not touched by this code.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]                      ; first source
        vmovdqu xmm1, [A3]                      ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant, same shape with ymm registers.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]                      ; first source
        vmovdqu ymm1, [A3]                      ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4343
4344
4345;;
4346; Media instruction working on two full sized source registers and one destination (AVX),
4347; but no XSAVE state pointer argument.
4348;
4349; @param 1 The instruction
4350;
4351; @param A0 Pointer to the destination media register size operand (output).
4352; @param A1 Pointer to the first source media register size operand (input).
4353; @param A2 Pointer to the second source media register size operand (input).
4354;
%macro IEMIMPL_MEDIA_OPT_F3 1
; 128-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; first source
        vmovdqu xmm1, [A2]                      ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant, same shape with ymm registers.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]                      ; first source
        vmovdqu ymm1, [A2]                      ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
4419
4420
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; the single source
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 ; The 256-bit worker is only emitted when macro parameter 2 is 1.
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]                      ; the single source
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0                      ; the destination is written only, never read

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4463
4464
4465;
4466; The SSE 4.2 crc32
4467;
; @param    A0      Pointer to the 32-bit destination.
; @param    A1      The source operand, sized according to the suffix.
4470;
; crc32 with a byte source: the 32-bit value at [A0] is the running CRC
; (read, updated, written back), A1_8 is the data fed into it.
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; fetch the current CRC value
        crc32   T0_32, A1_8                     ; fold in the low byte of A1
        mov     [A0], T0_32                     ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4480
; crc32 with a word source: the 32-bit value at [A0] is the running CRC
; (read, updated, written back), A1_16 is the data fed into it.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; fetch the current CRC value
        crc32   T0_32, A1_16                    ; fold in the low word of A1
        mov     [A0], T0_32                     ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4490
; crc32 with a dword source: the 32-bit value at [A0] is the running CRC
; (read, updated, written back), A1_32 is the data fed into it.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; fetch the current CRC value
        crc32   T0_32, A1_32                    ; fold in the low dword of A1
        mov     [A0], T0_32                     ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4500
; The quadword source form is only assembled for AMD64 builds.
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; fetch the current 32-bit CRC value
        crc32   T0, A1                          ; full register form: folds in all of A1
        mov     [A0], T0_32                     ; only the low 32 bits are written back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4512
4513
4514;
4515; PTEST (SSE 4.1)
4516;
4517; @param A0 Pointer to the first source operand (aka readonly destination).
4518; @param A1 Pointer to the second source operand.
4519; @param A2 Pointer to the EFLAGS register.
4520;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Both operands are inputs only; the outcome of the instruction is the
        ; flags, which IEM_SAVE_FLAGS transfers to the EFLAGS variable at A2.
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4533
;
; VPTEST, 256-bit: A0 = pointer to the first source operand, A1 = pointer to
; the second source operand, A2 = pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; AVX code, so use the AVX hooks rather than the SSE ones

        ; Both operands are inputs only; the outcome of the instruction is the
        ; flags, which IEM_SAVE_FLAGS transfers to the EFLAGS variable at A2.
        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4546
4547
4548;;
4549; Template for the [v]pmov{s,z}x* instructions
4550;
4551; @param 1 The instruction
4552;
4553; @param A0 Pointer to the destination media register size operand (output).
4554; @param A1 The source operand value (input).
4555;
%macro IEMIMPL_V_PMOV_SZ_X 1
; SSE 4.1 variant: the source value is passed in A1.  Only legacy-encoded
; instructions are used here so the worker does not depend on AVX.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0                      ; legacy store; the VEX form would need AVX

        IEMIMPL_SSE_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX 128-bit variant: the source value is passed in A1.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1                        ; NOTE(review): still a legacy-encoded move; left as is
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

; AVX2 256-bit variant: here A1 is a pointer to the 128-bit source.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; VEX load to match the rest of the worker; ymm0 is fully overwritten below
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; the epilogue hook pairs with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4607
4608
4609;;
4610; Need to move this as well somewhere better?
4611;
struc IEMSSERESULT
        .uResult:       resd 4                  ; result data, four dwords (16 bytes)
        .MXCSR:         resd 1                  ; MXCSR value returned with the result
endstruc
4616
4617
4618;;
4619; Need to move this as well somewhere better?
4620;
struc IEMAVX128RESULT
        .uResult:       resd 4                  ; result data, four dwords (16 bytes)
        .MXCSR:         resd 1                  ; MXCSR value returned with the result
endstruc
4625
4626
4627;;
4628; Need to move this as well somewhere better?
4629;
struc IEMAVX256RESULT
        .uResult:       resd 8                  ; result data, eight dwords (32 bytes)
        .MXCSR:         resd 1                  ; MXCSR value returned with the result
endstruc
4634
4635
4636;;
4637; Initialize the SSE MXCSR register using the guest value partially to
4638; account for rounding mode.
4639;
4640; @uses 4 bytes of stack to save the original value, T0.
4641; @param 1 Expression giving the address of the FXSTATE of the guest.
4642;
%macro SSE_LD_FXSTATE_MXCSR 1
        ; Reserve one dword and park the current MXCSR in it.  This slot stays
        ; allocated when the macro ends; SSE_ST_FXSTATE_MXCSR releases it.
        sub     xSP, 4
        stmxcsr [xSP]

        ; T0 = the guest FZ, RC and DAZ bits with X86_MXCSR_XCPT_MASK or'ed in.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr only takes a memory operand, so bounce T0 through a second,
        ; short-lived stack dword.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4655
4656
4657;;
4658; Restores the SSE MXCSR register with the original value.
4659;
4660; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4661; @param 1 Expression giving the address where to return the MXCSR value.
4662; @param 2 Expression giving the address of the FXSTATE of the guest.
4663;
4664; @note Restores the stack pointer.
4665;
%macro SSE_ST_FXSTATE_MXCSR 2
        ; T0 = MXCSR as left behind by the instruction, fetched via a temporary stack dword.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and merge them into the guest MXCSR
        ; value read from the FXSTATE, then hand the result to the caller.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ; Reload the MXCSR value sitting on top of the stack (the slot left
        ; there by SSE_LD_FXSTATE_MXCSR) and release that slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4680
4681
4682;;
4683; Initialize the SSE MXCSR register using the guest value partially to
4684; account for rounding mode.
4685;
4686; @uses 4 bytes of stack to save the original value.
4687; @param 1 Expression giving the address of the FXSTATE of the guest.
4688;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        ; Reserve one dword and park the current MXCSR in it.  This slot stays
        ; allocated when the macro ends; the AVX*_ST_XSAVEAREA_MXCSR macros release it.
        sub     xSP, 4
        stmxcsr [xSP]

        ; T0 = only the guest FZ, RC and DAZ bits (T0 is clobbered).
        ; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR, X86_MXCSR_XCPT_MASK is not
        ; or'ed in here - confirm that this is intended.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ

        ; ldmxcsr only takes a memory operand, so bounce T0 through a second,
        ; short-lived stack dword.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4700
4701
4702;;
4703; Restores the AVX128 MXCSR register with the original value.
4704;
4705; @param 1 Expression giving the address where to return the MXCSR value.
4706;
4707; @note Restores the stack pointer.
4708;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        ; Hand the current MXCSR to the caller as-is.
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]

        ; Reload the MXCSR value sitting on top of the stack (the slot left
        ; there by AVX_LD_XSAVEAREA_MXCSR) and release that slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4715
4716
4717;;
4718; Restores the AVX256 MXCSR register with the original value.
4719;
4720; @param 1 Expression giving the address where to return the MXCSR value.
4721;
4722; @note Restores the stack pointer.
4723;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        ; Hand the current MXCSR to the caller as-is.
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]

        ; Reload the MXCSR value sitting on top of the stack (the slot left
        ; there by AVX_LD_XSAVEAREA_MXCSR) and release that slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4730
4731
4732;;
4733; Floating point instruction working on two full sized registers.
4734;
4735; @param 1 The instruction
4736; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4737;
4738; @param A0 FPU context (FXSTATE or XSAVEAREA).
4739; @param A1 Where to return the result including the MXCSR value.
4740; @param A2 Pointer to the first media register size operand (input/output).
4741; @param A3 Pointer to the second media register size operand (input).
4742;
%macro IEMIMPL_FP_F2 2
;
; Legacy SSE worker: [A1].uResult = %1([A2], [A3]), [A1].MXCSR = merged MXCSR.
;
; NOTE(review): the FASTCALL argument size is given as 12 although four
; arguments are used (IEMIMPL_CVT_F2 passes 16 for the same layout) - confirm.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
;
; AVX workers for instructions taking three operands (dst, src1, src2).
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
;
; AVX workers for instructions taking two operands (dst, src).
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps,    3
IEMIMPL_FP_F2 addpd,    3
IEMIMPL_FP_F2 mulps,    3
IEMIMPL_FP_F2 mulpd,    3
IEMIMPL_FP_F2 subps,    3
IEMIMPL_FP_F2 subpd,    3
IEMIMPL_FP_F2 minps,    3
IEMIMPL_FP_F2 minpd,    3
IEMIMPL_FP_F2 divps,    3
IEMIMPL_FP_F2 divpd,    3
IEMIMPL_FP_F2 maxps,    3
IEMIMPL_FP_F2 maxpd,    3
IEMIMPL_FP_F2 haddps,   3
IEMIMPL_FP_F2 haddpd,   3
IEMIMPL_FP_F2 hsubps,   3
IEMIMPL_FP_F2 hsubpd,   3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps,    2
IEMIMPL_FP_F2 rsqrtps,   2
IEMIMPL_FP_F2 sqrtpd,    2
IEMIMPL_FP_F2 rcpps,     2
IEMIMPL_FP_F2 cvtdq2ps,  2
IEMIMPL_FP_F2 cvtps2dq,  2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd,  0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq,  0 ; @todo AVX variants due to register size differences missing right now
4857
4858
4859;;
4860; Floating point instruction working on a full sized register and a single precision operand.
4861;
4862; @param 1 The instruction
4863;
4864; @param A0 FPU context (FXSTATE or XSAVEAREA).
4865; @param A1 Where to return the result including the MXCSR value.
4866; @param A2 Pointer to the first media register size operand (input/output).
4867; @param A3 Pointer to the second single precision floating point value (input).
4868;
%macro IEMIMPL_FP_F2_R32 1
;
; Legacy SSE worker: [A1].uResult = %1([A2], dword [A3]), [A1].MXCSR = merged MXCSR.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; Only the 32-bit scalar is fetched from the second operand.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

;
; AVX worker: [A1].uResult = v%1([A2], [A2], dword [A3]), [A1].MXCSR = raw MXCSR.
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]              ; Only the 32-bit scalar is fetched from the second operand.
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation; now matches IEMIMPL_FP_F2_R64.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
4911
4912
4913;;
4914; Floating point instruction working on a full sized register and a double precision operand.
4915;
4916; @param 1 The instruction
4917;
4918; @param A0 FPU context (FXSTATE or XSAVEAREA).
4919; @param A1 Where to return the result including the MXCSR value.
4920; @param A2 Pointer to the first media register size operand (input/output).
4921; @param A3 Pointer to the second double precision floating point value (input).
4922;
%macro IEMIMPL_FP_F2_R64 1
;
; Legacy SSE worker: [A1].uResult = %1([A2], qword [A3]), [A1].MXCSR = merged MXCSR.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movq    xmm1, [A3]              ; Only the 64-bit scalar is fetched from the second operand.
        movdqu  xmm0, [A2]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

;
; AVX worker: [A1].uResult = v%1([A2], [A2], qword [A3]), [A1].MXCSR = raw MXCSR.
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovq   xmm1, [A3]              ; Only the 64-bit scalar is fetched from the second operand.
        vmovdqu xmm0, [A2]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4963
4964
4965;;
4966; Macro for the cvtpd2ps/cvtps2pd instructions.
4967;
4968; 1 The instruction name.
4969; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4970;
4971; @param A0 FPU context (FXSTATE or XSAVEAREA).
4972; @param A1 Where to return the result including the MXCSR value.
4973; @param A2 Pointer to the first media register size operand (input/output).
4974; @param A3 Pointer to the second media register size operand (input).
4975;
%macro IEMIMPL_CVT_F2 2
;
; Legacy SSE worker: [A1].uResult = %1([A2], [A3]), [A1].MXCSR = merged MXCSR.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm1, [A3]
        movdqu  xmm0, [A2]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

;
; AVX worker with 128-bit source and destination registers.
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm1, [A3]
        vmovdqu xmm0, [A2]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

;
; AVX worker where one side is 256 bits wide; parameter 2 selects which one.
; The full ymm0 register is stored in either case.
;
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm1, [A3]
        vmovdqu ymm0, [A2]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; 256-bit source, 128-bit result.
 %else
        v %+ %1 ymm0, xmm1              ; 128-bit source, 256-bit result.
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
5029
5030
5031;;
5032; shufps instructions with 8-bit immediates.
5033;
5034; @param A0 Pointer to the destination media register size operand (input/output).
5035; @param A1 Pointer to the first source media register size operand (input).
5036; @param A2 The 8-bit immediate
5037;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; The immediate cannot be passed in a register to the instruction, so
        ; dispatch into a table holding one stub per possible immediate value.
        movzx   A2, A2_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+shufps+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufps+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufps_u128
5069
5070
5071;;
5072; shufpd instruction with 8-bit immediates.
5073;
5074; @param A0 Pointer to the destination media register size operand (input/output).
5075; @param A1 Pointer to the first source media register size operand (input).
5076; @param A2 The 8-bit immediate
5077;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A2, A2_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+shufpd+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpd+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufpd_u128
5108
5109
5110;;
5111; vshufp{s,d} instructions with 8-bit immediates.
5112;
5113; @param 1 The instruction name.
5114;
5115; @param A0 Pointer to the destination media register size operand (output).
5116; @param A1 Pointer to the first source media register size operand (input).
5117; @param A2 Pointer to the second source media register size operand (input).
5118; @param A3 The 8-bit immediate
5119;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
;
; 128-bit variant: [A0] = %1([A1], [A2], imm8 in A3).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A2]
        movdqu  xmm0, [A1]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128

;
; 256-bit variant: [A0] = %1([A1], [A2], imm8 in A3).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        vmovdqu ymm1, [A2]
        vmovdqu ymm0, [A1]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5186
5187
5188;;
5189; One of the [p]blendv{b,ps,pd} variants
5190;
5191; @param 1 The instruction
5192;
5193; @param A0 Pointer to the first media register sized operand (input/output).
5194; @param A1 Pointer to the second media sized value (input).
5195; @param A2 Pointer to the media register sized mask value (input).
5196;
%macro IEMIMPL_P_BLEND 1
;
; [A0] = %1([A0], [A1]) with the mask from [A2] loaded into xmm0, which the
; instruction uses as its implicit mask operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5216
5217
5218;;
5219; One of the v[p]blendv{b,ps,pd} variants
5220;
5221; @param 1 The instruction
5222;
5223; @param A0 Pointer to the first media register sized operand (output).
5224; @param A1 Pointer to the first media register sized operand (input).
5225; @param A2 Pointer to the second media register sized operand (input).
5226; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
;
; 128-bit variant: [A0] = %1([A1], [A2], mask [A3]).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

;
; 256-bit variant: [A0] = %1([A1], [A2], mask [A3]).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Was a second PROLOGUE invocation.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5260
5261
5262;;
5263; palignr mm1, mm2/m64 instruction.
5264;
5265; @param A0 Pointer to the first media register sized operand (output).
5266; @param A1 The second register sized operand (input).
5267; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A2, A2_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movq    mm1, A1                 ; The second operand is passed by value.
        movq    mm0, [A0]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_palignr_u64
5298
5299
5300;;
5301; SSE instructions with 8-bit immediates of the form
5302; xxx xmm1, xmm2, imm8.
5303; where the instruction encoding takes up 6 bytes.
5304;
5305; @param 1 The instruction name.
5306;
5307; @param A0 Pointer to the first media register size operand (input/output).
5308; @param A1 Pointer to the second source media register size operand (input).
5309; @param A2 The 8-bit immediate
5310;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
;
; [A0] = %1([A0], [A1], imm8 in A2).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A2, A2_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; T1 = address of the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5352
5353
5354;;
5355; AVX instructions with 8-bit immediates of the form
5356; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5357; where the instruction encoding takes up 6 bytes.
5358;
5359; @param 1 The instruction name.
5360; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5361; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5362;
5363; @param A0 Pointer to the destination media register size operand (output).
5364; @param A1 Pointer to the first source media register size operand (input).
5365; @param A2 Pointer to the second source media register size operand (input).
5366; @param A3 The 8-bit immediate
5367;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
;
; 128-bit variant: [A0] = %1([A1], [A2], imm8 in A3).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A2]
        movdqu  xmm0, [A1]
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
  %endif
        IBT_NOTRACK
        call    T1                      ; T1 = address of the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
;
; 256-bit variant: [A0] = %1([A1], [A2], imm8 in A3).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        vmovdqu ymm1, [A2]
        vmovdqu ymm0, [A1]
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
  %endif
        IBT_NOTRACK
        call    T1                      ; T1 = address of the stub for this immediate
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw,   1, 1
5446
5447
5448;;
5449; Need to move this as well somewhere better?
5450;
struc IEMPCMPISTRXSRC
        .uSrc1:         resd 4          ; First 128-bit source operand.
        .uSrc2:         resd 4          ; Second 128-bit source operand.
endstruc
5455
struc IEMPCMPESTRXSRC
        .uSrc1:         resd 4          ; First 128-bit source operand.
        .uSrc2:         resd 4          ; Second 128-bit source operand.
        .u64Rax:        resq 1          ; Value loaded into xAX before the instruction executes.
        .u64Rdx:        resq 1          ; Value loaded into xDX before the instruction executes.
endstruc
5462
5463;;
5464; The pcmpistri instruction.
5465;
5466; @param A0 Pointer to the ECX register to store the result to (output).
5467; @param A1 Pointer to the EFLAGS register.
5468; @param A2 Pointer to the structure containing the source operands (input).
5469; @param A3 The 8-bit immediate
5470;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistri+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistri+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; T1 = address of the stub for this immediate

        ; Hand back the status flags and the ECX result.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistri_u128
5505
5506;;
5507; The pcmpestri instruction.
5508;
5509; @param A0 Pointer to the ECX register to store the result to (output).
5510; @param A1 Pointer to the EFLAGS register.
5511; @param A2 Pointer to the structure containing the source operands (input).
5512; @param A3 The 8-bit immediate
5513;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+rex.w+pcmpestri+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(rex.w+pcmpestri+ret) == 8: A3 * 8
 %endif

        ; Load the register inputs last: xAX doubles as T0 and xDX may be one
        ; of the argument registers, so their order below must be kept.
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; T1 = address of the stub for this immediate

        ; Restore xDX, then hand back the status flags and the ECX result.
        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestri_u128
5552
5553;;
5554; The pcmpistrm instruction template.
5555;
5556; @param A0 Pointer to the XMM0 register to store the result to (output).
5557; @param A1 Pointer to the EFLAGS register.
5558; @param A2 Pointer to the structure containing the source operands (input).
5559; @param A3 The 8-bit immediate
5560;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        ; The target must end up in T1 since that is what gets called below;
        ; this used to write T0, which made every call land on the .imm0 stub.
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1

        ; Hand back the status flags and the mask the instruction left in xmm0.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistrm_u128
5594
5595;;
5596; The pcmpestrm instruction template.
5597;
5598; @param A0 Pointer to the XMM0 register to store the result to (output).
5599; @param A1 Pointer to the EFLAGS register.
5600; @param A2 Pointer to the structure containing the source operands (input).
5601; @param A3 The 8-bit immediate
5602;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+rex.w+pcmpestrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(rex.w+pcmpestrm+ret) == 8: A3 * 8
 %endif

        ; Load the register inputs last: xAX doubles as T0 and xDX may be one
        ; of the argument registers, so their order below must be kept.
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; T1 = address of the stub for this immediate

        ; Restore xDX, then hand back the status flags and the mask left in xmm0.
        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestrm_u128
5640
5641
5642;;
5643; pinsrw instruction.
5644;
5645; @param A0 Pointer to the first media register size operand (input/output).
5646; @param A1 The 16 bit input operand (input).
5647; @param A2 The 8-bit immediate
5648;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A2, A2_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movq    mm0, [A0]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pinsrw_u64
5678
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A2, A2_8                ; must clear top bits
        lea     T1, [.imm0 xWrtRIP]
        movdqu  xmm0, [A0]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pinsrw_u128
5708
5709;;
5710; vpinsrw instruction.
5711;
5712; @param A0 Pointer to the first media register size operand (output).
5713; @param A1 Pointer to the source media register size operand (input).
5714; @param A2 The 16 bit input operand (input).
5715; @param A3 The 8-bit immediate
5716;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE            ; VEX encoded worker, so use the AVX pair like the other v* workers (was the SSE pair).

        ; Dispatch into a table holding one stub per possible immediate value.
        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]              ; Must be fetched before A1 is recycled below.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for this immediate
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, all of the same size.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_vpinsrw_u128
5747
5748
;;
; pextrw instruction, 64-bit (MMX) variant.
;
; The immediate is only known at runtime, so the instruction is executed via
; a table of 256 tiny stubs (one per immediate value) that is called indirectly.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The 64-bit source value itself; it is moved straight into
;                   mm0 (the 128-bit variant following this procedure takes a
;                   pointer instead).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                        ; only the low byte is a valid table index
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]                 ; stub = endbrxx+pextrw+ret = 9 bytes; T0 = A2 * 9
 %else
        lea     T0, [A2 + A2*4]                 ; stub = pextrw+ret = 5 bytes; T0 = A2 * 5
 %endif
        lea     T1, [T1 + T0]                   ; T1 = address of stub number A2
        movq    mm0, A1
        IBT_NOTRACK
        call    T1                              ; the stub leaves the extracted word in T0_32
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pextrw_u64
5785
;;
; pextrw instruction, 128-bit (XMM) variant.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the 128-bit source operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                        ; only the low byte is a valid table index
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; stub = endbrxx+pextrw+ret = 10 bytes; T0 = A2 * 5
 %else
        lea     T0, [A2 + A2*2]                 ; stub = pextrw+ret = 6 bytes; T0 = A2 * 3
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of stub number A2
        movdqu  xmm0, [A1]
        IBT_NOTRACK
        call    T1                              ; the stub leaves the extracted word in T0_32
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pextrw_u128
5815
;;
; vpextrw instruction.
;
; The immediate is only known at runtime, so the instruction is executed via
; a table of 256 tiny stubs (one per immediate value) that is called indirectly.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; VEX encoded instruction, so use the AVX pair like the other AVX workers

        movzx   A2, A2_8                        ; must clear top bits
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]                 ; sizeof(vpextrw+ret)         ==  6: A2 *  6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of stub number A2
        IBT_NOTRACK
        call    T1                              ; the stub leaves the extracted word in T0_32
        mov     word [A0], T0_16

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_vpextrw_u128
5852
5853
;;
; movmskp{s,d} SSE/AVX instruction template.
;
; Each expansion emits three workers: the SSE instruction on a 128-bit
; operand, the AVX instruction on a 128-bit operand and the AVX instruction
; on a 256-bit operand.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
; SSE form, 128-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8                 ; only the low byte of the mask is stored

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX form, 128-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8                 ; only the low byte of the mask is stored

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

; AVX form, 256-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8                 ; only the low byte of the mask is stored

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5903
5904
;;
; Returns the resulting MXCSR value to the caller and reloads the MXCSR
; register from the value found on the top of the stack.
;
; The value returned is the guest MXCSR from the FXSTATE with the exception
; flags raised by the just executed instruction OR'ed in.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer, i.e. it pops the 4 bytes holding the
;           MXCSR value to reload (presumably pushed by the matching
;           SSE_LD_FXSTATE_MXCSR - confirm against that macro).
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; Fetch the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload the saved MXCSR and drop its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5928
5929
;;
; cvttsd2si instruction - 32-bit variant.
;
; Converts the double precision value at A3 to a 32-bit integer using the
; truncating form of the instruction.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]
        mov     [A2], T0_32                     ; 32-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5950
;;
; cvttsd2si instruction - 64-bit variant.
;
; Converts the double precision value at A3 to a 64-bit integer using the
; truncating form of the instruction.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     [A2], T0                        ; 64-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5971
5972
;;
; cvtsd2si instruction - 32-bit variant.
;
; Converts the double precision value at A3 to a 32-bit integer.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     [A2], T0_32                     ; 32-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5993
;;
; cvtsd2si instruction - 64-bit variant.
;
; Converts the double precision value at A3 to a 64-bit integer.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     [A2], T0                        ; 64-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
6014
6015
;;
; cvttss2si instruction - 32-bit variant.
;
; Converts the single precision value at A3 to a 32-bit integer using the
; truncating form of the instruction.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0_32, [A3]
        mov     [A2], T0_32                     ; 32-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
6036
;;
; cvttss2si instruction - 64-bit variant.
;
; Converts the single precision value at A3 to a 64-bit integer using the
; truncating form of the instruction.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]
        mov     [A2], T0                        ; 64-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
6057
6058
;;
; cvtss2si instruction - 32-bit variant.
;
; Converts the single precision value at A3 to a 32-bit integer.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0_32, [A3]
        mov     [A2], T0_32                     ; 32-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
6079
;;
; cvtss2si instruction - 64-bit variant.
;
; Converts the single precision value at A3 to a 64-bit integer.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]
        mov     [A2], T0                        ; 64-bit store, size implied by the register

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6100
6101
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts the 32-bit integer at A3 to a single precision value.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, dword [A3]               ; integer width is selected by the memory operand size
        movd    dword [A2], xmm0                ; store the low 32 bits, i.e. the converted value

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
6122
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts the 64-bit integer at A3 to a single precision value.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]               ; integer width is selected by the memory operand size
        movd    dword [A2], xmm0                ; store the low 32 bits, i.e. the converted value

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6143
6144
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts the 32-bit integer at A3 to a double precision value.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, dword [A3]               ; integer width is selected by the memory operand size
        movq    [A2], xmm0                      ; store the low 64 bits, i.e. the converted value

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6165
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts the 64-bit integer at A3 to a double precision value.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]               ; integer width is selected by the memory operand size
        movq    [A2], xmm0                      ; store the low 64 bits, i.e. the converted value

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6186
6187
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only the FZ, RC and DAZ bits are taken from the guest value; all exceptions
; are masked in the value that gets loaded.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
; @note     The stack pointer is left 4 bytes lower on exit; that slot holds
;           the MXCSR value that was active on entry.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        ; Save the current MXCSR in a stack slot that stays allocated.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Build the value to run with from the guest MXCSR.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; Load it via a second, temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6207
6208
;;
; Updates the guest MXCSR value in place and reloads the MXCSR register from
; the value found on the top of the stack.
;
; The exception flags raised by the just executed instruction are OR'ed into
; the value at the given address.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer, i.e. it pops the 4 byte slot that
;           SSE_LD_FXSTATE_MXCSR_ONLY leaves behind.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        ; Fetch the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%1]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload the saved MXCSR and drop its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6231
6232
;
; UCOMISS (SSE)
;
; Compares the two operands and returns the resulting status flags and the
; updated MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]                      ; second source
        movdqu  xmm0, [A2]                      ; first source
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6255
;
; VUCOMISS (AVX)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; VEX encoded instruction, so use the AVX pair like the other AVX workers
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6270
6271
;
; UCOMISD (SSE)
;
; Compares the two operands and returns the resulting status flags and the
; updated MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]                      ; second source
        movdqu  xmm0, [A2]                      ; first source
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6294
;
; VUCOMISD (AVX)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; VEX encoded instruction, so use the AVX pair like the other AVX workers
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6309
;
; COMISS (SSE)
;
; Compares the two operands and returns the resulting status flags and the
; updated MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]                      ; second source
        movdqu  xmm0, [A2]                      ; first source
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6332
;
; VCOMISS (AVX)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; VEX encoded instruction, so use the AVX pair like the other AVX workers
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6347
6348
;
; COMISD (SSE)
;
; Compares the two operands and returns the resulting status flags and the
; updated MXCSR value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]                      ; second source
        movdqu  xmm0, [A2]                      ; first source
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6371
;
; VCOMISD (AVX)
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; VEX encoded instruction, so use the AVX pair like the other AVX workers
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the flags before anything can modify them

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6386
6387
;;
; Pair of 128-bit source operands, passed by pointer to the workers below
; which take two media register sized inputs.
;
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1        resd 4                        ; first source operand (4 dwords = 16 bytes)
    .uSrc2        resd 4                        ; second source operand (4 dwords = 16 bytes)
endstruc
6395
6396
;
; CMPPS (SSE)
;
; The immediate is only known at runtime, so the instruction is executed via
; a table of 256 tiny stubs (one per immediate value) that is called indirectly.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                        ; only the low byte is a valid table index
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]                 ; stub = endbrxx+cmpps+ret = 9 bytes; T0 = A3 * 9
 %else
        lea     T0, [A3 + A3*4]                 ; stub = cmpps+ret = 5 bytes; T0 = A3 * 5
 %endif
        lea     T1, [T1 + T0]                   ; T1 = address of stub number A3
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_cmpps_u128
6437
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate is only known at runtime, so the instruction is executed via
; a table of 256 tiny stubs (one per immediate value) that is called indirectly.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                        ; only the low byte is a valid table index
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]                 ; stub = endbrxx+insn+ret = 10 bytes; T0 = A3 * 5
 %else
        lea     T0, [A3 + A3*2]                 ; stub = insn+ret = 6 bytes; T0 = A3 * 3
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of stub number A3
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6489
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The immediate is only known at runtime, so the instruction is executed via
; a table of 256 tiny stubs (one per immediate value) that is called indirectly.
; Each stub is padded with an int3 so that its size is a multiple of four.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                        ; only the low byte is a valid table index
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]                 ; stub = endbrxx+insn+ret+int3 = 12 bytes; T0 = A3 * 3
        lea     T1, [T1 + T0*4]                 ; T1 = address of stub number A3
 %else
        lea     T1, [T1 + A3*8]                 ; stub = insn+ret+int3 = 8 bytes
 %endif
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6545
6546
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; 128-bit source
        %1      mm0, xmm0
        movq    [A1], mm0                       ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6576
;;
; SSE instructions of the form
;    xxx xmm, mm/m64.
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    mm0, A2                         ; the source is passed by value
        movdqu  xmm0, [A1]                      ; current destination value
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6607
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                        ; the source is passed by value
        %1      mm0, xmm0
        movq    [A1], mm0                       ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6637
;
; All forms of RDRAND and RDSEED
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register receiving the value (ax, eax or rax); this
;                   selects the operand size.
; @param    3       The operand size in bits, used as the procedure name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2                        ; a plain mov leaves the flags produced above intact
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6662
6663
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The immediate is only known at runtime, so the instruction is executed via
; a table of 256 tiny stubs (one per immediate value) that is called indirectly.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                        ; only the low byte is a valid table index
        lea     T1, [.imm0 xWrtRIP]             ; T1 = address of the first stub
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]                 ; stub = endbrxx+sha1rnds4+ret = 10 bytes; T0 = A2 * 5
 %else
        lea     T0, [A2 + A2*2]                 ; stub = sha1rnds4+ret = 6 bytes; T0 = A2 * 3
 %endif
        lea     T1, [T1 + T0*2]                 ; T1 = address of stub number A2
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table; each entry must have exactly the size assumed above.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_sha1rnds4_u128
6703
6704
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the constants
; are loaded into xmm0 and the two explicit operands into xmm1 and xmm2.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A0]                      ; destination / first source
        movdqu  xmm2, [A1]                      ; second source
        movdqu  xmm0, [A2]                      ; implicit operand
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6727
6728
;
; 32-bit forms of ADCX and ADOX
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit to load before and save after the
;                   instruction (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        ; Only A0 thru A2 are used, so pair up with the three argument
        ; prologue and epilogue like the other three argument workers do.
        PROLOGUE_3_ARGS

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2_32, [A0]                     ; A2 = A2 + [A0] + flag
        mov     [A0], A2_32                     ; mov leaves the resulting flag intact
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
6748
;
; 64-bit forms of ADCX and ADOX
;
; @param    1       The instruction name (adcx or adox).
; @param    2       The EFLAGS bit to load before and save after the
;                   instruction (X86_EFL_CF for adcx, X86_EFL_OF for adox).
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        ; Only A0 thru A2 are used, so pair up with the three argument
        ; prologue and epilogue like the other three argument workers do.
        PROLOGUE_3_ARGS

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2, [A0]                        ; A2 = A2 + [A0] + flag
        mov     [A0], A2                        ; mov leaves the resulting flag intact
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette