VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 98122

Last change on this file since 98122 was 98103, checked in by vboxsync, 22 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 176.4 KB
Line 
1; $Id: IEMAllAImpl.asm 98103 2023-01-17 14:15:46Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; The byte count is only handed to RET when assembling for 32-bit x86
; Windows; every other configuration emits a plain RET.
;
; @param        1       Byte count for RET (only used on x86 Windows).
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %elifdef RT_OS_WINDOWS
        ret     %1
 %else
        ret
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Expands to NAME(a_Name), except on 32-bit x86 Windows where the symbol is
; pasted together as <a_Prefix><a_Name>@<a_cbArgs>.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%ifndef RT_ARCH_X86
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%elifndef RT_OS_WINDOWS
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Emits the export / global directives that apply to the selected object
; format, followed by the entry point label itself.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
  export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
93;
94; We employ some macro assembly here to hide the calling convention differences.
95;
96%ifdef RT_ARCH_AMD64
97 %macro PROLOGUE_1_ARGS 0
98 %endmacro
99 %macro EPILOGUE_1_ARGS 0
100 ret
101 %endmacro
102 %macro EPILOGUE_1_ARGS_EX 0
103 ret
104 %endmacro
105
106 %macro PROLOGUE_2_ARGS 0
107 %endmacro
108 %macro EPILOGUE_2_ARGS 0
109 ret
110 %endmacro
111 %macro EPILOGUE_2_ARGS_EX 1
112 ret
113 %endmacro
114
115 %macro PROLOGUE_3_ARGS 0
116 %endmacro
117 %macro EPILOGUE_3_ARGS 0
118 ret
119 %endmacro
120 %macro EPILOGUE_3_ARGS_EX 1
121 ret
122 %endmacro
123
124 %macro PROLOGUE_4_ARGS 0
125 %endmacro
126 %macro EPILOGUE_4_ARGS 0
127 ret
128 %endmacro
129 %macro EPILOGUE_4_ARGS_EX 1
130 ret
131 %endmacro
132
133 %ifdef ASM_CALL64_GCC
134 %define A0 rdi
135 %define A0_32 edi
136 %define A0_16 di
137 %define A0_8 dil
138
139 %define A1 rsi
140 %define A1_32 esi
141 %define A1_16 si
142 %define A1_8 sil
143
144 %define A2 rdx
145 %define A2_32 edx
146 %define A2_16 dx
147 %define A2_8 dl
148
149 %define A3 rcx
150 %define A3_32 ecx
151 %define A3_16 cx
152 %endif
153
154 %ifdef ASM_CALL64_MSC
155 %define A0 rcx
156 %define A0_32 ecx
157 %define A0_16 cx
158 %define A0_8 cl
159
160 %define A1 rdx
161 %define A1_32 edx
162 %define A1_16 dx
163 %define A1_8 dl
164
165 %define A2 r8
166 %define A2_32 r8d
167 %define A2_16 r8w
168 %define A2_8 r8b
169
170 %define A3 r9
171 %define A3_32 r9d
172 %define A3_16 r9w
173 %endif
174
175 %define T0 rax
176 %define T0_32 eax
177 %define T0_16 ax
178 %define T0_8 al
179
180 %define T1 r11
181 %define T1_32 r11d
182 %define T1_16 r11w
183 %define T1_8 r11b
184
185 %define T2 r10 ; only AMD64
186 %define T2_32 r10d
187 %define T2_16 r10w
188 %define T2_8 r10b
189
190%else
191 ; x86
192 %macro PROLOGUE_1_ARGS 0
193 push edi
194 %endmacro
195 %macro EPILOGUE_1_ARGS 0
196 pop edi
197 ret 0
198 %endmacro
199 %macro EPILOGUE_1_ARGS_EX 1
200 pop edi
201 ret %1
202 %endmacro
203
204 %macro PROLOGUE_2_ARGS 0
205 push edi
206 %endmacro
207 %macro EPILOGUE_2_ARGS 0
208 pop edi
209 ret 0
210 %endmacro
211 %macro EPILOGUE_2_ARGS_EX 1
212 pop edi
213 ret %1
214 %endmacro
215
216 %macro PROLOGUE_3_ARGS 0
217 push ebx
218 mov ebx, [esp + 4 + 4]
219 push edi
220 %endmacro
221 %macro EPILOGUE_3_ARGS_EX 1
222 %if (%1) < 4
223 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
224 %endif
225 pop edi
226 pop ebx
227 ret %1
228 %endmacro
229 %macro EPILOGUE_3_ARGS 0
230 EPILOGUE_3_ARGS_EX 4
231 %endmacro
232
233 %macro PROLOGUE_4_ARGS 0
234 push ebx
235 push edi
236 push esi
237 mov ebx, [esp + 12 + 4 + 0]
238 mov esi, [esp + 12 + 4 + 4]
239 %endmacro
240 %macro EPILOGUE_4_ARGS_EX 1
241 %if (%1) < 8
242 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
243 %endif
244 pop esi
245 pop edi
246 pop ebx
247 ret %1
248 %endmacro
249 %macro EPILOGUE_4_ARGS 0
250 EPILOGUE_4_ARGS_EX 8
251 %endmacro
252
253 %define A0 ecx
254 %define A0_32 ecx
255 %define A0_16 cx
256 %define A0_8 cl
257
258 %define A1 edx
259 %define A1_32 edx
260 %define A1_16 dx
261 %define A1_8 dl
262
263 %define A2 ebx
264 %define A2_32 ebx
265 %define A2_16 bx
266 %define A2_8 bl
267
268 %define A3 esi
269 %define A3_32 esi
270 %define A3_16 si
271
272 %define T0 eax
273 %define T0_32 eax
274 %define T0_16 ax
275 %define T0_8 al
276
277 %define T1 edi
278 %define T1_32 edi
279 %define T1_16 di
280%endif
281
282
;;
; Merges the modified (%2) and undefined (%3) flags from [%1] into the
; host EFLAGS register.
;
; The '%if (%3) != 0' guard is commented out, so the load is done
; unconditionally and not only when there are undefined flags.
;
; @remarks  Clobbers T0, stack.  Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
        ;%if (%3) != 0
        pushf                                   ; [xSP] = current host flags.
        mov     T0_32, [%1]                     ; T0 = guest flags.
        and     T0_32, (%2 | %3)                ; keep just the modified + undefined guest flags.
        and     dword [xSP], ~(%2 | %3)         ; drop those same flags from the host copy.
        or      [xSP], T0                       ; host copy now carries the guest values.
        popf                                    ; make the mix the live EFLAGS.
        ;%endif
%endmacro
302
;;
; Writes the modified and undefined flags from the CPU EFLAGS back to the
; flags variable, leaving all other bits of the variable untouched.
;
; Expands to nothing when both masks are zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                              ; T1 = CPU flags after the instruction.
        and     T1_32, (%2 | %3)                ; T1 = just the flags we are asked to save.
        mov     T0_32, [%1]                     ; T0 = previous flags value.
        and     T0_32, ~(%2 | %3)               ; make room for the new flag values.
        or      T0_32, T1_32                    ; merge.
        mov     [%1], T0_32                     ; store.
 %endif
%endmacro
322
;;
; Saves the modified flags from the CPU EFLAGS and additionally forces a
; fixed set of flags to zero and another fixed set to one.
;
; Expands to nothing when all three masks are zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                              ; T1 = CPU flags after the instruction.
        and     T1_32, (%2)                     ; T1 = the modified flags only.
        mov     T0_32, [%1]                     ; T0 = previous flags value.
        and     T0_32, ~(%2 | %3)               ; drop the modified and the always-clear flags.
        or      T0_32, T1_32                    ; merge in the CPU's values.
  %if (%4) != 0
        or      T0_32, %4                       ; force the always-set flags.
  %endif
        mov     [%1], T0_32                     ; store.
 %endif
%endmacro
346
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; SF is taken from bit (%5 - 1) of %4 and PF is looked up in g_afParity
; using the low byte of %6.
;
; @remarks  Clobbers T1, T2 (AMD64 only), stack, %6, EFLAGS.  T0 is preserved
;           (pushed and popped on x86, not touched on AMD64).
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
 %else
        push    T0
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]                     ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                     ; select the modified flags.
        or      T1_32, T2_32                    ; combine the flags.
 %else
        and     T0_32, (%2)                     ; select the modified flags.
        or      T1_32, T0_32                    ; combine the flags.
        pop     T0
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        ; T1 is edi on x86 and has no byte sub-register (T1_8 is not defined
        ; there), so fetch the table byte into the index register, which we
        ; are allowed to trash, and OR it in at full width instead.
        movzx   %6, byte [NAME(g_afParity) + %6]
        or      T1_32, %6
 %endif

        mov     [%1], T1_32                     ; save the result.
%endmacro
400
;;
; Applies a fixed clear mask and a fixed set mask to the flags variable.
; The CPU EFLAGS are not consulted.
;
; Expands to nothing when both masks are zero; the AND and OR steps are
; each only emitted for a non-zero mask.
;
; @remarks  Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; T0 = current flags value.
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; strip the always-clear flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; force the always-set flags.
  %endif
        mov     [%1], T0_32                     ; write back.
 %endif
%endmacro
421
;;
; Applies a fixed clear mask and a fixed set mask to the flags variable and
; sets PF from the g_afParity table entry selected by the low byte of %4.
;
; @remarks  Clobbers T0, %4, EFLAGS.  On AMD64 T2 is used for the table address.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; T0 = current flags value.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; strip PF together with the always-clear flags.
 %if (%3) != 0
        or      T0_32, %3                       ; force the always-set flags.
 %endif
        and     %4, 0xff                        ; table index = low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; write back.
%endmacro
446
447
448;*********************************************************************************************************************************
449;* External Symbols *
450;*********************************************************************************************************************************
451extern NAME(g_afParity)
452
453
;;
; Generator macro for two-operand instructions with a memory destination.
;
; Emits iemAImpl_<instr>_u8/_u16/_u32 and, on AMD64 only, _u64.  When %2 is
; non-zero each of them gets a LOCK prefixed sibling with a _locked suffix.
; On 32-bit systems the 64-bit accesses requires hand coding.
;
; Every function gets the pointer to the destination memory operand in A0,
; the source register operand in A1 and the pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE

;
; 8-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
 %endif

;
; 16-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
 %endif

;
; 32-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
 %endif

;
; 64-bit (AMD64 hosts only)
;
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;            instr,lock, modified-flags,                                                               undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
553
554
;;
; Generator macro for three-operand VEX instructions that update EFLAGS.
;
; Emits iemAImpl_<instr>_u32 and, on AMD64 only, iemAImpl_<instr>_u64.  On
; 32-bit systems the 64-bit accesses requires hand coding.
;
; Every function gets the pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register
; operand in A2 and the pointer to eflags in A3.  The instruction result is
; produced in T0 and then stored through A0.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
;
; 32-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32                     ; must precede IEM_SAVE_FLAGS, which clobbers T0.
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;
; 64-bit (AMD64 hosts only)
;
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0                        ; must precede IEM_SAVE_FLAGS, which clobbers T0.
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                 instr,  modified-flags,                                         undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
595
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; These are three-argument functions (12 bytes), so the three-argument
; prologue/epilogue pair is used.  The four-argument pair previously used
; here would, on x86, read a fourth argument that does not exist and return
; with 'ret 8' although only 4 bytes of arguments are on the stack.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32                    ; T0 is write-only for these instructions, no need to preload [A0].
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1                          ; T0 is write-only for these instructions, no need to preload [A0].
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,  modified-flags,                                         undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
637
638
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  There is no eflags argument.
;
; The optional _fallback functions implement the operation with the legacy
; instruction given as parameter 2, which takes its count in CL:
;   - With ASM_CALL64_GCC the count is copied from A2 into CL.
;   - Otherwise A0 is xCX, so A0 and A2 are swapped; after the swap the
;     destination pointer lives in A2 and must be stored through A2 only.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                          ; A0 (xCX) = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                        ; store the full 64-bit result.
   %else
        xchg    A2, A0                          ; A0 (rcx) = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                        ; full 64-bit result; A0 is no longer a pointer here.
   %endif
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                        instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
710
711
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 points to the destination, A1 holds the source value and A2 the
; rotate count.  ROR takes its count in CL:
;   - With ASM_CALL64_GCC the count is copied from A2 into CL.
;   - Otherwise A0 is xCX, so A0 and A2 are swapped; after the swap the
;     destination pointer lives in A2 and must be stored through A2 only.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                          ; A0 (xCX) = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                        ; store the full 64-bit result.
  %else
        xchg    A2, A0                          ; A0 (rcx) = count, A2 = destination pointer.
        ror     A1, cl
        mov     [A2], A1                        ; full 64-bit result; A0 is no longer a pointer here.
  %endif
        EPILOGUE_3_ARGS
ENDPROC             iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
746
747
;
; MULX
;
; A0 points to the first destination (receives T0_32), A1 points to the
; second destination (receives T1_32), A2 is the first source and A3 the
; second source.  MULX takes one of its sources implicitly from EDX.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX already, nothing to shuffle.
 %define IEM_MULX_U32_PDST2 A1
%else
        ; A1 is xDX - swap A1 and A2 so that EDX=uSrc1; the second
        ; destination pointer is in A2 from here on.
        xchg    A1, A2
 %define IEM_MULX_U32_PDST2 A2
%endif
        mulx    T0_32, T1_32, A3_32
        ; The T1 half goes out first so that T0 is what remains when both
        ; destinations are the same.
        mov     [IEM_MULX_U32_PDST2], T1_32
        mov     [A0], T0_32
%undef IEM_MULX_U32_PDST2
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_mulx_u32
767
768
;
; MULX fallback using plain MUL (EDX:EAX = EAX * operand).
;
; A0 points to the first destination (receives the high half), A1 points to
; the second destination (receives the low half), A2 is the first source and
; A3 the second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        ; and the second destination pointer survives MUL's write to EDX.
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32     ; uSrc1 is in A1 (EDX) after the swap; A2 now holds the pointer.
        mov     [A2], eax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_mulx_u32_fallback
787
788%ifdef RT_ARCH_AMD64
;
; 64-bit MULX.  A0 points to the first destination (receives T0), A1 points
; to the second destination (receives T1), A2 is the first source and A3 the
; second source.  MULX takes one of its sources implicitly from RDX.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX already, nothing to shuffle.
 %define IEM_MULX_U64_PDST2 A1
%else
        ; A1 is xDX - swap A1 and A2 so that RDX=uSrc1; the second
        ; destination pointer is in A2 from here on.
        xchg    A1, A2
 %define IEM_MULX_U64_PDST2 A2
%endif
        mulx    T0, T1, A3
        ; The T1 half goes out first so that T0 is what remains when both
        ; destinations are the same.
        mov     [IEM_MULX_U64_PDST2], T1
        mov     [A0], T0
%undef IEM_MULX_U64_PDST2
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_mulx_u64
805
806
;
; 64-bit MULX fallback using plain MUL (RDX:RAX = RAX * operand).
;
; A0 points to the first destination (receives the high half), A1 points to
; the second destination (receives the low half), A2 is the first source and
; A3 the second source.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        ; and the second destination pointer survives MUL's write to RDX.
        xchg    A1, A2
        mov     rax, A3
        mul     A1        ; uSrc1 is in A1 (RDX) after the swap; A2 now holds the pointer.
        mov     [A2], rax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC             iemAImpl_mulx_u64_fallback
825
826%endif
827
828
;;
; Generator macro for the bit test instructions with a memory destination.
;
; Emits iemAImpl_<instr>_u16/_u32 and, on AMD64 only, _u64.  When %2 is
; non-zero each of them gets a LOCK prefixed sibling with a _locked suffix.
; On 32-bit systems the 64-bit accesses requires hand coding.
;
; Every function gets the pointer to the destination memory operand in A0,
; the source register operand in A1 and the pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE

;
; 16-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
 %endif

;
; 32-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
 %endif

;
; 64-bit (AMD64 hosts only)
;
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;            instr,lock, modified-flags, undefined flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
905
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three functions are generated per operand size:
;   - <name>:        host behaviour; guest flags are loaded before and the
;                    modified + undefined flags saved after the instruction.
;   - <name>_intel:  flags are calculated explicitly.  When the destination
;                    is written OF, SF, AF, CF and ZF are cleared and PF is
;                    looked up from the low byte of the result; otherwise
;                    OF, SF, AF and CF are cleared and ZF and PF are set.
;                    The host flags are never consulted after the JZ, so no
;                    guest flags are loaded up front.
;   - <name>_amd:    only the modified flags (%2) are taken from the host.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        ; No IEM_MAYBE_LOAD_FLAGS here, same as the 16 and 32-bit _intel
        ; variants: the resulting flags are built from [A2] and the result.
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1057
1058
;;
; Generator macro for POPCNT.
;
; Emits iemAImpl_<instr>_u16/_u32 and, on AMD64 only, _u64.  On 32-bit
; systems the 64-bit accesses requires hand coding.
;
; Every function gets the pointer to the destination memory operand in A0,
; the source register operand in A1 and the pointer to eflags in A2.  The
; result is produced in T0 and then stored through A0.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE

;
; 16-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16                     ; must precede IEM_SAVE_FLAGS, which clobbers T0.
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;
; 32-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32                     ; must precede IEM_SAVE_FLAGS, which clobbers T0.
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;
; 64-bit (AMD64 hosts only)
;
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0                        ; must precede IEM_SAVE_FLAGS, which clobbers T0.
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1108
1109
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Every generated function takes a pointer to the destination operand in A0,
; the source operand (by value) in A1 and a pointer to eflags in A2.  The
; product of [A0] and A1 is written back to [A0].
;
; @param        1       EFLAGS that are modified.
; @param        2       Undefined EFLAGS.
; @param        3       Function suffix.
; @param        4       EFLAGS variation: 0 for native, 1 for intel, 2 for AMD.
;                       Only variation 1 takes a different path here: it saves
;                       the flags with IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF,
;                       passing the product, where 0 and 2 use IEM_SAVE_FLAGS.
;
BEGINCODE
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
  %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
  %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1165
1166
1167;
1168; XCHG for memory operands. This implies locking. No flag changes.
1169;
1170; Each function takes two arguments, first the pointer to the memory,
1171; then the pointer to the register. They all return void.
1172;
1173BEGINCODE
; 8-bit XCHG: A0 = pointer to the memory operand, A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]                      ; T0 = register value
        xchg    [A0], T0_8                      ; swap with memory; T0 = previous memory value
        mov     [A1], T0_8                      ; previous memory value goes to the register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1181
; 16-bit XCHG: A0 = pointer to the memory operand, A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]                     ; T0 = register value
        xchg    [A0], T0_16                     ; swap with memory; T0 = previous memory value
        mov     [A1], T0_16                     ; previous memory value goes to the register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1189
; 32-bit XCHG: A0 = pointer to the memory operand, A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]                     ; T0 = register value
        xchg    [A0], T0_32                     ; swap with memory; T0 = previous memory value
        mov     [A1], T0_32                     ; previous memory value goes to the register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1197
%ifdef RT_ARCH_AMD64
; 64-bit XCHG (AMD64 hosts only): A0 = pointer to the memory operand,
; A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                        ; T0 = register value
        xchg    [A0], T0                        ; swap with memory; T0 = previous memory value
        mov     [A1], T0                        ; previous memory value goes to the register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1207
1208; Unlocked variants for fDisregardLock mode.
1209
; 8-bit exchange done with plain loads and stores instead of the XCHG
; instruction.  A0 = pointer to the memory operand, A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]                      ; T0 = register value
        mov     T1_8, [A0]                      ; T1 = memory value
        mov     [A0], T0_8                      ; memory <- register value
        mov     [A1], T1_8                      ; register <- memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1218
; 16-bit exchange done with plain loads and stores instead of the XCHG
; instruction.  A0 = pointer to the memory operand, A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]                     ; T0 = register value
        mov     T1_16, [A0]                     ; T1 = memory value
        mov     [A0], T0_16                     ; memory <- register value
        mov     [A1], T1_16                     ; register <- memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1227
; 32-bit exchange done with plain loads and stores instead of the XCHG
; instruction.  A0 = pointer to the memory operand, A1 = pointer to the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]                     ; T0 = register value
        mov     T1_32, [A0]                     ; T1 = memory value
        mov     [A0], T0_32                     ; memory <- register value
        mov     [A1], T1_32                     ; register <- memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1236
%ifdef RT_ARCH_AMD64
; 64-bit exchange (AMD64 hosts only) done with plain loads and stores instead
; of the XCHG instruction.  A0 = pointer to the memory operand, A1 = pointer to
; the register.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                        ; T0 = register value
        mov     T1, [A0]                        ; T1 = memory value
        mov     [A0], T0                        ; memory <- register value
        mov     [A1], T1                        ; register <- memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1247
1248
1249;
1250; XADD for memory operands.
1251;
1252; Each function takes three arguments, first the pointer to the
1253; memory/register, then the pointer to the register, and finally a pointer to
1254; eflags. They all return void.
1255;
1256BEGINCODE
; 8-bit XADD: A0 = pointer to the memory/register destination, A1 = pointer to
; the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]                      ; T0 = register operand
        xadd    [A0], T0_8                      ; destination += T0; T0 = previous destination
        mov     [A1], T0_8                      ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1266
; 16-bit XADD: A0 = pointer to the memory/register destination, A1 = pointer to
; the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]                     ; T0 = register operand
        xadd    [A0], T0_16                     ; destination += T0; T0 = previous destination
        mov     [A1], T0_16                     ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1276
; 32-bit XADD: A0 = pointer to the memory/register destination, A1 = pointer to
; the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]                     ; T0 = register operand
        xadd    [A0], T0_32                     ; destination += T0; T0 = previous destination
        mov     [A1], T0_32                     ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1286
%ifdef RT_ARCH_AMD64
; 64-bit XADD (AMD64 hosts only): A0 = pointer to the memory/register
; destination, A1 = pointer to the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                        ; T0 = register operand
        xadd    [A0], T0                        ; destination += T0; T0 = previous destination
        mov     [A1], T0                        ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1298
; 8-bit LOCK XADD: A0 = pointer to the memory destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]                      ; T0 = register operand
        lock xadd [A0], T0_8                    ; destination += T0; T0 = previous destination
        mov     [A1], T0_8                      ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1308
; 16-bit LOCK XADD: A0 = pointer to the memory destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]                     ; T0 = register operand
        lock xadd [A0], T0_16                   ; destination += T0; T0 = previous destination
        mov     [A1], T0_16                     ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1318
; 32-bit LOCK XADD: A0 = pointer to the memory destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]                     ; T0 = register operand
        lock xadd [A0], T0_32                   ; destination += T0; T0 = previous destination
        mov     [A1], T0_32                     ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1328
%ifdef RT_ARCH_AMD64
; 64-bit LOCK XADD (AMD64 hosts only): A0 = pointer to the memory destination,
; A1 = pointer to the register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                        ; T0 = register operand
        lock xadd [A0], T0                      ; destination += T0; T0 = previous destination
        mov     [A1], T0                        ; previous destination goes to the register
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1340
1341
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Every path follows the same recipe: load EBX/ECX from pu64EbxEcx and EAX/EDX
; from pu64EaxEdx, run a lock prefixed cmpxchg8b on pu64Dst, store EAX/EDX
; back to pu64EaxEdx and save ZF through pEFlags.
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; Arguments: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        push    rbx

        ; rcx and rdx are instruction operands, so park the pointers elsewhere.
        mov     r11, rdx                        ; pu64EaxEdx (is also T1)
        mov     r10, rcx                        ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]                      ; loaded after the flags macro since it clobbers eax
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0      ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        push    rbx

        ; rcx and rdx are instruction operands, so park the pointers elsewhere.
        mov     r10, rcx                        ; pEFlags
        mov     r11, rdx                        ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]                      ; loaded after the flags macro since it clobbers eax
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0     ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; Arguments: ecx = pu64Dst, edx = pu64EaxEdx, the other two on the stack.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                        ; pu64Dst
        mov     esi, edx                        ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0]         ; pu64EbxEcx (16 = the four pushes, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4]         ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]                  ; must come last, it overwrites the pointer
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0     ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                               ; drops the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1431
; The explicitly locked flavour simply tail-jumps to iemAImpl_cmpxchg8b.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1436
1437%ifdef RT_ARCH_AMD64
1438
;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Both paths follow the same recipe: load RBX/RCX from pu128RbxRcx and RAX/RDX
; from pu128RaxRdx, run a lock prefixed cmpxchg16b on pu128Dst, store RAX/RDX
; back to pu128RaxRdx and save ZF through pEFlags.
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; Arguments: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx

        ; rcx and rdx are instruction operands, so park the pointers elsewhere.
        mov     r11, rdx                        ; pu128RaxRdx (is also T1)
        mov     r10, rcx                        ; pu128Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]                      ; loaded after the flags macro since it clobbers eax
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0      ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; Arguments: rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx

        ; rcx and rdx are instruction operands, so park the pointers elsewhere.
        mov     r10, rcx                        ; pEFlags
        mov     r11, rdx                        ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]                      ; loaded after the flags macro since it clobbers eax
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0     ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1498
; The explicitly locked flavour simply tail-jumps to iemAImpl_cmpxchg16b.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1503
1504%endif ; RT_ARCH_AMD64
1505
1506
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; Note! puEax is a pointer: the accumulator is loaded from [A1] before the
;       instruction and the (possibly updated) accumulator is written back to
;       it afterwards.
; Note! On 32-bit hosts the 64-bit variant receives uReg by pointer as well
;       and is built on cmpxchg8b, which only produces ZF; the remaining
;       flags are synthesized with compare instructions.
;
; @param        1       Instruction prefix: empty or lock.
; @param        2       Function name suffix: empty or _locked.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ; The instruction is lock prefixed here regardless of the prefix parameter.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                        ; pu64Dst
        mov     esi, edx                        ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0]         ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4]         ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]                  ; must come last, it overwrites the pointer
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=1: the operands matched and edx:eax is unchanged.
        ; ZF=0: mismatch, edx:eax now holds the memory value while [esi] still
        ;       holds the original comparand.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                        ; equal: just set the other flags (ZF stays 1).
.store:
        ; Plain moves do not touch the flags, so the compare result survives
        ; until IEM_SAVE_FLAGS.
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                               ; drops the two stack arguments

.cmpxchg8b_not_equal:
        ; Compare the original comparand against the memory value, high dword
        ; first; the low dwords only decide when the high ones are equal.
        cmp     [esi + 4], edx                  ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1604
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take two arguments: a pointer to the destination memory
; operand in A0 and a pointer to eflags in A1.  The instruction operates
; directly on [A0].
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1693
1694
1695;
1696; BSWAP. No flag changes.
1697;
1698; Each function takes one argument, pointer to the value to bswap
1699; (input/output). They all return void.
1700;
; BSWAP with a 16-bit operand size.  A0 = pointer to the value (input/output).
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]                     ; just in case any of the upper bits are used.
        ; Hand-emitted operand-size prefix, turning the following instruction
        ; into the 16-bit encoding of BSWAP.
        ; NOTE(review): presumably done to reproduce what the host CPU does
        ; for that encoding - confirm.
        db      66h
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1709
; Byte swaps the 32-bit value A0 points to, in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1717
; Byte swaps the 64-bit value A0 points to, in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
%else
        ; 32-bit host: swap the bytes of each dword, then store the two
        ; dwords in each other's place.
        mov     T0, [A0]                        ; T0 = low dword
        mov     T1, [A0 + 4]                    ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0
        mov     [A0], T1
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
1736
1737
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; The instruction wants its count in CL.  With ASM_CALL64_GCC the count is
; copied from A1 into CL; in the other configurations A0 and A1 are exchanged
; so that the count ends up in CL and the pointer in A1.  Only mov and xchg are
; used for this shuffling, neither of which modifies the flags loaded by
; IEM_MAYBE_LOAD_FLAGS.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0                          ; count -> cl, pointer -> A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                          ; count -> cl, pointer -> A1
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                          ; count -> cl, pointer -> A1
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0                          ; count -> cl, pointer -> A1
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1824
1825
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; The instruction wants its count in CL.  With ASM_CALL64_GCC, A3 and A2 are
; exchanged around the instruction (count -> cl, then restored so A3 is the
; eflags pointer again for IEM_SAVE_FLAGS).  In the other configurations A0
; and A2 are exchanged once, leaving the count in CL and the destination
; pointer in A2.  xchg between registers does not modify the flags.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                          ; count -> cl
        %1      [A0], A1_16, cl
        xchg    A3, A2                          ; restore the eflags pointer
 %else
        xchg    A0, A2                          ; count -> cl, destination pointer -> A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                          ; count -> cl
        %1      [A0], A1_32, cl
        xchg    A3, A2                          ; restore the eflags pointer
 %else
        xchg    A0, A2                          ; count -> cl, destination pointer -> A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2                          ; count -> cl
        %1      [A0], A1, cl
        xchg    A3, A2                          ; restore the eflags pointer
  %else
        xchg    A0, A2                          ; count -> cl, destination pointer -> A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1896
1897
;;
; Macro for implementing a multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer: it gets a
; pointer to AX in A0, the operand in A1 and a pointer to eflags in A2.  The
; other functions takes a pointer to rAX in A0, rDX in A1, the operand in A2
; and a pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; Outside ASM_CALL64_GCC the rDX pointer (A1) is copied to T1 before the
; instruction executes and the high half of the result is stored through T1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Name suffix.
; @param        5       EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;                       Variation 1 saves the flags with
;                       IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF, the other two
;                       with IEM_SAVE_FLAGS.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                        ; the 16-bit product
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                        ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                          ; keep the rDX pointer across the instruction
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                        ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                          ; keep the rDX pointer across the instruction
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                        ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                          ; keep the rDX pointer across the instruction
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax                        ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2015
2016
2017BEGINCODE
;;
; Worker function for negating the 64-bit number held in the T1_32:T0_32
; register pair (T0 = low half, T1 = high half).
;
; The pair is swapped with two zeroed stack slots and then subtracted from
; zero with sub/sbb, so the borrow carries from the low into the high half.
;
; @uses     None (T0,T1); the arithmetic flags are modified.
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0
        push    0
        xchg    T0_32, [xSP]                    ; T0 = 0, slot = old low half
        xchg    T1_32, [xSP + xCB]              ; T1 = 0, slot = old high half
        sub     T0_32, [xSP]
        sbb     T1_32, [xSP + xCB]
        add     xSP, xCB*2                      ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2031
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit number held in the T1:T0 register
; pair (T0 = low half, T1 = high half).
;
; The pair is swapped with two zeroed stack slots and then subtracted from
; zero with sub/sbb, so the borrow carries from the low into the high half.
;
; @uses     None (T0,T1); the arithmetic flags are modified.
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0
        push    0
        xchg    T0, [xSP]                       ; T0 = 0, slot = old low half
        xchg    T1, [xSP + xCB]                 ; T1 = 0, slot = old high half
        sub     T0, [xSP]
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2                      ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2047
2048
2049;;
2050; Macro for implementing a division operations.
2051;
2052; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2053; 32-bit system where the 64-bit accesses requires hand coding.
2054;
2055; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2056; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2057; pointer to eflags in A3.
2058;
2059; The functions all return 0 on success and -1 if a divide error should be
2060; raised by the caller.
2061;
2062; @param 1 The instruction mnemonic.
2063; @param 2 The modified flags.
2064; @param 3 The undefined flags.
2065; @param 4 1 if signed, 0 if unsigned.
2066; @param 5 Function suffix.
2067; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2068; 2 for AMD (set AF, clear PF, ZF and SF).
2069;
2070; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2071;
2072%macro IEMIMPL_DIV_OP 6
2073BEGINCODE
2074BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2075 PROLOGUE_3_ARGS
2076
2077 ; div by chainsaw check.
2078 test A1_8, A1_8
2079 jz .div_zero
2080
2081 ; Overflow check - unsigned division is simple to verify, haven't
2082 ; found a simple way to check signed division yet unfortunately.
2083 %if %4 == 0
2084 cmp [A0 + 1], A1_8
2085 jae .div_overflow
2086 %else
2087 mov T0_16, [A0] ; T0 = dividend
2088 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2089 test A1_8, A1_8
2090 js .divisor_negative
2091 test T0_16, T0_16
2092 jns .both_positive
2093 neg T0_16
2094.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2095 push T0 ; Start off like unsigned below.
2096 shr T0_16, 7
2097 cmp T0_8, A1_8
2098 pop T0
2099 jb .div_no_overflow
2100 ja .div_overflow
2101 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2102 cmp T0_8, A1_8
2103 jae .div_overflow
2104 jmp .div_no_overflow
2105
2106.divisor_negative:
2107 neg A1_8
2108 test T0_16, T0_16
2109 jns .one_of_each
2110 neg T0_16
2111.both_positive: ; Same as unsigned shifted by sign indicator bit.
2112 shr T0_16, 7
2113 cmp T0_8, A1_8
2114 jae .div_overflow
2115.div_no_overflow:
2116 mov A1, T1 ; restore divisor
2117 %endif
2118
2119 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2120 mov ax, [A0]
2121 %1 A1_8
2122 mov [A0], ax
2123 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2124 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2125 %else
2126 IEM_SAVE_FLAGS A2, %2, %3
2127 %endif
2128 xor eax, eax
2129
2130.return:
2131 EPILOGUE_3_ARGS
2132
2133.div_zero:
2134.div_overflow:
2135 mov eax, -1
2136 jmp .return
2137ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2138
;
; 16-bit variant: divides the 32-bit value *A1:*A0 (DX:AX) by A2.
;
; @returns  eax = 0 on success, eax = -1 on division by zero or when the
;           quotient would not fit (the host instruction is not executed then).
; @param    A0  Pointer to the AX value: dividend low word in, quotient out.
; @param    A1  Pointer to the DX value: dividend high word in, remainder out.
; @param    A2  The divisor.
; @param    A3  Handed to the IEM_*_FLAGS macros (presumably the guest EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check, so the host instruction below can never raise #DE.
 %if %4 == 0
        ; Unsigned: the quotient fits only if the high word is below the divisor.
        cmp     [A1], A2_16
        jae     .div_overflow
 %else
        ; Signed: compare magnitudes.  T0_32 = dividend, T1_16 = divisor.
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
        js      .div_overflow           ; Dividend was -2^31: magnitude 2^31 always overflows, and
                                        ; the 16-bit compares below would see it truncated to zero.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0                      ; pop leaves the flags of the cmp intact.
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16                   ; |divisor|, compared as unsigned from here on.
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
        js      .div_overflow           ; Dividend was -2^31, see above.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Fresh copy of the divisor (T1 may have been negated above);
                                        ; presumably A2 shares a register with dx here.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; Keep the DX pointer in T1; presumably A1 shares a register with dx.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2216
;
; 32-bit variant: divides the 64-bit value *A1:*A0 (EDX:EAX) by A2.
;
; @returns  eax = 0 on success, eax = -1 on division by zero or when the
;           quotient would not fit (the host instruction is not executed then).
; @param    A0  Pointer to the EAX value: dividend low dword in, quotient out.
; @param    A1  Pointer to the EDX value: dividend high dword in, remainder out.
; @param    A2  The divisor.
; @param    A3  Handed to the IEM_*_FLAGS macros (presumably the guest EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check, so the host instruction below can never raise #DE.
 %if %4 == 0
        ; Unsigned: the quotient fits only if the high dword is below the divisor.
        cmp     [A1], A2_32
        jae     .div_overflow
 %else
        push    A2                      ; Save A2, it gets negated below (no spare register on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Still negative means the dividend was -2^63: magnitude 2^63
        js      .div_overflow           ; always overflows, and the shl below would drop its top bit.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32            ; T1 = |dividend| >> 31
        cmp     T1_32, A2_32
        pop     T0                      ; pop leaves the flags of the cmp intact.
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32                   ; |divisor|, compared as unsigned from here on.
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Dividend was -2^63, see above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the original (signed) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor copy; presumably A2 shares a register with edx here.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; Keep the EDX pointer in T1; presumably A1 shares a register with edx.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Balance the push from the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2302
2303 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2304BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2305 PROLOGUE_4_ARGS
2306
2307 test A2, A2
2308 jz .div_zero
2309 %if %4 == 0
2310 cmp [A1], A2
2311 jae .div_overflow
2312 %else
2313 push A2 ; save A2 so we modify it (we out of regs on x86).
2314 mov T0, [A0] ; T0 = dividend low
2315 mov T1, [A1] ; T1 = dividend high
2316 test A2, A2
2317 js .divisor_negative
2318 test T1, T1
2319 jns .both_positive
2320 call NAME(iemAImpl_negate_T0_T1_u64)
2321.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2322 push T0 ; Start off like unsigned below.
2323 shl T1, 1
2324 shr T0, 63
2325 or T1, T0
2326 cmp T1, A2
2327 pop T0
2328 jb .div_no_overflow
2329 ja .div_overflow
2330 mov T1, 0x7fffffffffffffff
2331 and T0, T1 ; Special case for covering (divisor - 1).
2332 cmp T0, A2
2333 jae .div_overflow
2334 jmp .div_no_overflow
2335
2336.divisor_negative:
2337 neg A2
2338 test T1, T1
2339 jns .one_of_each
2340 call NAME(iemAImpl_negate_T0_T1_u64)
2341.both_positive: ; Same as unsigned shifted by sign indicator bit.
2342 shl T1, 1
2343 shr T0, 63
2344 or T1, T0
2345 cmp T1, A2
2346 jae .div_overflow
2347.div_no_overflow:
2348 pop A2
2349 %endif
2350
2351 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2352 mov rax, [A0]
2353 %ifdef ASM_CALL64_GCC
2354 mov T1, A2
2355 mov rax, [A0]
2356 mov rdx, [A1]
2357 %1 T1
2358 mov [A0], rax
2359 mov [A1], rdx
2360 %else
2361 mov T1, A1
2362 mov rax, [A0]
2363 mov rdx, [T1]
2364 %1 A2
2365 mov [A0], rax
2366 mov [T1], rdx
2367 %endif
2368 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2369 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2370 %else
2371 IEM_SAVE_FLAGS A3, %2, %3
2372 %endif
2373 xor eax, eax
2374
2375.return:
2376 EPILOGUE_4_ARGS_EX 12
2377
2378.div_overflow:
2379 %if %4 != 0
2380 pop A2
2381 %endif
2382.div_zero:
2383 mov eax, -1
2384 jmp .return
2385ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2386 %endif ; !RT_ARCH_AMD64
2387
2388%endmacro
2389
; Instantiations: default, _intel and _amd flavours of DIV and IDIV.
; As used by the macro body: arguments 2 and 3 go to the flag load/save macros,
; argument 4 is non-zero for the signed overflow check, argument 5 is the
; function name suffix and argument 6 equal to 2 selects IEM_ADJUST_FLAGS.
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div,  0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2396
2397
;;
; Generates iemAImpl_<instruction>: a procedure that executes one memory
; fence instruction and returns.
;
; The generated procedure takes no arguments and returns nothing.
;
; @param        1       The fence instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2416
;;
; Memory fence fallback for hosts without SSE2.
;
; Uses XCHG with a memory operand, which the CPU executes as a locked
; operation.  xAX is unchanged on return: it is swapped with its own copy
; on the stack.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Scratch stack slot holding a copy of xAX.
        xchg    xAX, [xSP]              ; Implicitly locked exchange with that slot.
        add     xSP, xCB                ; Drop the slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
2426
2427
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The current host environment is stored at [xSP], its FCW is replaced by the
; masked guest FCW, its condition code bits (C0-C3) by the guest's while the
; host TOP is kept, and the result is loaded back.
;
; @uses     T0, T1 and the FNSTENV image at [xSP] (28 bytes in the 32-bit
;           protected-mode layout); the callers in this file reserve 20h bytes.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; Keep only the host TOP.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2453
2454
;;
; Prepares the FPU for the instruction being emulated: loads the masked guest
; control word, the guest condition code bits, and marks the host top
; register as empty in the tag word when the guest's ST0 is empty.
;
; ASSUMES actual TOP=7
;
; @uses     T0, T1 and the FNSTENV image stored at [xSP].
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; Control word: exception masks, precision and rounding control from the guest.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; Status word: guest C0-C3 merged with the TOP field of the stored image.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; Tag word: only ST0 (in/out) is considered.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
2492
2493
;;
; Layout of the result buffer written by the FPU helpers below: an 80-bit
; value followed by the status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 80-bit result (5 words).
    .FSW        resw 1                  ; Status word stored by fnstsw.
endstruc
2501
2502
;;
; Layout of the result buffer for helpers producing two 80-bit values: the
; first value, the status word, then the second value.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit result (5 words).
    .FSW        resw 1                  ; Status word stored by fnstsw.
    .r80Result2 resw 5                  ; Second 80-bit result (5 words).
endstruc
2511
2512
2513;
2514;---------------------- 16-bit signed integer operations ----------------------
2515;
2516
2517
;;
; Loads a 16-bit signed integer and returns it as an 80-bit floating point
; value (fpu register) together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2541
2542
;;
; Stores an 80-bit floating point value (register) to memory as a 16-bit
; signed integer.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2566
2567
;;
; Stores an 80-bit floating point value (register) to memory as a 16-bit
; signed integer, using the truncating store (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2592
2593
;;
; Generates iemAImpl_<instruction>_r80_by_i16: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 16-bit signed integer memory operand,
; returning the 80-bit result and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2630
2631
;;
; Generates iemAImpl_<instruction>_r80_by_i16: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 16-bit signed integer memory operand,
; returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2662
2663
2664
2665;
2666;---------------------- 32-bit signed integer operations ----------------------
2667;
2668
2669
;;
; Loads a 32-bit signed integer and returns it as an 80-bit floating point
; value (fpu register) together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2693
2694
;;
; Stores an 80-bit floating point value (register) to memory as a 32-bit
; signed integer.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2718
2719
;;
; Stores an 80-bit floating point value (register) to memory as a 32-bit
; signed integer, using the truncating store (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2744
2745
;;
; Generates iemAImpl_<instruction>_r80_by_i32: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 32-bit signed integer memory operand,
; returning the 80-bit result and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2782
2783
;;
; Generates iemAImpl_<instruction>_r80_by_i32: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 32-bit signed integer memory operand,
; returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2814
2815
2816
2817;
2818;---------------------- 64-bit signed integer operations ----------------------
2819;
2820
2821
;;
; Loads a 64-bit signed integer and returns it as an 80-bit floating point
; value (fpu register) together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2845
2846
;;
; Stores an 80-bit floating point value (register) to memory as a 64-bit
; signed integer.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2870
2871
;;
; Stores an 80-bit floating point value (register) to memory as a 64-bit
; signed integer, using the truncating store (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2896
2897
2898
2899;
2900;---------------------- 32-bit floating point operations ----------------------
2901;
2902
;;
; Loads a 32-bit floating point value and returns it as an 80-bit one
; (fpu register) together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2926
2927
;;
; Stores an 80-bit floating point value (register) to memory as a 32-bit
; floating point value.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; Non-popping store; the fninit below resets the FPU.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2951
2952
;;
; Generates iemAImpl_<instruction>_r80_by_r32: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 32-bit floating point memory operand,
; returning the 80-bit result and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2989
2990
;;
; Generates iemAImpl_<instruction>_r80_by_r32: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 32-bit floating point memory operand,
; returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3021
3022
3023
3024;
3025;---------------------- 64-bit floating point operations ----------------------
3026;
3027
;;
; Loads a 64-bit floating point value and returns it as an 80-bit one
; (fpu register) together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3051
3052
;;
; Stores an 80-bit floating point value (register) to memory as a 64-bit
; floating point value.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]              ; Non-popping store; the fninit below resets the FPU.

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3076
3077
;;
; Generates iemAImpl_<instruction>_r80_by_r64: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 64-bit floating point memory operand,
; returning the 80-bit result and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3114
;;
; Generates iemAImpl_<instruction>_r80_by_r64: an FPU operation taking an
; 80-bit value (loaded into ST0) and a 64-bit floating point memory operand,
; returning only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3145
3146
3147
3148;
3149;---------------------- 80-bit floating point operations ----------------------
3150;
3151
;;
; Loads an 80-bit floating point value from memory and returns the register
; value together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3175
3176
;;
; Stores an 80-bit floating point register value to memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3200
3201
;;
; Loads an 80-bit packed BCD value from memory (fbld) and returns it as an
; 80-bit floating point register value together with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit                          ; Start from a freshly initialised FPU.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3225
3226
;;
; Stores an 80-bit floating point register value to memory as packed BCD
; (fbstp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3250
3251
;;
; Generates iemAImpl_<instruction>_r80_by_r80: an FPU operation on two 80-bit
; values, returning the 80-bit result left in ST0 and the FSW.
;
; @param    1       The instruction
; @param    2       The operand list for the instruction (may be empty).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3292
3293
;;
; Generates iemAImpl_<instruction>_r80_by_r80 for instructions that operate
; on ST1 and ST0, store the result in ST1 and pop the stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; Pushed first, so it ends up in ST1.
        fld     tword [A3]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3329
3330
;;
; Generates iemAImpl_<instruction>_r80_by_r80: an FPU operation on two 80-bit
; values (executed as "<instruction> st0, st1") that returns only the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3363
3364
;;
; Generates iemAImpl_<instruction>_r80_by_r80: an FPU comparison of two
; 80-bit values (executed as "<instruction> st1") that returns the FSW and
; the resulting EFLAGS.
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word [A1]
        pushf                           ; Fetch the flags produced by the comparison ...
        pop     xAX                     ; ... into the return register.

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3400
3401
;;
; Generates iemAImpl_<instruction>_r80: an FPU operation on a single 80-bit
; value (loaded into ST0), returning the 80-bit result and the FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exception flags before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Reset the FPU again before returning.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3438
3439
3440;;
3441; FPU instruction working on one 80-bit floating point value, only
3442; returning FSW.
3443;
3444; @param 1 The instruction
3445; @param 2 Non-zero to also restore FTW.
3446;
3447; @param A0 FPU context (fxsave).
3448; @param A1 Pointer to a uint16_t for the resulting FSW.
3449; @param A2 Pointer to the 80-bit value.
3450;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20

        ; Freshly initialised FPU with the operand in ST0.
        fninit
        fld     tword [A2]
        ; Macro parameter 2 picks the state loader: zero uses the plain
        ; FCW/FSW loader, anything else the variant that also handles FTW.
%if %2 == 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%endif
        %1

        ; The status word is the only output.
        fnstsw  word [A1]

        ; Reinitialise the FPU before returning.
        fninit
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3475
3476
3477
3478;;
3479; FPU instruction loading a 80-bit floating point constant.
3480;
3481; @param 1 The instruction
3482;
3483; @param A0 FPU context (fxsave).
3484; @param A1 Pointer to a IEMFPURESULT for the output.
3485;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        ; No input operand: the instruction itself pushes the constant.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Capture FSW, clear the exception flags, then pop the constant
        ; into the output structure.
        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
        ; The symbol is iemAImpl_<instruction> with no suffix, so there must
        ; not be a trailing %+ here (it had nothing to concatenate with).
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3512
3513
3514;;
3515; FPU instruction working on one 80-bit floating point value, outputing two.
3516;
3517; @param 1 The instruction
3518;
3519; @param A0 FPU context (fxsave).
3520; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3521; @param A2 Pointer to the 80-bit value.
3522;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 0x20

        ; Freshly initialised FPU with the single input in ST0.
        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Capture FSW, then pop the two results: the top of the stack is
        ; stored as r80Result2, the value below it as r80Result1.  The
        ; exception flags are cleared ahead of each pop.
        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        ; Reinitialise the FPU before returning.
        fninit
        add     xSP, 0x20
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3548
3549
3550
3551
3552;---------------------- SSE and MMX Operations ----------------------
3553
;; @todo what do we need to do for MMX?
; Hooks expanded at the start and end of the MMX workers; both are
; currently empty and therefore emit no code.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
; Hooks expanded at the start and end of the SSE workers; both are
; currently empty and therefore emit no code.
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
; Hooks expanded at the start and end of the AVX workers; both are
; currently empty and therefore emit no code.
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3571
3572
3573;;
3574; Media instruction working on two full sized registers.
3575;
3576; @param 1 The instruction
3577; @param 2 Whether there is an MMX variant (1) or not (0).
3578;
3579; @param A0 FPU context (fxsave).
3580; @param A1 Pointer to the first media register size operand (input/output).
3581; @param A2 Pointer to the second media register size operand (input).
3582;
%macro IEMIMPL_MEDIA_F2 2
; 64-bit (MMX) worker, only emitted when macro parameter 2 is non-zero.
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]               ; second operand (input only)
        movq    mm0, [A1]               ; first operand (input/output)
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit (SSE) worker, always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]              ; second operand (input only)
        movdqu  xmm0, [A1]              ; first operand (input/output)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,    1
IEMIMPL_MEDIA_F2 pand,      1
IEMIMPL_MEDIA_F2 pandn,     1
IEMIMPL_MEDIA_F2 por,       1
IEMIMPL_MEDIA_F2 pxor,      1
IEMIMPL_MEDIA_F2 pcmpeqb,   1
IEMIMPL_MEDIA_F2 pcmpeqw,   1
IEMIMPL_MEDIA_F2 pcmpeqd,   1
IEMIMPL_MEDIA_F2 pcmpeqq,   0
IEMIMPL_MEDIA_F2 pcmpgtb,   1
IEMIMPL_MEDIA_F2 pcmpgtw,   1
IEMIMPL_MEDIA_F2 pcmpgtd,   1
IEMIMPL_MEDIA_F2 pcmpgtq,   0
IEMIMPL_MEDIA_F2 paddb,     1
IEMIMPL_MEDIA_F2 paddw,     1
IEMIMPL_MEDIA_F2 paddd,     1
IEMIMPL_MEDIA_F2 paddq,     1
IEMIMPL_MEDIA_F2 paddsb,    1
IEMIMPL_MEDIA_F2 paddsw,    1
IEMIMPL_MEDIA_F2 paddusb,   1
IEMIMPL_MEDIA_F2 paddusw,   1
IEMIMPL_MEDIA_F2 psubb,     1
IEMIMPL_MEDIA_F2 psubw,     1
IEMIMPL_MEDIA_F2 psubd,     1
IEMIMPL_MEDIA_F2 psubq,     1
IEMIMPL_MEDIA_F2 psubsb,    1
IEMIMPL_MEDIA_F2 psubsw,    1
IEMIMPL_MEDIA_F2 psubusb,   1
IEMIMPL_MEDIA_F2 psubusw,   1
IEMIMPL_MEDIA_F2 pmullw,    1
IEMIMPL_MEDIA_F2 pmulld,    0
IEMIMPL_MEDIA_F2 pmulhw,    1
IEMIMPL_MEDIA_F2 pmaddwd,   1
IEMIMPL_MEDIA_F2 pminub,    1
IEMIMPL_MEDIA_F2 pminuw,    0
IEMIMPL_MEDIA_F2 pminud,    0
IEMIMPL_MEDIA_F2 pminsb,    0
IEMIMPL_MEDIA_F2 pminsw,    1
IEMIMPL_MEDIA_F2 pminsd,    0
IEMIMPL_MEDIA_F2 pmaxub,    1
IEMIMPL_MEDIA_F2 pmaxuw,    0
IEMIMPL_MEDIA_F2 pmaxud,    0
IEMIMPL_MEDIA_F2 pmaxsb,    0
IEMIMPL_MEDIA_F2 pmaxsw,    1
IEMIMPL_MEDIA_F2 pmaxsd,    0
IEMIMPL_MEDIA_F2 pabsb,     1
IEMIMPL_MEDIA_F2 pabsw,     1
IEMIMPL_MEDIA_F2 pabsd,     1
IEMIMPL_MEDIA_F2 psignb,    1
IEMIMPL_MEDIA_F2 psignw,    1
IEMIMPL_MEDIA_F2 psignd,    1
IEMIMPL_MEDIA_F2 phaddw,    1
IEMIMPL_MEDIA_F2 phaddd,    1
IEMIMPL_MEDIA_F2 phsubw,    1
IEMIMPL_MEDIA_F2 phsubd,    1
IEMIMPL_MEDIA_F2 phaddsw,   1
IEMIMPL_MEDIA_F2 phsubsw,   1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw,  1
IEMIMPL_MEDIA_F2 pmuludq,   1
3673
3674
3675;;
3676; Media instruction working on two full sized registers, but no FXSAVE state argument.
3677;
3678; @param 1 The instruction
3679; @param 2 Whether there is an MMX variant (1) or not (0).
3680;
3681; @param A0 Pointer to the first media register size operand (input/output).
3682; @param A1 Pointer to the second media register size operand (input).
3683;
%macro IEMIMPL_MEDIA_OPT_F2 2
; 64-bit (MMX) worker, only emitted when macro parameter 2 is non-zero.
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; second operand (input only)
        movq    mm0, [A0]               ; first operand (input/output)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit (SSE) worker, always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second operand (input only)
        movdqu  xmm0, [A0]              ; first operand (input/output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb,   1
IEMIMPL_MEDIA_OPT_F2 packssdw,   1
IEMIMPL_MEDIA_OPT_F2 packuswb,   1
IEMIMPL_MEDIA_OPT_F2 packusdw,   0
IEMIMPL_MEDIA_OPT_F2 psllw,      1
IEMIMPL_MEDIA_OPT_F2 pslld,      1
IEMIMPL_MEDIA_OPT_F2 psllq,      1
IEMIMPL_MEDIA_OPT_F2 psrlw,      1
IEMIMPL_MEDIA_OPT_F2 psrld,      1
IEMIMPL_MEDIA_OPT_F2 psrlq,      1
IEMIMPL_MEDIA_OPT_F2 psraw,      1
IEMIMPL_MEDIA_OPT_F2 psrad,      1
IEMIMPL_MEDIA_OPT_F2 pmulhuw,    1
IEMIMPL_MEDIA_OPT_F2 pavgb,      1
IEMIMPL_MEDIA_OPT_F2 pavgw,      1
IEMIMPL_MEDIA_OPT_F2 psadbw,     1
IEMIMPL_MEDIA_OPT_F2 pmuldq,     0
IEMIMPL_MEDIA_OPT_F2 unpcklps,   0
IEMIMPL_MEDIA_OPT_F2 unpcklpd,   0
IEMIMPL_MEDIA_OPT_F2 unpckhps,   0
IEMIMPL_MEDIA_OPT_F2 unpckhpd,   0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc,     0
IEMIMPL_MEDIA_OPT_F2 aesenc,     0
IEMIMPL_MEDIA_OPT_F2 aesdec,     0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3741
3742;;
3743; Media instruction working on one full sized and one half sized register (lower half).
3744;
3745; @param 1 The instruction
3746; @param 2 1 if MMX is included, 0 if not.
3747;
3748; @param A0 Pointer to the first full sized media register operand (input/output).
3749; @param A1 Pointer to the second half sized media register operand (input).
3750;
%macro IEMIMPL_MEDIA_F1L1 2
; 64-bit (MMX) worker, only emitted when macro parameter 2 is non-zero.
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; loads a full 64 bits from the second operand
        movq    mm0, [A0]               ; first operand (input/output)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit (SSE) worker, always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; loads a full 128 bits from the second operand
        movdqu  xmm0, [A0]              ; first operand (input/output)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3785
3786
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker for the instruction.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3831
3832
3833;;
3834; Media instruction working on one full sized and one half sized register (high half).
3835;
3836; @param 1 The instruction
3837; @param 2 1 if MMX is included, 0 if not.
3838;
3839; @param A0 Pointer to the first full sized media register operand (input/output).
3840; @param A1 Pointer to the second full sized media register operand, where we
3841; will only use the upper half as input - but we'll load it in full.
3842;
; The high-half workers have exactly the same shape as the low-half ones
; (both load the operands in full), so this simply forwards to
; IEMIMPL_MEDIA_F1L1.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper so that it is actually used and
; the punpckh* workers follow it should it ever diverge from the low-half
; template.  The generated code is unchanged.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3851
3852
3853;;
3854; Media instruction working two half sized input registers (high half) and a full sized
3855; destination register (vpunpckh*).
3856;
3857; @param 1 The instruction
3858;
3859; @param A0 Pointer to the destination register (full sized, output only).
3860; @param A1 Pointer to the first full sized media source register operand, where we
3861; will only use the upper half as input - but we'll be loading it in full.
3862; @param A2 Pointer to the second full sized media source register operand, where we
3863; will only use the upper half as input - but we'll be loading it in full.
3864;
%macro IEMIMPL_MEDIA_F1H1H1 1
        ; Same worker shape as the low-half template; forward to it.
        IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3873
3874
3875;
3876; Shufflers with evil 8-bit immediates.
3877;
3878
;;
; PSHUFW worker.  The immediate cannot be supplied at run time, so one
; 'pshufw mm0, mm1, imm8' + 'ret' pair is generated for each of the 256
; possible values and the matching entry is called.
;
; @param    A0      Pointer to the destination operand (output).
; @param    A1      Pointer to the source operand (input).
; @param    A2      The 8-bit immediate selecting the shuffle.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A2, 0xff                ; only the low 8 bits are the immediate; anything above
                                        ; them would index past the end of the table.
        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (was 'movq mm0, mm0', a no-op)
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3904
3905
;;
; SSE shuffle worker with an 8-bit immediate.  One '<instr> xmm0, xmm1, imm8'
; + 'ret' pair is generated for each of the 256 immediates and the matching
; entry is called.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination operand (output).
; @param    A1      Pointer to the source operand (input).
; @param    A2      The 8-bit immediate selecting the shuffle.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                ; only the low 8 bits are the immediate; anything above
                                        ; them would index past the end of the table.
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3937
3938
;;
; AVX 256-bit shuffle worker with an 8-bit immediate.  One
; '<instr> ymm0, ymm1, imm8' + 'ret' pair is generated for each of the 256
; immediates and the matching entry is called.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination operand (output).
; @param    A1      Pointer to the source operand (input).
; @param    A2      The 8-bit immediate selecting the shuffle.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; this is an AVX worker; it used the SSE hook before

        and     A2, 0xff                ; only the low 8 bits are the immediate; anything above
                                        ; them would index past the end of the table.
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; ditto, was the SSE hook
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3970
3971
3972;
3973; Shifts with evil 8-bit immediates.
3974;
3975
;;
; MMX shift-by-immediate worker.  One '<instr> mm0, imm8' + 'ret' pair is
; generated for each of the 256 immediates and the matching entry is called.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the operand being shifted (input/output).
; @param    A1      The 8-bit immediate shift count.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A1, 0xff                ; only the low 8 bits are the immediate; anything above
                                        ; them would index past the end of the table.
        movq    mm0, [A0]
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4011
4012
;;
; SSE shift-by-immediate worker.  One '<instr> xmm0, imm8' + 'ret' pair is
; generated for each of the 256 immediates and the matching entry is called.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the operand being shifted (input/output).
; @param    A1      The 8-bit immediate shift count.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A1, 0xff                ; only the low 8 bits are the immediate; anything above
                                        ; them would index past the end of the table.
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4050
4051
4052;
4053; Move byte mask.
4054;
4055
; PMOVMSKB, MMX source: A0 points to the 64-bit destination, A1 to the source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; T0 is only 32 bits wide here, zero the upper half
%endif
        mov      [A0], T0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4069
; PMOVMSKB, SSE source: A0 points to the 64-bit destination, A1 to the source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; T0 is only 32 bits wide here, zero the upper half
%endif
        mov      [A0], T0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4083
; VPMOVMSKB, 256-bit source: A0 points to the 64-bit destination, A1 to the source.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1
%ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0     ; T0 is only 32 bits wide here, zero the upper half
%endif
        mov       [A0], T0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4097
4098
4099;;
4100; Media instruction working on two full sized source registers and one destination (AVX).
4101;
4102; @param 1 The instruction
4103;
4104; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4105; @param A1 Pointer to the destination media register size operand (output).
4106; @param A2 Pointer to the first source media register size operand (input).
4107; @param A3 Pointer to the second source media register size operand (input).
4108;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker for the instruction.
; A0 is not referenced by the generated code; A2 and A3 are loaded, combined
; and the result is stored to A1.
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4170
4171
4172;;
4173; Media instruction working on two full sized source registers and one destination (AVX),
4174; but no XSAVE state pointer argument.
4175;
4176; @param 1 The instruction
4177;
4178; @param A0 Pointer to the destination media register size operand (output).
4179; @param A1 Pointer to the first source media register size operand (input).
4180; @param A2 Pointer to the second source media register size operand (input).
4181;
; Emits a 128-bit (_u128) and a 256-bit (_u256) worker for the instruction.
; A1 and A2 are loaded, combined and the result is stored to A0.
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4238
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
; NOTE(review): the argument byte count given to BEGINPROC_FASTCALL is 12 although
;               only two arguments are set up; the other two-argument workers in
;               this file pass 8.  Left as-is, please confirm.
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4281
4282
4283;
4284; The SSE 4.2 crc32
4285;
; @param    A0      Pointer to the 32-bit destination (read as the input CRC and updated).
; @param    A1      The source operand, sized according to the suffix.
4288;
; CRC32 with an 8-bit source: *A0 = crc32(*A0, low byte of A1).
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current CRC value
        crc32   T0_32, A1_8
        mov     dword [A0], T0_32       ; updated CRC value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4298
; CRC32 with a 16-bit source: *A0 = crc32(*A0, low word of A1).
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current CRC value
        crc32   T0_32, A1_16
        mov     dword [A0], T0_32       ; updated CRC value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4308
; CRC32 with a 32-bit source: *A0 = crc32(*A0, low dword of A1).
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current CRC value
        crc32   T0_32, A1_32
        mov     dword [A0], T0_32       ; updated CRC value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4318
%ifdef RT_ARCH_AMD64
; CRC32 with a 64-bit source (AMD64 builds only): *A0 = crc32(*A0, A1).
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, dword [A0]       ; current CRC value; the 32-bit load also clears the top of T0
        crc32   T0, A1
        mov     dword [A0], T0_32       ; only the low 32 bits are written back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4330
4331
4332;
4333; PTEST (SSE 4.1)
4334;
4335; @param A0 Pointer to the first source operand (aka readonly destination).
4336; @param A1 Pointer to the second source operand.
4337; @param A2 Pointer to the EFLAGS register.
4338;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Both operands are inputs only; PTEST just sets flags.
        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        ptest   xmm0, xmm1
        ; Hand the resulting status flags to the EFLAGS variable at A2
        ; (exact merging is up to IEM_SAVE_FLAGS).
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4351
; VPTEST, 256-bit: same contract as iemAImpl_ptest_u128 (A0/A1 point to the
; two source operands, A2 to the EFLAGS variable), using the VEX encoding.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; this is an AVX worker; it used the SSE hook before

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        ; Hand the resulting status flags to the EFLAGS variable at A2
        ; (exact merging is up to IEM_SAVE_FLAGS).
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE            ; ditto, was the SSE hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4364
4365
;;
; Template for the [v]pmov{s,z}x* instructions
;
; Emits three workers per instruction: the legacy SSE 128-bit one, the AVX
; 128-bit one and the AVX 256-bit one.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      The source operand value (input) for the two 128-bit workers.
;                   The 256-bit worker instead dereferences A1 as a pointer and
;                   loads 128 bits of source data from it.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; legacy encoding: this worker must not need AVX
                                        ; (it used the VEX-encoded vmovdqu before).

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; A1 is a pointer in this variant
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (both are empty today)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4425
4426
4427;;
4428; Need to move this as well somewhere better?
4429;
struc IEMSSERESULT
    .uResult:   resd 4                  ; 4 dwords (128 bits) of result data
    .MXCSR:     resd 1                  ; MXCSR value handed back to the caller
endstruc
4434
4435
4436;;
4437; Need to move this as well somewhere better?
4438;
struc IEMAVX128RESULT
    .uResult:   resd 4                  ; 4 dwords (128 bits) of result data
    .MXCSR:     resd 1                  ; MXCSR value handed back to the caller
endstruc
4443
4444
4445;;
4446; Need to move this as well somewhere better?
4447;
struc IEMAVX256RESULT
    .uResult:   resd 8                  ; 8 dwords (256 bits) of result data
    .MXCSR:     resd 1                  ; MXCSR value handed back to the caller
endstruc
4452
4453
4454;;
4455; Initialize the SSE MXCSR register using the guest value partially to
4456; account for rounding mode.
4457;
4458; @uses 4 bytes of stack to save the original value, T0.
4459; @param 1 Expression giving the address of the FXSTATE of the guest.
4460;
%macro SSE_LD_FXSTATE_MXCSR 1
        ; Park the current MXCSR in a stack slot; it stays there (at [xSP])
        ; after this macro until SSE_ST_FXSTATE_MXCSR reloads and pops it.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Value to load: FZ, RC and DAZ from the guest, all exceptions masked,
        ; everything else (including the exception flags) zero.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        or      T0_32, X86_MXCSR_XCPT_MASK
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ | X86_MXCSR_XCPT_MASK

        ; ldmxcsr needs a memory operand, so bounce the value via a scratch slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4473
4474
4475;;
4476; Restores the SSE MXCSR register with the original value.
4477;
4478; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4479; @param 1 Expression giving the address where to return the MXCSR value.
4480; @param 2 Expression giving the address of the FXSTATE of the guest.
4481;
4482; @note Restores the stack pointer.
4483;
%macro SSE_ST_FXSTATE_MXCSR 2
        ; Fetch the current MXCSR through a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Result = MXCSR value at %2 with the freshly raised exception flags
        ; OR'ed in.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ; Reload the MXCSR parked by SSE_LD_FXSTATE_MXCSR and release its slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4498
4499
;;
; Initialize the MXCSR register for an AVX worker using the guest value
; partially to account for rounding mode.
;
; FZ, RC and DAZ are taken from the guest; all exceptions are masked so the
; emulated instruction cannot raise a SIMD floating-point exception in this
; code (same as SSE_LD_FXSTATE_MXCSR does; the mask bits were left clear here
; before, i.e. every exception was unmasked).
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the guest state; MXCSR is
;                   read at offset X86FXSTATE.MXCSR from it.
;
; @note     Leaves the saved MXCSR at [xSP]; AVX128_ST_XSAVEAREA_MXCSR or
;           AVX256_ST_XSAVEAREA_MXCSR must be used to restore it.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro


;;
; Restores the AVX128 MXCSR register with the original value.
;
; The value returned consists of the FZ, RC and DAZ bits that were loaded by
; AVX_LD_XSAVEAREA_MXCSR plus the exception flags raised since; the exception
; mask bits are stripped again so the layout of the returned value is the same
; as before they were forced on.
;
; @param    1       Expression giving the address where to return the MXCSR value.
;
; @note     Restores the stack pointer.
;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
        and     dword [%1 + IEMAVX128RESULT.MXCSR], X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ | X86_MXCSR_XCPT_FLAGS

        ldmxcsr [xSP]
        add     xSP, 4
%endmacro


;;
; Restores the AVX256 MXCSR register with the original value.
;
; The value returned consists of the FZ, RC and DAZ bits that were loaded by
; AVX_LD_XSAVEAREA_MXCSR plus the exception flags raised since; the exception
; mask bits are stripped again so the layout of the returned value is the same
; as before they were forced on.
;
; @param    1       Expression giving the address where to return the MXCSR value.
;
; @note     Restores the stack pointer.
;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
        and     dword [%1 + IEMAVX256RESULT.MXCSR], X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ | X86_MXCSR_XCPT_FLAGS

        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4548
4549
4550;;
4551; Floating point instruction working on two full sized registers.
4552;
4553; @param 1 The instruction
4554; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4555;
4556; @param A0 FPU context (FXSTATE or XSAVEAREA).
4557; @param A1 Where to return the result including the MXCSR value.
4558; @param A2 Pointer to the first media register size operand (input/output).
4559; @param A3 Pointer to the second media register size operand (input).
4560;
4561%macro IEMIMPL_FP_F2 2
4562BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4563 PROLOGUE_4_ARGS
4564 IEMIMPL_SSE_PROLOGUE
4565 SSE_LD_FXSTATE_MXCSR A0
4566
4567 movdqu xmm0, [A2]
4568 movdqu xmm1, [A3]
4569 %1 xmm0, xmm1
4570 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4571
4572 SSE_ST_FXSTATE_MXCSR A1, A0
4573 IEMIMPL_SSE_PROLOGUE
4574 EPILOGUE_4_ARGS
4575ENDPROC iemAImpl_ %+ %1 %+ _u128
4576
4577 %if %2 == 3
4578BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4579 PROLOGUE_4_ARGS
4580 IEMIMPL_AVX_PROLOGUE
4581 AVX_LD_XSAVEAREA_MXCSR A0
4582
4583 vmovdqu xmm0, [A2]
4584 vmovdqu xmm1, [A3]
4585 v %+ %1 xmm0, xmm0, xmm1
4586 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4587
4588 AVX128_ST_XSAVEAREA_MXCSR A1
4589 IEMIMPL_AVX_PROLOGUE
4590 EPILOGUE_4_ARGS
4591ENDPROC iemAImpl_v %+ %1 %+ _u128
4592
4593BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4594 PROLOGUE_4_ARGS
4595 IEMIMPL_AVX_PROLOGUE
4596 AVX_LD_XSAVEAREA_MXCSR A0
4597
4598 vmovdqu ymm0, [A2]
4599 vmovdqu ymm1, [A3]
4600 v %+ %1 ymm0, ymm0, ymm1
4601 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4602
4603 AVX256_ST_XSAVEAREA_MXCSR A1
4604 IEMIMPL_AVX_PROLOGUE
4605 EPILOGUE_4_ARGS
4606ENDPROC iemAImpl_v %+ %1 %+ _u256
4607 %elif %2 == 2
4608BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4609 PROLOGUE_4_ARGS
4610 IEMIMPL_AVX_PROLOGUE
4611 AVX_LD_XSAVEAREA_MXCSR A0
4612
4613 vmovdqu xmm0, [A2]
4614 vmovdqu xmm1, [A3]
4615 v %+ %1 xmm0, xmm1
4616 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4617
4618 AVX128_ST_XSAVEAREA_MXCSR A1
4619 IEMIMPL_AVX_PROLOGUE
4620 EPILOGUE_4_ARGS
4621ENDPROC iemAImpl_v %+ %1 %+ _u128
4622
4623BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4624 PROLOGUE_4_ARGS
4625 IEMIMPL_AVX_PROLOGUE
4626 AVX_LD_XSAVEAREA_MXCSR A0
4627
4628 vmovdqu ymm0, [A2]
4629 vmovdqu ymm1, [A3]
4630 v %+ %1 ymm0, ymm1
4631 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4632
4633 AVX256_ST_XSAVEAREA_MXCSR A1
4634 IEMIMPL_AVX_PROLOGUE
4635 EPILOGUE_4_ARGS
4636ENDPROC iemAImpl_v %+ %1 %+ _u256
4637 %endif
4638%endmacro
4639
; Packed floating point binary operations; the 3 selects the
; three-operand AVX form for the generated v-prefixed workers.
IEMIMPL_FP_F2 addps,    3
IEMIMPL_FP_F2 addpd,    3
IEMIMPL_FP_F2 mulps,    3
IEMIMPL_FP_F2 mulpd,    3
IEMIMPL_FP_F2 subps,    3
IEMIMPL_FP_F2 subpd,    3
IEMIMPL_FP_F2 minps,    3
IEMIMPL_FP_F2 minpd,    3
IEMIMPL_FP_F2 divps,    3
IEMIMPL_FP_F2 divpd,    3
IEMIMPL_FP_F2 maxps,    3
IEMIMPL_FP_F2 maxpd,    3
IEMIMPL_FP_F2 haddps,   3
IEMIMPL_FP_F2 haddpd,   3
IEMIMPL_FP_F2 hsubps,   3
IEMIMPL_FP_F2 hsubpd,   3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3
4658
4659
;;
; The following are really unary operations.  For simplicity they go
; through the binary template anyway, which keeps the returned result in
; step with the register the result may eventually be written to.
;
; A second argument of 2 selects the two-operand AVX form; 0 generates
; no AVX workers at all.
IEMIMPL_FP_F2 sqrtps,    2
IEMIMPL_FP_F2 rsqrtps,   2
IEMIMPL_FP_F2 sqrtpd,    2
IEMIMPL_FP_F2 cvtdq2ps,  2
IEMIMPL_FP_F2 cvtps2dq,  2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd,  0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq,  0 ; @todo AVX variants due to register size differences missing right now
4674
4675
;;
; Floating point instruction working on a full sized register and a single
; precision operand.  Generates the SSE worker iemAImpl_<insn>_u128_r32 and
; the AVX worker iemAImpl_v<insn>_u128_r32.
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]                      ; only 32 bits are read from the second operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]                      ; only 32 bits are read from the second operand
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE                    ; matches the _r64 template (was a second IEMIMPL_AVX_PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
4717
; Scalar single precision workers (SSE and AVX flavour each).
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4727
4728
;;
; Template for a floating point instruction that takes a full media
; register as first operand and a double precision scalar as second.
; Emits iemAImpl_<insn>_u128_r64 (SSE) and iemAImpl_v<insn>_u128_r64 (AVX).
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        ; Fetch the scalar first, then the full width operand.
        movq    xmm1, [A3]
        movdqu  xmm0, [A2]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        ; Fetch the scalar first, then the full width operand.
        vmovq   xmm1, [A3]
        vmovdqu xmm0, [A2]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro
4770
; Scalar double precision workers (SSE and AVX flavour each).
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4779
4780
;;
; Template for the cvtpd2ps/cvtps2pd instructions.  Emits an SSE 128-bit
; worker plus AVX 128-bit and 256-bit workers.
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input/output).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm1, [A3]
        movdqu  xmm0, [A2]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm1, [A3]
        vmovdqu xmm0, [A2]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm1, [A3]
        vmovdqu ymm0, [A2]
 %if %2 == 0
        ; Narrowing: 256-bit source, 128-bit result.
        v %+ %1 xmm0, ymm1
 %else
        ; Widening: 128-bit source, 256-bit result.
        v %+ %1 ymm0, xmm1
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
4842
IEMIMPL_CVT_F2 cvtpd2ps, 0      ; 256-bit form yields a 128-bit result
IEMIMPL_CVT_F2 cvtps2pd, 1      ; 256-bit form yields a 256-bit result
4845
4846
;;
; shufps with an 8-bit immediate, dispatched through a table of 256
; six-byte stubs (one per immediate value).
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 6, built as (A2 * 3) * 2.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A2 + A2*2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: shufps + ret + int3 == 6 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4880
4881
;;
; shufpd with an 8-bit immediate, dispatched through a table of 256
; six-byte stubs (one per immediate value).
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 6, built as (A2 * 3) * 2.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A2 + A2*2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: shufpd + ret == 6 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4914
4915
;;
; vshufp{s,d} instructions with 8-bit immediates.  Each worker dispatches
; into a table of 256 six-byte stubs, one per immediate value.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded moves, like the _u256 worker below, so this AVX helper
        ; does not mix in legacy SSE encodings.
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        ; NOTE(review): relies on A3 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        ; NOTE(review): relies on A3 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
4979
; Single and double precision shuffle workers (128 and 256 bit each).
IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
4982
4983
;;
; One of the [p]blendv{b,ps,pd} variants
;
; The instruction takes its mask implicitly in xmm0, so the mask is
; loaded there and the two explicit operands go to xmm1 and xmm2.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; closes the SSE section (was a second IEMIMPL_SSE_PROLOGUE)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5008
; Variable blend workers using the implicit xmm0 mask.
IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5012
5013
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits a 128-bit and a 256-bit worker; the mask is an explicit fourth
; register operand here.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; closes the AVX section (was a second IEMIMPL_AVX_PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; closes the AVX section (was a second IEMIMPL_AVX_PROLOGUE)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5052
; AVX variable blend workers (128 and 256 bit each).
IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5056
5057
;;
; palignr mm1, mm2/m64 instruction, dispatched through a table of 256
; six-byte stubs (one per immediate value).
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      The second register sized operand (input).
; @param    A2      The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        ; Stub address = .imm0 + A2 * 6, built as (A2 * 3) * 2.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A2 + A2*2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movq    mm1, A1                 ; second operand is passed by value
        movq    mm0, [A0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: palignr + ret == 6 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5089
5090
;;
; Template for SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.  With the trailing
; ret and int3 each dispatch stub is 8 bytes.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 8, built as (A2 * 4) * 2.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A2 + A2*3]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5130
; SSE workers taking an 8-bit immediate (6 byte instruction encodings).
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5137
5138
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.  With the trailing
; ret and int3 each dispatch stub is 8 bytes.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded moves, like the _u256 worker below, so this AVX helper
        ; does not mix in legacy SSE encodings.
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        ; NOTE(review): relies on A3 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        ; NOTE(review): relies on A3 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
5209
; AVX workers taking an 8-bit immediate; the flag says whether a 256-bit
; worker is generated as well.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr,   1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5215
5216
;;
; Pair of 128-bit source operands handed to the pcmpistri worker.
; @todo Need to move this as well somewhere better?
;
struc IEMPCMPISTRISRC
    .uSrc1        resd 4                ; first source operand, 4 dwords
    .uSrc2        resd 4                ; second source operand, 4 dwords
endstruc
5224
;;
; The pcmpistri instruction, dispatched through a table of 256 eight-byte
; stubs (one per immediate value).  The index result is returned through
; A0 and the status flags through A1.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A3 * 8, built as (A3 * 4) * 2.
        ; NOTE(review): relies on A3 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A3 + A3*3]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        movdqu  xmm1, [A2 + IEMPCMPISTRISRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPISTRISRC.uSrc1]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; the instruction leaves its result in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table: pcmpistri + ret + int3 == 8 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
        dw 0xf7ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x107ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5262
5263
;;
; pinsrw instruction, MMX register form, dispatched through a table of
; 256 five-byte stubs (one per immediate value).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 5.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]
        lea     T1, [T1 + T0]

        movq    mm0, [A0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: pinsrw + ret == 5 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
        dw 0xfaff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x104ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5295
; pinsrw, XMM register form.  Same argument layout as iemAImpl_pinsrw_u64:
; A0 points at the destination register image (input/output), A1 carries
; the value to insert and A2 the 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 6, built as (A2 * 3) * 2.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A2 + A2*2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: pinsrw + ret == 6 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5320
;;
; vpinsrw instruction, dispatched through a table of 256 six-byte stubs
; (one per immediate value).
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 16 bit input operand (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX instruction, so use the AVX bracket like the other v* helpers

        vmovdqu xmm0, [A1]              ; VEX encoded move to match the VEX encoded stubs
        ; NOTE(review): relies on A3 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5354
5355
;;
; pextrw instruction, MMX register form, dispatched through a table of
; 256 five-byte stubs (one per immediate value).
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The 64-bit source operand, passed by value (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 5.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]
        lea     T1, [T1 + T0]

        movq    mm0, A1
        call    T1                      ; the stub leaves the extracted word in T0
        mov     [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: pextrw + ret == 5 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
        dw 0xfaff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x104ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5387
; pextrw, XMM register form.  A0 points at the 16-bit output, A1 points at
; the 128-bit source operand and A2 carries the 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + A2 * 6, built as (A2 * 3) * 2.
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T0, [A2 + A2*2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A1]
        call    T1                      ; the stub leaves the extracted word in T0
        mov     [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table: pextrw + ret == 6 bytes per entry.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5412
;;
; vpextrw instruction, dispatched through a table of 256 six-byte stubs
; (one per immediate value).
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX instruction, so use the AVX bracket like the other v* helpers

        vmovdqu xmm0, [A1]              ; VEX encoded move to match the VEX encoded stubs
        ; NOTE(review): relies on A2 holding nothing but the immediate (0..255) - confirm with callers.
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1                      ; the stub leaves the extracted word in T0
        mov     word [A0], T0_16

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5444
5445
;;
; movmskp{s,d} SSE instruction template
;
; Emits the SSE 128-bit worker and the AVX 128-bit and 256-bit workers.
; Each one stores the low byte of the mask result through A0.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; VEX encoded move, as in the _u256 worker
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro
5492
; Sign mask extraction workers: SSE name first, AVX name second.
IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5495
5496
;;
; Returns the merged MXCSR value to the caller and reloads MXCSR from the
; dword at the top of the stack.
;
; The exception flag bits of the current MXCSR are combined with the MXCSR
; field of the guest FXSTATE and stored at the first argument.  The dword
; popped at the end must already be on the stack on entry (presumably
; pushed by SSE_LD_FXSTATE_MXCSR - confirm).
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; Read the current MXCSR through a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and merge them into the guest value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload MXCSR from the saved dword and release its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5520
5521
;;
; cvttsd2si instruction - 32-bit variant (double precision source,
; truncating conversion, 32-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, qword [A3]
        mov     [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5542
;;
; cvttsd2si instruction - 64-bit variant (double precision source,
; truncating conversion, 64-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, qword [A3]
        mov     [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5563
5564
;;
; cvtsd2si instruction - 32-bit variant (double precision source,
; 32-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, qword [A3]
        mov     [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5585
;;
; cvtsd2si instruction - 64-bit variant (double precision source,
; 64-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, qword [A3]
        mov     [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5606
5607
;;
; cvttss2si instruction - 32-bit variant (single precision source,
; truncating conversion, 32-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0_32, dword [A3]
        mov     [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
5628
;;
; cvttss2si instruction - 64-bit variant (single precision source,
; truncating conversion, 64-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, dword [A3]
        mov     [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5649
5650
;;
; cvtss2si instruction - 32-bit variant (single precision source,
; 32-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0_32, dword [A3]
        mov     [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
5671
;;
; cvtss2si instruction - 64-bit variant (single precision source,
; 64-bit integer result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, dword [A3]
        mov     [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5692
5693
;;
; cvtsi2ss instruction - 32-bit variant (32-bit integer source, single
; precision result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, dword [A3]
        movd    [A2], xmm0              ; only the low dword is written back

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
5714
;;
; cvtsi2ss instruction - 64-bit variant (64-bit integer source, single
; precision result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]
        movd    [A2], xmm0              ; only the low dword is written back

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5735
5736
;;
; cvtsi2sd instruction - 32-bit variant (32-bit integer source, double
; precision result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0              ; only the low qword is written back

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
5757
;;
; cvtsi2sd instruction - 64-bit variant (64-bit integer source, double
; precision result).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0              ; only the low qword is written back

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5778
5779
;;
; Loads the host MXCSR register with a value derived from the guest MXCSR.
;
; Only the FZ, RC and DAZ fields are taken from the guest value; the exception
; mask bits (X86_MXCSR_XCPT_MASK) are always set in the value that is loaded.
;
; The previous host MXCSR value is saved in a 4 byte slot on the stack and is
; left there, i.e. xSP is 4 bytes lower after this macro than before it.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        ; Save the current host MXCSR; this stack slot stays allocated.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Build the working MXCSR value from the guest one.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr only takes a memory operand, so go via a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5799
5800
;;
; Returns the exception flags raised since the MXCSR was loaded and restores
; the SSE MXCSR register with the original value.
;
; The current MXCSR exception flags (X86_MXCSR_XCPT_FLAGS) are OR'ed into the
; value at the given address.  The MXCSR is then reloaded from the 4 byte slot
; on top of the stack and that slot is released.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        ; Fetch the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload the MXCSR value saved on the stack and drop its slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5823
5824
;;
; UCOMISS (SSE)
;
; Compares the two source operands with ucomiss and saves the resulting
; status flags to the EFLAGS value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
5847
;;
; VUCOMISS (AVX) - same parameters as the UCOMISS worker:
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
5862
5863
;;
; UCOMISD (SSE)
;
; Compares the two source operands with ucomisd and saves the resulting
; status flags to the EFLAGS value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
5886
;;
; VUCOMISD (AVX) - same parameters as the UCOMISD worker:
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
5901
;;
; COMISS (SSE)
;
; Compares the two source operands with comiss and saves the resulting
; status flags to the EFLAGS value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
5924
;;
; VCOMISS (AVX) - same parameters as the COMISS worker:
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
5939
5940
;;
; COMISD (SSE)
;
; Compares the two source operands with comisd and saves the resulting
; status flags to the EFLAGS value.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
5963
;;
; VCOMISD (AVX) - same parameters as the COMISD worker:
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]                      ; First source operand.
        movdqu  xmm1, [A3]                      ; Second source operand.
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; Must directly follow the compare.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
5978
5979
;;
; Pair of 128-bit (4 dwords each) source operands, passed by pointer to the
; workers taking two media register sized inputs.
;
; @todo Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1      resd 4                  ; First source operand.
    .uSrc2      resd 4                  ; Second source operand.
endstruc
5987
5988
;;
; CMPPS (SSE)
;
; The immediate has to be encoded in the instruction itself, so the worker is
; followed by a table of 256 'cmpps xmm0, xmm1, imm8 + ret' stubs, one for each
; immediate value, and the stub matching A3 is called.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        ; Only the low 8 bits of A3 carry the immediate.  Mask off the rest so
        ; stale upper register bits cannot index past the 256 entry stub table.
        and     A3, 0xff
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6024
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate has to be encoded in the instruction itself, so each worker is
; followed by a table of 256 'insn xmm0, xmm1, imm8 + ret' stubs (6 bytes each),
; and the stub matching A3 is called.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        ; Only the low 8 bits of A3 carry the immediate.  Mask off the rest so
        ; stale upper register bits cannot index past the 256 entry stub table.
        and     A3, 0xff
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(insn+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6071
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The immediate has to be encoded in the instruction itself, so each worker is
; followed by a table of 256 'insn xmm0, xmm1, imm8 + ret' stubs (7 bytes each),
; and the stub matching A3 is called.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        ; Only the low 8 bits of A3 carry the immediate.  Mask off the rest so
        ; stale upper register bits cannot index past the 256 entry stub table.
        and     A3, 0xff
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3*2 + A3]         ; sizeof(insn+ret) == 7: 2 * (A3 * 3) + A3
        lea     T0, [T0*2]
        lea     T0, [T0 + A3]
        lea     T1, [T1 + T0]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*(6+1) == 0x700
dw 0xf8ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x106ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6121
6122
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; Generates iemAImpl_<insn>_u128, which loads the 128-bit input into xmm0,
; executes the instruction with mm0 as destination and stores the 64-bit
; result.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]              ; 128-bit source operand.
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6152
;;
; SSE instructions of the form
;    xxx xmm, mm/m64.
; and we need to load and save the MXCSR register.
;
; Generates iemAImpl_<insn>_u128, which loads the current 128-bit destination
; value into xmm0, moves the 64-bit source value into mm0, executes the
; instruction and writes xmm0 back.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A1]              ; Current destination value.
        movq    mm0, A2                 ; 64-bit source, passed by value.
        %1      xmm0, mm0
        movdqu  [A1], xmm0              ; Updated destination value.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6183
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Generates iemAImpl_<insn>_u128, which moves the 64-bit source value into
; xmm0, executes the instruction with mm0 as destination and stores the 64-bit
; result.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                ; 64-bit source, passed by value.
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6213
;;
; All forms of RDRAND and RDSEED
;
; Generates iemAImpl_<insn>_u<bits>, which executes the instruction on the
; host, stores the register value to the destination and saves the resulting
; status flags to the EFLAGS value.
;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register receiving the value (ax, eax or rax).
; @param    3       The operand width in bits, used as function name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2                ; mov leaves the flags from the instruction intact.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6238
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette