VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 97134

Last change on this file since 97134 was 97051, checked in by vboxsync, 2 years ago

Typo fixes.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 174.0 KB
Line 
1; $Id: IEMAllAImpl.asm 97051 2022-10-07 18:38:12Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; Return from a fastcall function.
;
; Only 32-bit x86 Windows builds emit 'ret imm16' with the given byte count;
; every other configuration gets a plain 'ret'.
;
; @param        1       Byte count for 'ret imm16' (used on 32-bit x86 Windows only).
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Expands to NAME(a_Name) everywhere except on 32-bit x86 Windows, where the
; symbol is built as a_Prefix, a_Name, '@' and a_cbArgs pasted together.
;
; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;       escaping (or whatever the dollar is good for here).  Thus the ugly
;       prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
70
;;
; Opens a fastcall function: emits the export/global directives appropriate
; for the object format and then the function label itself.
;
; The directives use the '$@' prefix while the label uses a bare '@'; see the
; todo on NAME_FASTCALL for why the prefix is an argument at all.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
        export  NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifdef ASM_FORMAT_PE
        export  %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifndef ASM_FORMAT_BIN
        global  NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
;
; We employ some macro assembly here to hide the calling convention differences.
;
; A0..A3 name the incoming argument registers and T0..T2 the scratch registers
; (T2 is AMD64 only); the _32, _16 and _8 suffixes select the sub-registers.
; PROLOGUE_n_ARGS / EPILOGUE_n_ARGS[_EX] bracket a function taking n arguments.
; Every EPILOGUE_n_ARGS_EX takes one parameter: the byte count handed to
; 'ret imm16' in the 32-bit variants.
;
%ifdef RT_ARCH_AMD64
 ; AMD64: the prologues are empty and every epilogue is a plain RET.  The _EX
 ; variants accept the byte count so that callers assemble for both
 ; architectures, but ignore it.
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; Takes one (ignored) parameter, matching the x86 variant and the other
 ; EPILOGUE_n_ARGS_EX macros.
 %macro EPILOGUE_1_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 ; A0 and A1 are ECX and EDX; A2 and A3 are loaded from the stack into EBX
 ; and ESI by the prologues.  EDI doubles as T1 and is therefore saved and
 ; restored by every prologue/epilogue pair.
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2: skip the saved ebx and the return address.
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2: skip three saved registers and the return address.
        mov     esi, [esp + 12 + 4 + 4] ; A3: the stack slot after A2.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 ; No A3_8 here: ESI has no byte-sized alias in 32-bit code.
 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
281
282
;;
; Merges the guest's modified (%2) and undefined (%3) flag bits from [%1]
; into the host EFLAGS; all other EFLAGS bits keep their host value.
;
; The merge is currently done unconditionally - the '%if (%3) != 0' guard is
; commented out.
;
; @remarks  Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        mov     T0_32, [%1]                     ; Guest flags; MOV leaves EFLAGS untouched.
        pushf                                   ; Host flags onto the stack.
        and     T0_32, (%2 | %3)                ; Keep just the guest bits of interest.
        and     dword [xSP], ~(%2 | %3)         ; Drop those same bits from the host copy.
        or      [xSP], T0                       ; Combine the two.
        popf                                    ; Activate the mixed flags.
 ;%endif
%endmacro
302
;;
; Writes the modified and undefined flag bits of the current EFLAGS back to
; [%1], leaving all other bits stored there as they are.  Emits nothing when
; both masks are zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; Stored flags; MOV does not disturb EFLAGS.
        pushf
        pop     T1                              ; T1 = flags left by the instruction.
        and     T1_32, (%2 | %3)                ; Keep the bits being saved.
        and     T0_32, ~(%2 | %3)               ; Make room for them in the stored value.
        or      T0_32, T1_32
        mov     [%1], T0_32
 %endif
%endmacro
322
;;
; Updates the EFLAGS value at [%1]: the %2 bits are taken from the current
; CPU EFLAGS, the %3 bits are cleared and the %4 bits are set.  Emits nothing
; when all three masks are zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        mov     T0_32, [%1]                     ; Stored flags; MOV does not disturb EFLAGS.
        pushf
        pop     T1                              ; T1 = current CPU flags.
        and     T1_32, (%2)                     ; Only the modified flags come from the CPU.
        and     T0_32, ~(%2 | %3)               ; Drop the modified and the always-cleared flags.
        or      T0_32, T1_32
  %if (%4) != 0
        or      T0_32, %4                       ; Force the always-set flags.
  %endif
        mov     [%1], T0_32
 %endif
%endmacro
346
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where the result (%4 & %6) sits in xAX, which
; is also T0.  The EFLAGS are therefore assembled in T1; the CPU flags are
; fetched via T2 on AMD64, and via T0 (saved and restored around the fetch)
; on 32-bit hosts.
;
; @remarks  Clobbers T0, T1, stack, %6, EFLAGS.  The AMD64 variant also uses T2.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                              ; T2 = CPU flags.
        mov     T1_32, [%1]                     ; T1 = stored flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; Drop modified, always-cleared and the two calculated flags.
        and     T2_32, (%2)                     ; Modified flags from the CPU.
        or      T1_32, T2_32
 %else
        push    T0                              ; T0 holds the result, so park it.
        pushf
        pop     T0                              ; T0 = CPU flags.
        mov     T1_32, [%1]                     ; T1 = stored flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; Drop modified, always-cleared and the two calculated flags.
        and     T0_32, (%2)                     ; Modified flags from the CPU.
        or      T1_32, T0_32
        pop     T0                              ; Result back in T0.
 %endif

        ; SF goes first since %4 is likely to be the same register as %6,
        ; which gets truncated below.
        bt      %4, %5 - 1
        jnc     %%sf_done
        or      T1_32, X86_EFL_SF
%%sf_done:

        ; PF is looked up in g_afParity using the low result byte.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                     ; Store the final flags.
%endmacro
400
;;
; Applies a fixed clear mask and a fixed set mask to the EFLAGS value at
; [%1].  The CPU EFLAGS are not consulted.  Emits nothing when both masks are
; zero, and skips the AND / OR for a mask that is zero.
;
; @remarks  Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; Fetch the stored flags.
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; Strip the always-cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; Force the always-set flags.
  %endif
        mov     [%1], T0_32                     ; Write them back.
 %endif
%endmacro
421
;;
; Applies a fixed clear mask and a fixed set mask to the EFLAGS value at
; [%1] and sets PF from the g_afParity table, indexed by the low byte of %4.
;
; @remarks  Clobbers T0, %4, EFLAGS.  The AMD64 variant also uses T2 for the
;           table address.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        and     %4, 0xff                        ; Table index = low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]  ; Table base for the indexed access below.
 %endif
        mov     T0_32, [%1]                     ; Fetch the stored flags.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; Strip PF and the always-cleared flags.
 %if (%3) != 0
        or      T0_32, %3                       ; Force the always-set flags.
 %endif
 %ifdef RT_ARCH_AMD64
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; Write them back.
%endmacro
446
447
448;*********************************************************************************************************************************
449;* External Symbols *
450;*********************************************************************************************************************************
451extern NAME(g_afParity)
452
453
;;
; Generator for the two-operand ALU instruction workers.
;
; Emits iemAImpl_<instr>_u8/_u16/_u32 (and _u64 on AMD64), plus the matching
; _locked workers when requested.  64-bit accesses on 32-bit hosts are not
; generated here and require hand coding.
;
; Each worker receives a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.  It loads the
; guest flags, executes the instruction on [A0] and saves the flags again.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
        ;
        ; Plain variants.
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0
        ;
        ; LOCK prefixed variants.
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr, lock, modified-flags,                                                                       undefined-flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
553
554
;;
; Generator for three-operand (VEX encoded) instruction workers that also
; produce flags.
;
; Emits iemAImpl_<instr>_u32 and, on AMD64, iemAImpl_<instr>_u64.  64-bit
; accesses on 32-bit hosts are not generated here and require hand coding.
;
; Each worker receives a pointer to the destination in A0, the first source
; operand in A1, the second source operand in A2 and a pointer to eflags in
; A3.  The instruction result is produced in T0 and then stored to [A0].
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32                     ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0                        ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                instr,  modified-flags,                                        undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF),              (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
595
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.  These are
; three-argument functions, so the three-argument prologue/epilogue is used;
; the four-argument pair would pop 8 bytes of stack arguments on 32-bit hosts
; where only 4 were passed.
;
; The destination is write-only for these instructions, so [A0] is not read
; before the instruction executes.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32                     ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
        mov     [A0], T0                        ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                  instr,  modified-flags,                                        undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
637
638
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination operand in A0, the
; first source register operand in A1 and the second source register operand
; in A2.  There is no eflags argument.
;
; The optional _fallback workers emulate the instruction with the legacy
; instruction given as parameter 2, which takes its count in CL:
;   - With ASM_CALL64_GCC the count is copied from A2 into CL.
;   - Otherwise A0 is xCX, so A0 and A2 are exchanged; from then on the
;     destination pointer lives in A2 and must be used for the store.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                        ; Full 64-bit result.
   %else
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                        ; Full 64-bit result; A0 is the count now, not a pointer.
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                      instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop,            0
710
711
;
; RORX uses an immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 points to the destination, A1 holds the value to rotate and the low
; byte of A2 the rotate count.  ROR wants the count in CL: with
; ASM_CALL64_GCC it is copied there, otherwise A0 (which is xCX) and A2 are
; exchanged and the destination is addressed through A2 afterwards.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifndef ASM_CALL64_GCC
        xchg    A2, A0                          ; xCX = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %else
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
729
730 %ifdef RT_ARCH_AMD64
731BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
732 PROLOGUE_3_ARGS
733 %ifdef ASM_CALL64_GCC
734 mov cl, A2_8
735 ror A1, cl
736 mov [A0], A1_32
737 %else
738 xchg A2, A0
739 ror A1, cl
740 mov [A2], A1_32
741 %endif
742 mov [A0], A1
743 EPILOGUE_3_ARGS
744ENDPROC iemAImpl_rorx_u64
745 %endif ; RT_ARCH_AMD64
746
747
;
; MULX
;
; A0 and A1 point to the two destinations, A2 is the first source and A3 the
; second.  MULX takes its implicit multiplicand from EDX and leaves the high
; half in T0 and the low half in T1.  The low half is stored first (via the
; second destination pointer) and the high half last (via A0), so that the
; high part is what remains if both pointers refer to the same register.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifndef ASM_CALL64_GCC
        ; A1 is xDX here - swap A1 and A2 so that EDX=uSrc1; the second
        ; destination pointer ends up in A2.
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32                     ; Low half.
        mov     [A0], T0_32                     ; High half.
%else
        ; A2_32 already is EDX - perfect.
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32                     ; Low half.
        mov     [A0], T0_32                     ; High half.
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
767
768
;
; MULX fallback using the legacy MUL instruction.
;
; A0 and A1 point to the two destinations, A2 is the first source and A3 the
; second.  MUL multiplies EAX by its operand and leaves the product in
; EDX:EAX.  The low half is stored first and the high half last (via A0), so
; that the high part is what remains if both pointers refer to the same
; register.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - switch A1 and A2, so EDX=uSrc1 and
        ; A2 holds the second destination pointer.
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32                           ; uSrc1 is in EDX (A1) now; A2 is a pointer.
        mov     [A2], eax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
787
%ifdef RT_ARCH_AMD64
;
; 64-bit MULX.
;
; A0 and A1 point to the two destinations, A2 is the first source and A3 the
; second.  MULX takes its implicit multiplicand from RDX and leaves the high
; half in T0 and the low half in T1.  The low half is stored first and the
; high half last (via A0), so that the high part is what remains if both
; pointers refer to the same register.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
 %ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
 %else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                        ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


;
; 64-bit MULX fallback using the legacy MUL instruction.
;
; Same arguments as iemAImpl_mulx_u64.  MUL multiplies RAX by its operand and
; leaves the product in RDX:RAX.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
 %ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
 %else
        ; A1 is xDX, T0 is RAX - switch A1 and A2, so RDX=uSrc1 and A2 holds
        ; the second destination pointer.
        xchg    A1, A2
        mov     rax, A3
        mul     A1                              ; uSrc1 is in RDX (A1) now; A2 is a pointer.
        mov     [A2], rax                       ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
827
828
;;
; Generator for the bit test instruction workers.
;
; Emits iemAImpl_<instr>_u16/_u32 (and _u64 on AMD64), plus the matching
; _locked workers when requested.  64-bit accesses on 32-bit hosts are not
; generated here and require hand coding.
;
; Each worker receives a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
        ;
        ; Plain variants.
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0
        ;
        ; LOCK prefixed variants.
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
905
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leave it as is.  The undefined EFLAGS differ between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three workers are generated per operand size:
;   - native: loads the guest flags, executes the instruction and saves the
;             modified + undefined flags the host produced.
;   - _intel: computes every status flag itself (PF from the result via the
;             parity table when the destination is written; ZF and PF set
;             otherwise), so the guest flags are not loaded beforehand.
;   - _amd:   saves only the modified flags produced by the host.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
        ;
        ; 16-bit
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS        A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


        ;
        ; 32-bit
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS        A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
 %if %4 != 0
        jz      .unchanged_dst
 %endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64
        ;
        ; 64-bit
        ;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
  %if %4 != 0
        jz      .unchanged_dst
  %endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

        ; Like the 16 and 32-bit _intel workers, this one does not load the
        ; guest flags: all six status flags are rewritten below.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        %1      T1, A1
  %if %4 != 0
        jz      .unchanged_dst
  %endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS_EX 8
.unchanged_dst:
        IEM_ADJUST_FLAGS        A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
  %if %4 != 0
        jz      .unchanged_dst
  %endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

;             instr, modified-flags,            undefined-flags,                                                       dst unchanged when ZF=1
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1057
1058
;;
; Generator for the POPCNT workers.
;
; Emits iemAImpl_<instr>_u16/_u32 and, on AMD64, _u64.  64-bit accesses on
; 32-bit hosts are not generated here and require hand coding.
;
; Each worker receives a pointer to the destination in A0, the source
; register operand in A1 and a pointer to eflags in A2.  The result is
; produced in T0 and then stored to [A0].
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16                     ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32                     ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
        mov     [A0], T0                        ; MOV keeps the flags intact for the save below.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;             instr,  modified-flags,                                                                     undefined-flags
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1108
1109
1110;
1111; IMUL is also a similar but yet different case (no lock, no mem dst).
1112; The rDX:rAX variant of imul is handled together with mul further down.
1113;
1114BEGINCODE
1115; @param 1 EFLAGS that are modified.
1116; @param 2 Undefined EFLAGS.
1117; @param 3 Function suffix.
1118; @param 4 EFLAGS variation: 0 for native, 1 for intel (SF and PF are calculated via
1119; IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF), 2 for AMD (handled like native in this macro).
1120%macro IEMIMPL_IMUL_TWO 4
; 16-bit worker: A0 points to the destination operand, A1 holds the other
; factor and A2 points to eflags.  The product is written back through A0.
1121BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1122 PROLOGUE_3_ARGS
1123 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1124 imul A1_16, word [A0]
1125 mov [A0], A1_16
; Variations 0 and 2 store the flags as the host instruction produced them;
; variation 1 goes through IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF instead.
1126 %if %4 != 1
1127 IEM_SAVE_FLAGS A2, %1, %2
1128 %else
1129 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1130 %endif
1131 EPILOGUE_3_ARGS
1132ENDPROC iemAImpl_imul_two_u16 %+ %3
1133
; 32-bit worker: same sequence with 32-bit operands.
1134BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1135 PROLOGUE_3_ARGS
1136 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1137 imul A1_32, dword [A0]
1138 mov [A0], A1_32
1139 %if %4 != 1
1140 IEM_SAVE_FLAGS A2, %1, %2
1141 %else
1142 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1143 %endif
1144 EPILOGUE_3_ARGS
1145ENDPROC iemAImpl_imul_two_u32 %+ %3
1146
; 64-bit worker: only assembled for 64-bit hosts.
1147 %ifdef RT_ARCH_AMD64
1148BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1149 PROLOGUE_3_ARGS
1150 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1151 imul A1, qword [A0]
1152 mov [A0], A1
1153 %if %4 != 1
1154 IEM_SAVE_FLAGS A2, %1, %2
1155 %else
1156 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1157 %endif
1158 EPILOGUE_3_ARGS_EX 8
1159ENDPROC iemAImpl_imul_two_u64 %+ %3
1160 %endif ; RT_ARCH_AMD64
1161%endmacro
; Native (no suffix), _intel and _amd flavours.
1162IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1163IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1164IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1165
1166
1167;
1168; XCHG for memory operands. This implies locking. No flag changes.
1169;
1170; Each function takes two arguments, first the pointer to the memory,
1171; then the pointer to the register. They all return void.
1172;
1173BEGINCODE
; Swaps the byte at [A0] (memory operand) with the byte at [A1] (register
; value).  The xchg instruction with a memory operand is what provides the
; locking; no explicit lock prefix is needed.
1174BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1175 PROLOGUE_2_ARGS
1176 mov T0_8, [A1] ; T0 = register value
1177 xchg [A0], T0_8 ; memory <- register value, T0 <- old memory value
1178 mov [A1], T0_8 ; register <- old memory value
1179 EPILOGUE_2_ARGS
1180ENDPROC iemAImpl_xchg_u8_locked
1181
; 16-bit variant: swaps the word at [A0] with the word at [A1] using an
; xchg with a memory operand.
1182BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1183 PROLOGUE_2_ARGS
1184 mov T0_16, [A1] ; T0 = register value
1185 xchg [A0], T0_16 ; memory <- register value, T0 <- old memory value
1186 mov [A1], T0_16 ; register <- old memory value
1187 EPILOGUE_2_ARGS
1188ENDPROC iemAImpl_xchg_u16_locked
1189
; 32-bit variant: swaps the dword at [A0] with the dword at [A1] using an
; xchg with a memory operand.
1190BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1191 PROLOGUE_2_ARGS
1192 mov T0_32, [A1] ; T0 = register value
1193 xchg [A0], T0_32 ; memory <- register value, T0 <- old memory value
1194 mov [A1], T0_32 ; register <- old memory value
1195 EPILOGUE_2_ARGS
1196ENDPROC iemAImpl_xchg_u32_locked
1197
; 64-bit variant, only assembled for 64-bit hosts: swaps the qword at [A0]
; with the qword at [A1] using an xchg with a memory operand.
1198%ifdef RT_ARCH_AMD64
1199BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1200 PROLOGUE_2_ARGS
1201 mov T0, [A1] ; T0 = register value
1202 xchg [A0], T0 ; memory <- register value, T0 <- old memory value
1203 mov [A1], T0 ; register <- old memory value
1204 EPILOGUE_2_ARGS
1205ENDPROC iemAImpl_xchg_u64_locked
1206%endif
1207
1208; Unlocked variants for fDisregardLock mode.
1209
; Unlocked byte swap: both values are loaded into temporaries and stored
; back crosswise with plain moves, so no xchg-with-memory is executed.
; NOTE(review): the signed 8-bit division code in this file mentions that
; T1_8 is missing on 32-bit hosts - confirm this worker assembles there.
1210BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1211 PROLOGUE_2_ARGS
1212 mov T0_8, [A1] ; T0 = register value
1213 mov T1_8, [A0] ; T1 = memory value
1214 mov [A0], T0_8
1215 mov [A1], T1_8
1216 EPILOGUE_2_ARGS
1217ENDPROC iemAImpl_xchg_u8_unlocked
1218
; Unlocked 16-bit swap of [A0] and [A1] using two temporaries and plain moves.
1219BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1220 PROLOGUE_2_ARGS
1221 mov T0_16, [A1] ; T0 = register value
1222 mov T1_16, [A0] ; T1 = memory value
1223 mov [A0], T0_16
1224 mov [A1], T1_16
1225 EPILOGUE_2_ARGS
1226ENDPROC iemAImpl_xchg_u16_unlocked
1227
; Unlocked 32-bit swap of [A0] and [A1] using two temporaries and plain moves.
1228BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1229 PROLOGUE_2_ARGS
1230 mov T0_32, [A1] ; T0 = register value
1231 mov T1_32, [A0] ; T1 = memory value
1232 mov [A0], T0_32
1233 mov [A1], T1_32
1234 EPILOGUE_2_ARGS
1235ENDPROC iemAImpl_xchg_u32_unlocked
1236
; Unlocked 64-bit swap of [A0] and [A1], only assembled for 64-bit hosts.
1237%ifdef RT_ARCH_AMD64
1238BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1239 PROLOGUE_2_ARGS
1240 mov T0, [A1] ; T0 = register value
1241 mov T1, [A0] ; T1 = memory value
1242 mov [A0], T0
1243 mov [A1], T1
1244 EPILOGUE_2_ARGS
1245ENDPROC iemAImpl_xchg_u64_unlocked
1246%endif
1247
1248
1249;
1250; XADD for memory operands.
1251;
1252; Each function takes three arguments, first the pointer to the
1253; memory/register, then the pointer to the register, and finally a pointer to
1254; eflags. They all return void.
1255;
1256BEGINCODE
; 8-bit XADD without lock prefix.  A0 points to the memory/register
; destination, A1 to the register operand and A2 to eflags.  After the xadd
; the destination holds the sum and the old destination value is written
; back through A1.
1257BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1258 PROLOGUE_3_ARGS
1259 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1260 mov T0_8, [A1]
1261 xadd [A0], T0_8
1262 mov [A1], T0_8 ; mov keeps the flags produced by xadd intact for the save below.
1263 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1264 EPILOGUE_3_ARGS
1265ENDPROC iemAImpl_xadd_u8
1266
; 16-bit XADD without lock prefix; same argument layout as the 8-bit worker
; (A0 = destination pointer, A1 = register pointer, A2 = eflags pointer).
1267BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1268 PROLOGUE_3_ARGS
1269 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1270 mov T0_16, [A1]
1271 xadd [A0], T0_16
1272 mov [A1], T0_16 ; old destination value goes back to the register operand.
1273 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1274 EPILOGUE_3_ARGS
1275ENDPROC iemAImpl_xadd_u16
1276
; 32-bit XADD without lock prefix; same argument layout as the 8-bit worker
; (A0 = destination pointer, A1 = register pointer, A2 = eflags pointer).
1277BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1278 PROLOGUE_3_ARGS
1279 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1280 mov T0_32, [A1]
1281 xadd [A0], T0_32
1282 mov [A1], T0_32 ; old destination value goes back to the register operand.
1283 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1284 EPILOGUE_3_ARGS
1285ENDPROC iemAImpl_xadd_u32
1286
; 64-bit XADD without lock prefix, only assembled for 64-bit hosts; same
; argument layout as the 8-bit worker.
1287%ifdef RT_ARCH_AMD64
1288BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1289 PROLOGUE_3_ARGS
1290 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1291 mov T0, [A1]
1292 xadd [A0], T0
1293 mov [A1], T0 ; old destination value goes back to the register operand.
1294 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1295 EPILOGUE_3_ARGS
1296ENDPROC iemAImpl_xadd_u64
1297%endif ; RT_ARCH_AMD64
1298
; 8-bit XADD with lock prefix.  Identical to iemAImpl_xadd_u8 except that the
; xadd on the destination in [A0] carries a lock prefix.
1299BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1300 PROLOGUE_3_ARGS
1301 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1302 mov T0_8, [A1]
1303 lock xadd [A0], T0_8
1304 mov [A1], T0_8 ; old destination value goes back to the register operand.
1305 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1306 EPILOGUE_3_ARGS
1307ENDPROC iemAImpl_xadd_u8_locked
1308
; 16-bit XADD with lock prefix; otherwise identical to iemAImpl_xadd_u16.
1309BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1310 PROLOGUE_3_ARGS
1311 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1312 mov T0_16, [A1]
1313 lock xadd [A0], T0_16
1314 mov [A1], T0_16 ; old destination value goes back to the register operand.
1315 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1316 EPILOGUE_3_ARGS
1317ENDPROC iemAImpl_xadd_u16_locked
1318
; 32-bit XADD with lock prefix; otherwise identical to iemAImpl_xadd_u32.
1319BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1320 PROLOGUE_3_ARGS
1321 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1322 mov T0_32, [A1]
1323 lock xadd [A0], T0_32
1324 mov [A1], T0_32 ; old destination value goes back to the register operand.
1325 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1326 EPILOGUE_3_ARGS
1327ENDPROC iemAImpl_xadd_u32_locked
1328
; 64-bit XADD with lock prefix, only assembled for 64-bit hosts; otherwise
; identical to iemAImpl_xadd_u64.
1329%ifdef RT_ARCH_AMD64
1330BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1331 PROLOGUE_3_ARGS
1332 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1333 mov T0, [A1]
1334 lock xadd [A0], T0
1335 mov [A1], T0 ; old destination value goes back to the register operand.
1336 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1337 EPILOGUE_3_ARGS
1338ENDPROC iemAImpl_xadd_u64_locked
1339%endif ; RT_ARCH_AMD64
1340
1341
1342;
1343; CMPXCHG8B.
1344;
1345; These are tricky register wise, so the code is duplicated for each calling
1346; convention.
1347;
1348; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1349;
1350; C-proto:
1351; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1352; uint32_t *pEFlags));
1353;
1354; Note! Identical to iemAImpl_cmpxchg16b.
1355;
1356BEGINCODE
; CMPXCHG8B worker.  The instruction hard-wires EDX:EAX (comparand / old
; value) and ECX:EBX (replacement), so every calling convention gets its own
; register shuffle.  Each variant always executes 'lock cmpxchg8b', writes
; EDX:EAX back to pu64EaxEdx afterwards and saves ZF via IEM_SAVE_FLAGS.
1357BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1358%ifdef RT_ARCH_AMD64
1359 %ifdef ASM_CALL64_MSC
; Here: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
1360 push rbx ; rbx is needed for the replacement value; restored before returning.
1361
1362 mov r11, rdx ; pu64EaxEdx (is also T1)
1363 mov r10, rcx ; pu64Dst
1364
1365 mov ebx, [r8]
1366 mov ecx, [r8 + 4]
1367 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1368 mov eax, [r11]
1369 mov edx, [r11 + 4]
1370
1371 lock cmpxchg8b [r10]
1372
1373 mov [r11], eax
1374 mov [r11 + 4], edx
1375 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1376
1377 pop rbx
1378 ret
1379 %else
; Here: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
1380 push rbx ; rbx is needed for the replacement value; restored before returning.
1381
1382 mov r10, rcx ; pEFlags
1383 mov r11, rdx ; pu64EbxEcx (is also T1)
1384
1385 mov ebx, [r11]
1386 mov ecx, [r11 + 4]
1387 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1388 mov eax, [rsi]
1389 mov edx, [rsi + 4]
1390
1391 lock cmpxchg8b [rdi]
1392
1393 mov [rsi], eax
1394 mov [rsi + 4], edx
1395 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1396
1397 pop rbx
1398 ret
1399
1400 %endif
1401%else
; 32-bit host: ecx = pu64Dst, edx = pu64EaxEdx, the other two arguments are
; read from the stack (16 bytes of pushed registers + 4 bytes return address).
1402 push esi
1403 push edi
1404 push ebx
1405 push ebp
1406
1407 mov edi, ecx ; pu64Dst
1408 mov esi, edx ; pu64EaxEdx
1409 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1410 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1411
1412 mov ebx, [ecx]
1413 mov ecx, [ecx + 4]
1414 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1415 mov eax, [esi]
1416 mov edx, [esi + 4]
1417
1418 lock cmpxchg8b [edi]
1419
1420 mov [esi], eax
1421 mov [esi + 4], edx
1422 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1423
1424 pop ebp
1425 pop ebx
1426 pop edi
1427 pop esi
1428 ret 8 ; pops the two stack arguments.
1429%endif
1430ENDPROC iemAImpl_cmpxchg8b
1431
; The plain worker already uses the lock prefix unconditionally, so the
; locked entry point simply tail-jumps to it with the arguments untouched.
1432BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1433 ; Lazy bird always lock prefixes cmpxchg8b.
1434 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1435ENDPROC iemAImpl_cmpxchg8b_locked
1436
1437%ifdef RT_ARCH_AMD64
1438
1439;
1440; CMPXCHG16B.
1441;
1442; These are tricky register wise, so the code is duplicated for each calling
1443; convention.
1444;
1445; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1446;
1447; C-proto:
1448; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1449; uint32_t *pEFlags));
1450;
1451; Note! Identical to iemAImpl_cmpxchg8b.
1452;
1453BEGINCODE
; CMPXCHG16B worker (64-bit hosts only).  Same structure as the cmpxchg8b
; worker with 64-bit halves: RDX:RAX is the comparand / old value, RCX:RBX
; the replacement.  Always executes 'lock cmpxchg16b', writes RDX:RAX back
; to the RaxRdx argument and saves ZF via IEM_SAVE_FLAGS.
1454BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1455 %ifdef ASM_CALL64_MSC
; Here: rcx = destination, rdx = RaxRdx pointer, r8 = RbxRcx pointer, r9 = pEFlags.
1456 push rbx ; rbx is needed for the replacement value; restored before returning.
1457
1458 mov r11, rdx ; pu64RaxRdx (is also T1)
1459 mov r10, rcx ; pu64Dst
1460
1461 mov rbx, [r8]
1462 mov rcx, [r8 + 8]
1463 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1464 mov rax, [r11]
1465 mov rdx, [r11 + 8]
1466
1467 lock cmpxchg16b [r10]
1468
1469 mov [r11], rax
1470 mov [r11 + 8], rdx
1471 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1472
1473 pop rbx
1474 ret
1475 %else
; Here: rdi = destination, rsi = RaxRdx pointer, rdx = RbxRcx pointer, rcx = pEFlags.
1476 push rbx ; rbx is needed for the replacement value; restored before returning.
1477
1478 mov r10, rcx ; pEFlags
1479 mov r11, rdx ; pu64RbxRcx (is also T1)
1480
1481 mov rbx, [r11]
1482 mov rcx, [r11 + 8]
1483 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1484 mov rax, [rsi]
1485 mov rdx, [rsi + 8]
1486
1487 lock cmpxchg16b [rdi]
1488
1489 mov [rsi], rax
1490 mov [rsi + 8], rdx
1491 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1492
1493 pop rbx
1494 ret
1495
1496 %endif
1497ENDPROC iemAImpl_cmpxchg16b
1498
; The plain worker already uses the lock prefix unconditionally, so the
; locked entry point simply tail-jumps to it with the arguments untouched.
1499BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1500 ; Lazy bird always lock prefixes cmpxchg16b.
1501 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1502ENDPROC iemAImpl_cmpxchg16b_locked
1503
1504%endif ; RT_ARCH_AMD64
1505
1506
1507;
1508; CMPXCHG.
1509;
1510; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1511;
1512; C-proto:
1513; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1514;
1515BEGINCODE
; Generates the CMPXCHG workers for 8, 16, 32 and 64 bit operands.
;
; @param 1 Instruction prefix: empty or 'lock'.
; @param 2 Function name suffix: empty or '_locked'.
;
; In every worker A0 points to the destination, A1 points to the guest
; accumulator value (loaded into al/ax/eax/rax before the instruction and
; written back afterwards), A2 holds the replacement value and A3 points to
; eflags.
1516%macro IEMIMPL_CMPXCHG 2
1517BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1518 PROLOGUE_4_ARGS
1519 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1520 mov al, [A1]
1521 %1 cmpxchg [A0], A2_8
1522 mov [A1], al
1523 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1524 EPILOGUE_4_ARGS
1525ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1526
1527BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1528 PROLOGUE_4_ARGS
1529 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1530 mov ax, [A1]
1531 %1 cmpxchg [A0], A2_16
1532 mov [A1], ax
1533 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1534 EPILOGUE_4_ARGS
1535ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1536
1537BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1538 PROLOGUE_4_ARGS
1539 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1540 mov eax, [A1]
1541 %1 cmpxchg [A0], A2_32
1542 mov [A1], eax
1543 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1544 EPILOGUE_4_ARGS
1545ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1546
1547BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1548%ifdef RT_ARCH_AMD64
1549 PROLOGUE_4_ARGS
1550 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1551 mov rax, [A1]
1552 %1 cmpxchg [A0], A2
1553 mov [A1], rax
1554 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1555 EPILOGUE_4_ARGS
1556%else
1557 ;
1558 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1559 ;
1560 push esi
1561 push edi
1562 push ebx
1563 push ebp
1564
1565 mov edi, ecx ; pu64Dst
1566 mov esi, edx ; pu64Rax
1567 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1568 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1569
1570 mov ebx, [ecx]
1571 mov ecx, [ecx + 4]
1572 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1573 mov eax, [esi]
1574 mov edx, [esi + 4]
1575
1576 lock cmpxchg8b [edi]
1577
1578 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
 ; ZF=1 means the comparand matched and the exchange took place; ZF=0 means
 ; it did not and EDX:EAX now holds the current memory value.  Only the
 ; latter case needs a real compare against the old accumulator value.
1579 jnz .cmpxchg8b_not_equal
1580 cmp eax, eax ; just set the other flags.
1581.store:
1582 mov [esi], eax
1583 mov [esi + 4], edx
1584 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1585
1586 pop ebp
1587 pop ebx
1588 pop edi
1589 pop esi
1590 ret 8
1591
 ; Compare the old accumulator value (still at [esi]) with the memory value
 ; returned in EDX:EAX: high dwords first, low dwords only if those match.
1592.cmpxchg8b_not_equal:
1593 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1594 jne .store
1595 cmp [esi], eax
1596 jmp .store
1597
1598%endif
1599ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1600%endmacro ; IEMIMPL_CMPXCHG
1601
1602IEMIMPL_CMPXCHG , ,
1603IEMIMPL_CMPXCHG lock, _locked
1604
1605;;
1606; Macro for implementing a unary operator.
1607;
1608; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1609; variants, except on 32-bit system where the 64-bit accesses requires hand
1610; coding.
1611;
1612; All the functions take a pointer to the destination memory operand in A0
1613; and a pointer to eflags in A1.
1614;
1615; @param 1 The instruction mnemonic.
1616; @param 2 The modified flags.
1617; @param 3 The undefined flags.
1618;
1619%macro IEMIMPL_UNARY_OP 3
1620BEGINCODE
; Every worker below follows the same pattern: optionally load the guest
; flags from [A1], apply '%1' directly to the operand at [A0] (with a lock
; prefix in the _locked variants) and save the resulting flags back via A1.
1621BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1622 PROLOGUE_2_ARGS
1623 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1624 %1 byte [A0]
1625 IEM_SAVE_FLAGS A1, %2, %3
1626 EPILOGUE_2_ARGS
1627ENDPROC iemAImpl_ %+ %1 %+ _u8
1628
1629BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1630 PROLOGUE_2_ARGS
1631 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1632 lock %1 byte [A0]
1633 IEM_SAVE_FLAGS A1, %2, %3
1634 EPILOGUE_2_ARGS
1635ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1636
1637BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1638 PROLOGUE_2_ARGS
1639 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1640 %1 word [A0]
1641 IEM_SAVE_FLAGS A1, %2, %3
1642 EPILOGUE_2_ARGS
1643ENDPROC iemAImpl_ %+ %1 %+ _u16
1644
1645BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1646 PROLOGUE_2_ARGS
1647 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1648 lock %1 word [A0]
1649 IEM_SAVE_FLAGS A1, %2, %3
1650 EPILOGUE_2_ARGS
1651ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1652
1653BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1654 PROLOGUE_2_ARGS
1655 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1656 %1 dword [A0]
1657 IEM_SAVE_FLAGS A1, %2, %3
1658 EPILOGUE_2_ARGS
1659ENDPROC iemAImpl_ %+ %1 %+ _u32
1660
1661BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1662 PROLOGUE_2_ARGS
1663 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1664 lock %1 dword [A0]
1665 IEM_SAVE_FLAGS A1, %2, %3
1666 EPILOGUE_2_ARGS
1667ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1668
; The 64-bit workers are only assembled for 64-bit hosts.
1669 %ifdef RT_ARCH_AMD64
1670BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1671 PROLOGUE_2_ARGS
1672 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1673 %1 qword [A0]
1674 IEM_SAVE_FLAGS A1, %2, %3
1675 EPILOGUE_2_ARGS
1676ENDPROC iemAImpl_ %+ %1 %+ _u64
1677
1678BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1679 PROLOGUE_2_ARGS
1680 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1681 lock %1 qword [A0]
1682 IEM_SAVE_FLAGS A1, %2, %3
1683 EPILOGUE_2_ARGS
1684ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1685 %endif ; RT_ARCH_AMD64
1686
1687%endmacro
1688
; inc and dec leave CF out of the modified mask, neg includes it and not
; is instantiated with empty flag masks.
1689IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1690IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1691IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1692IEMIMPL_UNARY_OP not, 0, 0
1693
1694
1695;
1696; BSWAP. No flag changes.
1697;
1698; Each function takes one argument, pointer to the value to bswap
1699; (input/output). They all return void.
1700;
; 16-bit BSWAP.  A full dword is loaded and stored, and the instruction is
; emitted as a 66h prefix byte followed by the 32-bit bswap encoding, so the
; host CPU executes the 16-bit operand size form of BSWAP on the value.
; NOTE(review): presumably this reproduces whatever the host does for the
; 16-bit form rather than defining a result here - confirm against callers.
1701BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1702 PROLOGUE_1_ARGS
1703 mov T0_32, [A0] ; just in case any of the upper bits are used.
1704 db 66h ; operand size prefix for the bswap on the next line.
1705 bswap T0_32
1706 mov [A0], T0_32
1707 EPILOGUE_1_ARGS
1708ENDPROC iemAImpl_bswap_u16
1709
; 32-bit BSWAP: byte-reverses the dword at [A0] in place via a temporary.
1710BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1711 PROLOGUE_1_ARGS
1712 mov T0_32, [A0]
1713 bswap T0_32
1714 mov [A0], T0_32
1715 EPILOGUE_1_ARGS
1716ENDPROC iemAImpl_bswap_u32
1717
; 64-bit BSWAP: byte-reverses the qword at [A0] in place.  64-bit hosts use
; a single bswap; 32-bit hosts byte-reverse each dword half and store the
; halves in swapped positions.
1718BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1719%ifdef RT_ARCH_AMD64
1720 PROLOGUE_1_ARGS
1721 mov T0, [A0]
1722 bswap T0
1723 mov [A0], T0
1724 EPILOGUE_1_ARGS
1725%else
1726 PROLOGUE_1_ARGS
1727 mov T0, [A0] ; T0 = low dword
1728 mov T1, [A0 + 4] ; T1 = high dword
1729 bswap T0
1730 bswap T1
1731 mov [A0 + 4], T0 ; reversed low dword becomes the new high dword
1732 mov [A0], T1 ; reversed high dword becomes the new low dword
1733 EPILOGUE_1_ARGS
1734%endif
1735ENDPROC iemAImpl_bswap_u64
1736
1737
1738;;
1739; Macro for implementing a shift operation.
1740;
1741; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1742; 32-bit system where the 64-bit accesses requires hand coding.
1743;
1744; All the functions takes a pointer to the destination memory operand in A0,
1745; the shift count in A1 and a pointer to eflags in A2.
1746;
1747; @param 1 The instruction mnemonic.
1748; @param 2 The modified flags.
1749; @param 3 The undefined flags.
1750;
1751; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1752;
1753; @note the _intel and _amd variants are implemented in C.
1754;
1755%macro IEMIMPL_SHIFT_OP 3
1756BEGINCODE
; All workers need the shift count in cl.  With ASM_CALL64_GCC the count is
; copied from A1 into cl.  Otherwise A0 and A1 are exchanged, after which the
; destination pointer is in A1 and the count is read from cl - this is one of
; the register assignment ASSUMPTIONS mentioned in the macro description.
1757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1758 PROLOGUE_3_ARGS
1759 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1760 %ifdef ASM_CALL64_GCC
1761 mov cl, A1_8
1762 %1 byte [A0], cl
1763 %else
1764 xchg A1, A0
1765 %1 byte [A1], cl
1766 %endif
1767 IEM_SAVE_FLAGS A2, %2, %3
1768 EPILOGUE_3_ARGS
1769ENDPROC iemAImpl_ %+ %1 %+ _u8
1770
1771BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1772 PROLOGUE_3_ARGS
1773 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1774 %ifdef ASM_CALL64_GCC
1775 mov cl, A1_8
1776 %1 word [A0], cl
1777 %else
1778 xchg A1, A0
1779 %1 word [A1], cl
1780 %endif
1781 IEM_SAVE_FLAGS A2, %2, %3
1782 EPILOGUE_3_ARGS
1783ENDPROC iemAImpl_ %+ %1 %+ _u16
1784
1785BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1786 PROLOGUE_3_ARGS
1787 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1788 %ifdef ASM_CALL64_GCC
1789 mov cl, A1_8
1790 %1 dword [A0], cl
1791 %else
1792 xchg A1, A0
1793 %1 dword [A1], cl
1794 %endif
1795 IEM_SAVE_FLAGS A2, %2, %3
1796 EPILOGUE_3_ARGS
1797ENDPROC iemAImpl_ %+ %1 %+ _u32
1798
; The 64-bit worker is only assembled for 64-bit hosts.
1799 %ifdef RT_ARCH_AMD64
1800BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1801 PROLOGUE_3_ARGS
1802 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1803 %ifdef ASM_CALL64_GCC
1804 mov cl, A1_8
1805 %1 qword [A0], cl
1806 %else
1807 xchg A1, A0
1808 %1 qword [A1], cl
1809 %endif
1810 IEM_SAVE_FLAGS A2, %2, %3
1811 EPILOGUE_3_ARGS
1812ENDPROC iemAImpl_ %+ %1 %+ _u64
1813 %endif ; RT_ARCH_AMD64
1814
1815%endmacro
1816
; Rotates list only OF and CF as modified; the shifts list OF, SF, ZF, PF and
; CF as modified and AF as undefined.
1817IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1818IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1819IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1820IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1821IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1822IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1823IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1824
1825
1826;;
1827; Macro for implementing a double precision shift operation.
1828;
1829; This will generate code for the 16, 32 and 64 bit accesses, except on
1830; 32-bit system where the 64-bit accesses requires hand coding.
1831;
1832; The functions takes the destination operand (r/m) in A0, the source (reg) in
1833; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1834;
1835; @param 1 The instruction mnemonic.
1836; @param 2 The modified flags.
1837; @param 3 The undefined flags.
1838;
1839; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1840;
1841; @note the _intel and _amd variants are implemented in C.
1842;
1843%macro IEMIMPL_SHIFT_DBL_OP 3
1844BEGINCODE
; The count has to be in cl for the instruction.  With ASM_CALL64_GCC A3 and
; A2 are exchanged around the instruction (and exchanged back afterwards so
; A3 points to eflags again for IEM_SAVE_FLAGS).  Otherwise A0 and A2 are
; exchanged once, leaving the destination pointer in A2.  Both variants rely
; on the register assignment ASSUMPTIONS mentioned in the macro description.
; xchg between registers does not modify EFLAGS.
1845BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1846 PROLOGUE_4_ARGS
1847 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1848 %ifdef ASM_CALL64_GCC
1849 xchg A3, A2
1850 %1 [A0], A1_16, cl
1851 xchg A3, A2
1852 %else
1853 xchg A0, A2
1854 %1 [A2], A1_16, cl
1855 %endif
1856 IEM_SAVE_FLAGS A3, %2, %3
1857 EPILOGUE_4_ARGS
1858ENDPROC iemAImpl_ %+ %1 %+ _u16
1859
1860BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1861 PROLOGUE_4_ARGS
1862 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1863 %ifdef ASM_CALL64_GCC
1864 xchg A3, A2
1865 %1 [A0], A1_32, cl
1866 xchg A3, A2
1867 %else
1868 xchg A0, A2
1869 %1 [A2], A1_32, cl
1870 %endif
1871 IEM_SAVE_FLAGS A3, %2, %3
1872 EPILOGUE_4_ARGS
1873ENDPROC iemAImpl_ %+ %1 %+ _u32
1874
; The 64-bit worker is only assembled for 64-bit hosts.
1875 %ifdef RT_ARCH_AMD64
1876BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1877 PROLOGUE_4_ARGS
1878 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1879 %ifdef ASM_CALL64_GCC
1880 xchg A3, A2
1881 %1 [A0], A1, cl
1882 xchg A3, A2
1883 %else
1884 xchg A0, A2
1885 %1 [A2], A1, cl
1886 %endif
1887 IEM_SAVE_FLAGS A3, %2, %3
1888 EPILOGUE_4_ARGS_EX 12
1889ENDPROC iemAImpl_ %+ %1 %+ _u64
1890 %endif ; RT_ARCH_AMD64
1891
1892%endmacro
1893
1894IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1895IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1896
1897
1898;;
1899; Macro for implementing a multiplication operations.
1900;
1901; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1902; 32-bit system where the 64-bit accesses requires hand coding.
1903;
1904; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1905; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1906; pointer to eflags in A3.
1907;
1908; The functions all return 0 so the caller can be used for div/idiv as well as
1909; for the mul/imul implementation.
1910;
1911; @param 1 The instruction mnemonic.
1912; @param 2 The modified flags.
1913; @param 3 The undefined flags.
1914; @param 4 Name suffix.
1915; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1916;
1917; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1918;
1919%macro IEMIMPL_MUL_OP 5
1920BEGINCODE
; 8-bit worker: A0 points to the 16-bit AX value, A1 holds the 8-bit operand
; and A2 points to eflags.  AL is loaded from [A0] and the full AX result is
; written back.
1921BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1922 PROLOGUE_3_ARGS
1923 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1924 mov al, [A0]
1925 %1 A1_8
1926 mov [A0], ax
; Behaviour 1 (intel) goes through IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF;
; behaviours 0 and 2 store the flags via plain IEM_SAVE_FLAGS.
1927 %if %5 != 1
1928 IEM_SAVE_FLAGS A2, %2, %3
1929 %else
1930 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1931 %endif
1932 xor eax, eax ; return 0
1933 EPILOGUE_3_ARGS
1934ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1935
; 16-bit worker: A0 -> AX, A1 -> DX, A2 = operand, A3 -> eflags.  Outside the
; ASM_CALL64_GCC case A1 is copied to T1 before the instruction and the DX
; result is stored through T1 instead.
1936BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1937 PROLOGUE_4_ARGS
1938 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1939 mov ax, [A0]
1940 %ifdef ASM_CALL64_GCC
1941 %1 A2_16
1942 mov [A0], ax
1943 mov [A1], dx
1944 %else
1945 mov T1, A1
1946 %1 A2_16
1947 mov [A0], ax
1948 mov [T1], dx
1949 %endif
1950 %if %5 != 1
1951 IEM_SAVE_FLAGS A3, %2, %3
1952 %else
1953 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1954 %endif
1955 xor eax, eax ; return 0
1956 EPILOGUE_4_ARGS
1957ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1958
; 32-bit worker: same layout as the 16-bit one with EAX/EDX.
1959BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1960 PROLOGUE_4_ARGS
1961 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1962 mov eax, [A0]
1963 %ifdef ASM_CALL64_GCC
1964 %1 A2_32
1965 mov [A0], eax
1966 mov [A1], edx
1967 %else
1968 mov T1, A1
1969 %1 A2_32
1970 mov [A0], eax
1971 mov [T1], edx
1972 %endif
1973 %if %5 != 1
1974 IEM_SAVE_FLAGS A3, %2, %3
1975 %else
1976 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
1977 %endif
1978 xor eax, eax ; return 0
1979 EPILOGUE_4_ARGS
1980ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
1981
; 64-bit worker: same layout as the 16-bit one with RAX/RDX.
1982 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1983BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
1984 PROLOGUE_4_ARGS
1985 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1986 mov rax, [A0]
1987 %ifdef ASM_CALL64_GCC
1988 %1 A2
1989 mov [A0], rax
1990 mov [A1], rdx
1991 %else
1992 mov T1, A1
1993 %1 A2
1994 mov [A0], rax
1995 mov [T1], rdx
1996 %endif
1997 %if %5 != 1
1998 IEM_SAVE_FLAGS A3, %2, %3
1999 %else
2000 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2001 %endif
2002 xor eax, eax ; return 0
2003 EPILOGUE_4_ARGS_EX 12
2004ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2005 %endif ; RT_ARCH_AMD64
2006
2007%endmacro
2008
; Native (no suffix), _intel and _amd flavours of MUL and IMUL.
2009IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2010IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2011IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2012IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2013IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2014IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2015
2016
2017BEGINCODE
2018;;
2019; Worker function for negating the number held in the register pair
; T1_32:T0_32 (T0_32 = low dword, T1_32 = high dword), i.e. it computes
; 0 - T1:T0 with a sub/sbb pair.
;
; Two zeroed stack slots are used as scratch space: the register values are
; exchanged with the zeros, then subtracted from them.
2020; @uses None (T0,T1)
2021BEGINPROC iemAImpl_negate_T0_T1_u32
2022 push 0
2023 push 0
2024 xchg T0_32, [xSP] ; T0 = 0, slot = old low half
2025 xchg T1_32, [xSP + xCB] ; T1 = 0, slot = old high half
2026 sub T0_32, [xSP] ; T0 = 0 - low
2027 sbb T1_32, [xSP + xCB] ; T1 = 0 - high - borrow
2028 add xSP, xCB*2 ; drop the scratch slots
2029 ret
2030ENDPROC iemAImpl_negate_T0_T1_u32
2031
2032%ifdef RT_ARCH_AMD64
2033;;
2034; Worker function for negating the number held in the register pair T1:T0
; (T0 = low qword, T1 = high qword), i.e. it computes 0 - T1:T0 with a
; sub/sbb pair.  Only assembled for 64-bit hosts.
;
; Two zeroed stack slots are used as scratch space: the register values are
; exchanged with the zeros, then subtracted from them.
2035; @uses None (T0,T1)
2036BEGINPROC iemAImpl_negate_T0_T1_u64
2037 push 0
2038 push 0
2039 xchg T0, [xSP] ; T0 = 0, slot = old low half
2040 xchg T1, [xSP + xCB] ; T1 = 0, slot = old high half
2041 sub T0, [xSP] ; T0 = 0 - low
2042 sbb T1, [xSP + xCB] ; T1 = 0 - high - borrow
2043 add xSP, xCB*2 ; drop the scratch slots
2044 ret
2045ENDPROC iemAImpl_negate_T0_T1_u64
2046%endif
2047
2048
2049;;
2050; Macro for implementing a division operations.
2051;
2052; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2053; 32-bit system where the 64-bit accesses requires hand coding.
2054;
2055; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2056; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2057; pointer to eflags in A3.
2058;
2059; The functions all return 0 on success and -1 if a divide error should be
2060; raised by the caller.
2061;
2062; @param 1 The instruction mnemonic.
2063; @param 2 The modified flags.
2064; @param 3 The undefined flags.
2065; @param 4 1 if signed, 0 if unsigned.
2066; @param 5 Function suffix.
2067; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2068; 2 for AMD (set AF, clear PF, ZF and SF).
2069;
2070; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2071;
2072%macro IEMIMPL_DIV_OP 6
2073BEGINCODE
; 8-bit division worker.  A0 points to the 16-bit AX value (dividend in,
; quotient/remainder out), A1 holds the 8-bit divisor and A2 points to
; eflags.  Returns 0 in eax on success and -1 when the caller should raise a
; divide error (zero divisor or quotient out of range).  The range is checked
; up front so that the host instruction below never faults.
2074BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2075 PROLOGUE_3_ARGS
2076
2077 ; div by chainsaw check.
2078 test A1_8, A1_8
2079 jz .div_zero
2080
2081 ; Overflow check - unsigned division is simple to verify, haven't
2082 ; found a simple way to check signed division yet unfortunately.
2083 %if %4 == 0
2084 cmp [A0 + 1], A1_8
2085 jae .div_overflow
2086 %else
2087 mov T0_16, [A0] ; T0 = dividend
2088 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
 movzx A1_32, A1_8 ; Zero extend the divisor so the 16-bit range compares below are valid.
2089 test A1_8, A1_8
2090 js .divisor_negative
2091 test T0_16, T0_16
2092 jns .both_positive
2093 neg T0_16
2094.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2095 push T0 ; Start off like unsigned below.
2096 shr T0_16, 7
2097 cmp T0_16, A1_16 ; 16-bit compare: neg 0x8000 = 0x8000 and 0x8000 >> 7 = 0x0100, which does not fit in 8 bits.
2098 pop T0
2099 jb .div_no_overflow
2100 ja .div_overflow
2101 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2102 cmp T0_8, A1_8
2103 jae .div_overflow
2104 jmp .div_no_overflow
2105
2106.divisor_negative:
2107 neg A1_8 ; Only the low byte changes, the upper byte of A1_16 stays zero.
2108 test T0_16, T0_16
2109 jns .one_of_each
2110 neg T0_16
2111.both_positive: ; Same as unsigned shifted by sign indicator bit.
2112 shr T0_16, 7
2113 cmp T0_16, A1_16 ; 16-bit compare, see .one_of_each.
2114 jae .div_overflow
2115.div_no_overflow:
2116 mov A1, T1 ; restore divisor
2117 %endif
2118
2119 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2120 mov ax, [A0]
2121 %1 A1_8
2122 mov [A0], ax
2123 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2124 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2125 %else
2126 IEM_SAVE_FLAGS A2, %2, %3
2127 %endif
2128 xor eax, eax
2129
2130.return:
2131 EPILOGUE_3_ARGS
2132
2133.div_zero:
2134.div_overflow:
2135 mov eax, -1
2136 jmp .return
2137ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2138
; 16-bit division worker.  A0 points to AX, A1 points to DX (together the
; DX:AX dividend in, quotient and remainder out), A2 holds the 16-bit divisor
; and A3 points to eflags.  Returns 0 in eax on success and -1 when the
; caller should raise a divide error (zero divisor or quotient out of range).
; The range is checked up front so that the host instruction never faults.
2139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2140 PROLOGUE_4_ARGS
2141
2142 ; div by chainsaw check.
2143 test A2_16, A2_16
2144 jz .div_zero
2145
2146 ; Overflow check - unsigned division is simple to verify, haven't
2147 ; found a simple way to check signed division yet unfortunately.
2148 %if %4 == 0
2149 cmp [A1], A2_16
2150 jae .div_overflow
2151 %else
2152 mov T0_16, [A1]
2153 shl T0_32, 16
2154 mov T0_16, [A0] ; T0 = dividend
2155 movzx T1_32, A2_16 ; T1 = divisor, zero extended so the 32-bit range compares below are valid.
2156 test T1_16, T1_16
2157 js .divisor_negative
2158 test T0_32, T0_32
2159 jns .both_positive
2160 neg T0_32
2161.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
2162 push T0 ; Start off like unsigned below.
2163 shr T0_32, 15
2164 cmp T0_32, T1_32 ; 32-bit compare: neg 0x80000000 = 0x80000000 and 0x80000000 >> 15 = 0x10000, which does not fit in 16 bits.
2165 pop T0
2166 jb .div_no_overflow
2167 ja .div_overflow
2168 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2169 cmp T0_16, T1_16
2170 jae .div_overflow
2171 jmp .div_no_overflow
2172
2173.divisor_negative:
2174 neg T1_16 ; Only the low word changes, the upper half of T1_32 stays zero.
2175 test T0_32, T0_32
2176 jns .one_of_each
2177 neg T0_32
2178.both_positive: ; Same as unsigned shifted by sign indicator bit.
2179 shr T0_32, 15
2180 cmp T0_32, T1_32 ; 32-bit compare, see .one_of_each.
2181 jae .div_overflow
2182.div_no_overflow:
2183 %endif
2184
2185 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 ; T1 is reloaded here in both variants, so the range check above may clobber it.
2186 %ifdef ASM_CALL64_GCC
2187 mov T1, A2
2188 mov ax, [A0]
2189 mov dx, [A1]
2190 %1 T1_16
2191 mov [A0], ax
2192 mov [A1], dx
2193 %else
2194 mov T1, A1
2195 mov ax, [A0]
2196 mov dx, [T1]
2197 %1 A2_16
2198 mov [A0], ax
2199 mov [T1], dx
2200 %endif
2201 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2202 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2203 %else
2204 IEM_SAVE_FLAGS A3, %2, %3
2205 %endif
2206 xor eax, eax
2207
2208.return:
2209 EPILOGUE_4_ARGS
2210
2211.div_zero:
2212.div_overflow:
2213 mov eax, -1
2214 jmp .return
2215ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2216
;
; 32-bit variant: divides the 64-bit dividend [A1]:[A0] (high:low dword) by the
; 32-bit divisor in A2 using the host instruction given as macro parameter 1.
;
; @returns  0 in eax on success, -1 on division by zero or quotient overflow
;           (nothing is written back in the failure cases).
; @param    A0      Pointer to the low dividend dword (EAX); receives the quotient.
; @param    A1      Pointer to the high dividend dword (EDX); receives the remainder.
; @param    A2      The divisor.
; @param    A3      EFLAGS argument handed to the IEM_*_FLAGS macros.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Quotient fits only if high dword < divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, T1_32            ; Negating -2^63 yields -2^63 again: such a magnitude never
        js      .div_overflow           ; fits, and the shl below would drop its top bit.
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0                      ; (pop leaves the compare flags intact.)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, T1_32            ; Still negative after negation => dividend was -2^63 => overflow.
        js      .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the original (possibly negative) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Copy the divisor to T1 before edx is loaded.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; Copy the high-dword pointer to T1 before edx is loaded.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Balance the push done by the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2302
%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; 64-bit variant: divides the 128-bit dividend [A1]:[A0] (high:low qword) by the
; 64-bit divisor in A2 using the host instruction given as macro parameter 1.
;
; @returns  0 in eax on success, -1 on division by zero or quotient overflow
;           (nothing is written back in the failure cases).
; @param    A0      Pointer to the low dividend qword (RAX); receives the quotient.
; @param    A1      Pointer to the high dividend qword (RDX); receives the remainder.
; @param    A2      The divisor.
; @param    A3      EFLAGS argument handed to the IEM_*_FLAGS macros.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Quotient fits only if high qword < divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1, T1                  ; Negating -2^127 yields -2^127 again: such a magnitude never
        js      .div_overflow           ; fits, and the shl below would drop its top bit.
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0                      ; (pop leaves the compare flags intact.)
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1, T1                  ; Still negative after negation => dividend was -2^127 => overflow.
        js      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; Restore the original (possibly negative) divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Copy the divisor to T1 before rdx is loaded.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; Copy the high-qword pointer to T1 before rdx is loaded.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Balance the push done by the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
%endif ; RT_ARCH_AMD64
2387
2388%endmacro
2389
; Instantiate the unsigned (div, param 4 = 0) and signed (idiv, param 4 = 1)
; workers, each in three EFLAGS flavours selected by the last parameter:
; 0 = no suffix, 1 = _intel, 2 = _amd.
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, ,       0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, ,       0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
2396
2397
;;
; Macro generating a worker for a memory fence instruction.
;
; The generated function takes no arguments and returns nothing; it simply
; executes the fence instruction on the host and returns.
;
; @param    1       The fence instruction (also used as the function name suffix).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2416
;;
; Alternative memory fence for hosts without SSE2 (no lfence/sfence/mfence).
;
; Pushes xAX and exchanges it with the stack slot just written.  xchg with a
; memory operand is implicitly locked, which is what provides the fencing;
; xAX keeps its value and the stack pointer is restored before returning.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Create a scratch slot holding xAX.
        xchg    xAX, [xSP]              ; Locked exchange with that slot (xAX unchanged).
        add     xSP, xCB                ; Drop the scratch slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
2426
2427
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The current host FPU environment is stored at [xSP], its FCW and FSW fields
; are patched from the guest FXSTATE, and the result is loaded back.
;
; @uses     The fnstenv image at [xSP] (X86FSTENV32P layout, 28 bytes per the
;           Intel SDM; the caller must have reserved the space).  T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK           ; Guest condition code bits only.
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK         ; Keep the host's current TOP.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2453
2454
;;
; Initialize the FPU for the actual instruction being emulated: load parts of
; the guest's control word and status word, and additionally mark the host's
; top register as empty in the tag word when the guest's ST0 is empty.
;
; ASSUMES actual TOP=7
;
; @uses     The fnstenv image at [xSP] (X86FSTENV32P layout; the caller must
;           have reserved the space).  T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK           ; Guest condition code bits only.
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK         ; Keep the host's current TOP.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK        ; T1 = guest TOP.
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear.  Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
2492
2493
;;
; Result of an FPU worker producing one 80-bit value plus the status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; The 80-bit result (5 words = 10 bytes).
    .FSW        resw 1                  ; The resulting FPU status word.
endstruc
2501
2502
;;
; Result of an FPU worker producing two 80-bit values plus the status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit result (5 words = 10 bytes).
    .FSW        resw 1                  ; The resulting FPU status word.
    .r80Result2 resw 5                  ; Second 80-bit result (5 words = 10 bytes).
endstruc
2511
2512
2513;
2514;---------------------- 16-bit signed integer operations ----------------------
2515;
2516
2517
;;
; Converts a 16-bit signed integer value to a 80-bit floating point one
; (fpu register) using fild.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2541
2542
;;
; Stores a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) using fistp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2566
2567
;;
; Stores a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation, using fisttp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2592
2593
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 16-bit signed integer memory operand.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2630
2631
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 16-bit signed integer memory operand, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2662
2663
2664
2665;
2666;---------------------- 32-bit signed integer operations ----------------------
2667;
2668
2669
;;
; Converts a 32-bit signed integer value to a 80-bit floating point one
; (fpu register) using fild.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2693
2694
;;
; Stores a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) using fistp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2718
2719
;;
; Stores a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation, using fisttp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2744
2745
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 32-bit signed integer memory operand.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2782
2783
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 32-bit signed integer memory operand, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2814
2815
2816
2817;
2818;---------------------- 64-bit signed integer operations ----------------------
2819;
2820
2821
;;
; Converts a 64-bit signed integer value to a 80-bit floating point one
; (fpu register) using fild.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2845
2846
;;
; Stores a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) using fistp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2870
2871
;;
; Stores a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation, using fisttp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2896
2897
2898
2899;
2900;---------------------- 32-bit floating point operations ----------------------
2901;
2902
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register)
; by loading it with fld.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2926
2927
;;
; Stores a 80-bit floating point value (register) as a 32-bit one (memory)
; using fst.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2951
2952
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 32-bit floating point memory operand.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2989
2990
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 32-bit floating point memory operand, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3021
3022
3023
3024;
3025;---------------------- 64-bit floating point operations ----------------------
3026;
3027
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register)
; by loading it with fld.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3051
3052
;;
; Stores a 80-bit floating point value (register) as a 64-bit one (memory)
; using fst.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3076
3077
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 64-bit floating point memory operand.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3114
;;
; Generates a worker for an FPU instruction taking one 80-bit floating point
; value (loaded into ST0) and one 64-bit floating point memory operand, only
; returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3145
3146
3147
3148;
3149;---------------------- 80-bit floating point operations ----------------------
3150;
3151
;;
; Loads a 80-bit floating point register value from memory (fld m80fp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3175
3176
;;
; Stores a 80-bit floating point register to memory (fstp m80fp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3200
3201
;;
; Loads an 80-bit floating point register value from a packed BCD value in
; memory (fbld).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3225
3226
;;
; Stores a 80-bit floating point register to memory as packed BCD (fbstp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3250
3251
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values; the result left in ST0 is returned.
;
; @param    1       The instruction
; @param    2       The instruction operands ({st0, st1}, or {} for
;                   instructions with implicit operands).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Pushed first, so this ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3292
3293
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, ST1 and ST0, storing the result in ST1 and popping the stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; Pushed first, so this ends up in ST1.
        fld     tword [A3]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3329
3330
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values (as "<instr> st0, st1"), only returning FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Pushed first, so this ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3363
3364
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values (as "<instr> st1"), returning FSW and EFLAGS (eax).
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A3]              ; Pushed first, so this ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word  [A1]
        pushf                           ; Capture the flags right after the compare ...
        pop     xAX                     ; ... and return them in xAX.

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3400
3401
;;
; Generates a worker for an FPU instruction working on one 80-bit floating
; point value (loaded into ST0); the result left in ST0 is returned.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the fnstenv/fldenv image.

        fninit
        fld     tword [A2]              ; ST0 = operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear pending exceptions before the fstp below.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3439
3440;;
3441; FPU instruction working on one 80-bit floating point value, only
3442; returning FSW.
3443;
3444; @param 1 The instruction
3445; @param 2 Non-zero to also restore FTW.
3446;
3447; @param A0 FPU context (fxsave).
3448; @param A1 Pointer to a uint16_t for the resulting FSW.
3449; @param A2 Pointer to the 80-bit value.
3450;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch stack, released again before the epilogue

        fninit
        fld     tword [A2]              ; operand -> ST0
        ; Parameter 2 selects which state-loading helper is expanded.
%if %2 == 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%endif
        %1

        fnstsw  word [A1]               ; only the status word is returned

        fninit                          ; leave the FPU reset
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3475
3476
3477
3478;;
3479; FPU instruction loading a 80-bit floating point constant.
3480;
3481; @param 1 The instruction
3482;
3483; @param A0 FPU context (fxsave).
3484; @param A1 Pointer to a IEMFPURESULT for the output.
3485;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch stack, released again before the epilogue

        ; Reset the FPU and let the instruction push its constant into ST0.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Capture FSW first, then clear pending exceptions before popping the constant.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU reset
        add     xSP, 20h
        EPILOGUE_2_ARGS
        ; The name must match BEGINPROC_FASTCALL exactly; no trailing '%+' here.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3512
3513
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch stack, released again before the epilogue

        fninit
        fld     tword [A2]              ; operand -> ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Capture FSW, then pop the two values: the top of stack is stored as
        ; r80Result2, the one below it as r80Result1.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the FPU reset
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3548
3549
3550
3551
3552;---------------------- SSE and MMX Operations ----------------------
3553
;
; Per-instruction-set entry/exit hooks.  All six currently expand to nothing.
;

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3571
3572
3573;;
3574; Media instruction working on two full sized registers.
3575;
3576; @param 1 The instruction
3577; @param 2 Whether there is an MMX variant (1) or not (0).
3578;
3579; @param A0 FPU context (fxsave).
3580; @param A1 Pointer to the first media register size operand (input/output).
3581; @param A2 Pointer to the second media register size operand (input).
3582;
%macro IEMIMPL_MEDIA_F2 2
        ; MMX flavour, emitted only when parameter 2 is non-zero.
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]               ; source
        movq    mm0, [A1]               ; destination (also the first input)
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

        ; SSE flavour, always emitted.  A0 is not referenced by either flavour.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]              ; source
        movdqu  xmm0, [A1]              ; destination (also the first input)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3673
3674
3675;;
3676; Media instruction working on two full sized registers, but no FXSAVE state argument.
3677;
3678; @param 1 The instruction
3679; @param 2 Whether there is an MMX variant (1) or not (0).
3680;
3681; @param A0 Pointer to the first media register size operand (input/output).
3682; @param A1 Pointer to the second media register size operand (input).
3683;
%macro IEMIMPL_MEDIA_OPT_F2 2
        ; MMX flavour, emitted only when parameter 2 is non-zero.
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; source
        movq    mm0, [A0]               ; destination (also the first input)
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

        ; SSE flavour, always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; source
        movdqu  xmm0, [A0]              ; destination (also the first input)
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3741
3742;;
3743; Media instruction working on one full sized and one half sized register (lower half).
3744;
3745; @param 1 The instruction
3746; @param 2 1 if MMX is included, 0 if not.
3747;
3748; @param A0 Pointer to the first full sized media register operand (input/output).
3749; @param A1 Pointer to the second half sized media register operand (input).
3750;
%macro IEMIMPL_MEDIA_F1L1 2
        ; MMX flavour, emitted only when parameter 2 is non-zero.
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]               ; second operand, loaded in full
        movq    mm0, [A0]               ; first operand / destination
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

        ; SSE flavour, always emitted.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]              ; second operand, loaded in full
        movdqu  xmm0, [A0]              ; first operand / destination
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3785
3786
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param    1       The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3831
3832
3833;;
3834; Media instruction working on one full sized and one half sized register (high half).
3835;
3836; @param 1 The instruction
3837; @param 2 1 if MMX is included, 0 if not.
3838;
3839; @param A0 Pointer to the first full sized media register operand (input/output).
3840; @param A1 Pointer to the second full sized media register operand, where we
3841; will only use the upper half as input - but we'll load it in full.
3842;
; The high-half variant has the same code shape as the low-half one, so it
; simply forwards to IEMIMPL_MEDIA_F1L1; only the instruction differs.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper (expands to exactly the same code).
IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3851
3852
3853;;
3854; Media instruction working two half sized input registers (high half) and a full sized
3855; destination register (vpunpckh*).
3856;
3857; @param 1 The instruction
3858;
3859; @param A0 Pointer to the destination register (full sized, output only).
3860; @param A1 Pointer to the first full sized media source register operand, where we
3861; will only use the upper half as input - but we'll be loading it in full.
3862; @param A2 Pointer to the second full sized media source register operand, where we
3863; will only use the upper half as input - but we'll be loading it in full.
3864;
; Same code shape as the low-half three-operand template; only the
; instruction passed in differs.
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
</full_update_placeholder_removed>
3873
3874
3875;
3876; Shufflers with evil 8-bit immediates.
3877;
3878
;
; pshufw with a run-time immediate.
;
; A0 points to the 64-bit destination, A1 to the 64-bit source and A2 holds the
; 8-bit shuffle immediate.  The immediate selects one of 256 five-byte
; 'pshufw mm0, mm1, imm / ret' stubs generated below, which is then called.
;
; NOTE(review): the byte count given to BEGINPROC_FASTCALL is 16 although only
; three arguments are used - confirm against the C prototype.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (was the no-op 'movq mm0, mm0')
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3904
3905
;
; SSE shuffles with a run-time immediate: A0 = destination pointer,
; A1 = source pointer, A2 = 8-bit immediate selecting one of 256 six-byte stubs.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; stub address = .imm0 + A2 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3937
3938
;
; AVX (256-bit) shuffles with a run-time immediate: A0 = destination pointer,
; A1 = source pointer, A2 = 8-bit immediate selecting one of 256 six-byte stubs.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code uses the AVX hooks, not the SSE ones

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(vpshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3970
3971
3972;
3973; Shifts with evil 8-bit immediates.
3974;
3975
;
; MMX shifts by a run-time immediate: A0 = pointer to the 64-bit operand
; (input/output), A1 = 8-bit shift count selecting one of 256 five-byte stubs.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [T1 + T0]           ; stub address = .imm0 + A1 * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4011
4012
;
; SSE shifts by a run-time immediate: A0 = pointer to the 128-bit operand
; (input/output), A1 = 8-bit shift count selecting one of 256 six-byte stubs.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; stub address = .imm0 + A1 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4050
4051
4052;
4053; Move byte mask.
4054;
4055
; A0 = pointer to the 64-bit result, A1 = pointer to the 64-bit MMX source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1
        mov      [A0], T0
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; T0 only covers the low dword here; zero the high one
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4069
; A0 = pointer to the 64-bit result, A1 = pointer to the 128-bit SSE source.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1
        mov      [A0], T0
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0      ; T0 only covers the low dword here; zero the high one
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4083
; A0 = pointer to the 64-bit result, A1 = pointer to the 256-bit AVX source.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1
        mov       [A0], T0
%ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0     ; T0 only covers the low dword here; zero the high one
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4097
4098
4099;;
4100; Media instruction working on two full sized source registers and one destination (AVX).
4101;
4102; @param 1 The instruction
4103;
4104; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4105; @param A1 Pointer to the destination media register size operand (output).
4106; @param A2 Pointer to the first source media register size operand (input).
4107; @param A3 Pointer to the second source media register size operand (input).
4108;
%macro IEMIMPL_MEDIA_F3 1
        ; 128-bit flavour.  A0 is not referenced by the code below.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; 256-bit flavour.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4170
4171
4172;;
4173; Media instruction working on two full sized source registers and one destination (AVX),
4174; but no XSAVE state pointer argument.
4175;
4176; @param 1 The instruction
4177;
4178; @param A0 Pointer to the destination media register size operand (output).
4179; @param A1 Pointer to the first source media register size operand (input).
4180; @param A2 Pointer to the second source media register size operand (input).
4181;
%macro IEMIMPL_MEDIA_OPT_F3 1
        ; 128-bit flavour.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; 256-bit flavour.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4238
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
; NOTE(review): BEGINPROC_FASTCALL is given 12 although only two arguments are
; used - confirm against the C prototypes.
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4281
4282
4283;
4284; The SSE 4.2 crc32
4285;
; @param    A0      Pointer to the 32-bit destination.
; @param    A1      The source operand, sized according to the suffix.
4288;
; Folds the low byte of A1 into the 32-bit CRC stored at [A0].
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current CRC value
        crc32   T0_32, A1_8
        mov     [A0], T0_32             ; write back the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4298
; Folds the low word of A1 into the 32-bit CRC stored at [A0].
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current CRC value
        crc32   T0_32, A1_16
        mov     [A0], T0_32             ; write back the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4308
; Folds the low dword of A1 into the 32-bit CRC stored at [A0].
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current CRC value
        crc32   T0_32, A1_32
        mov     [A0], T0_32             ; write back the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4318
%ifdef RT_ARCH_AMD64
; Folds the full A1 register into the 32-bit CRC stored at [A0] (AMD64 only).
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; current CRC value
        crc32   T0, A1
        mov     [A0], T0_32             ; only the low 32 bits are written back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4330
4331
4332;
4333; PTEST (SSE 4.1)
4334;
4335; @param A0 Pointer to the first source operand (aka readonly destination).
4336; @param A1 Pointer to the second source operand.
4337; @param A2 Pointer to the EFLAGS register.
4338;
; 128-bit PTEST: both operands are only read; the flags are saved via A2.
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4351
; 256-bit VPTEST: both operands are only read; the flags are saved via A2.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; AVX code uses the AVX hooks, not the SSE ones

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4364
4365
4366;;
4367; Template for the [v]pmov{s,z}x* instructions
4368;
4369; @param 1 The instruction
4370;
4371; @param A0 Pointer to the destination media register size operand (output).
4372; @param A1 The source operand value (input).
4373;
;
; NOTE(review): BEGINPROC_FASTCALL is given 12 although only two arguments are
; used - confirm against the C prototypes.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
        ; Legacy SSE flavour: A1 holds the source value, A0 points to the destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; non-VEX store so the SSE path needs no AVX support

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

        ; VEX 128-bit flavour: same argument layout as the SSE one.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

        ; VEX 256-bit flavour: here A1 is a pointer to the 128-bit source.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; VEX load; ymm0 is fully overwritten below
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4425
4426
4427;;
4428; Need to move this as well somewhere better?
4429;
struc IEMSSERESULT
        .uResult        resd 4          ; 128-bit result value
        .MXCSR          resd 1          ; MXCSR value returned alongside it
endstruc
4434
4435
4436;;
4437; Need to move this as well somewhere better?
4438;
struc IEMAVX128RESULT
        .uResult        resd 4          ; 128-bit result value
        .MXCSR          resd 1          ; MXCSR value returned alongside it
endstruc
4443
4444
4445;;
4446; Need to move this as well somewhere better?
4447;
struc IEMAVX256RESULT
        .uResult        resd 8          ; 256-bit result value
        .MXCSR          resd 1          ; MXCSR value returned alongside it
endstruc
4452
4453
4454;;
4455; Initialize the SSE MXCSR register using the guest value partially to
4456; account for rounding mode.
4457;
4458; @uses 4 bytes of stack to save the original value, T0.
4459; @param 1 Expression giving the address of the FXSTATE of the guest.
4460;
%macro SSE_LD_FXSTATE_MXCSR 1
        ; Save the current MXCSR in a 4-byte stack slot that stays allocated
        ; when this macro ends.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Keep only FZ, RC and DAZ of the guest value and mask all exceptions.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr needs a memory operand: go through a temporary stack slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4473
4474
4475;;
4476; Restores the SSE MXCSR register with the original value.
4477;
4478; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4479; @param 1 Expression giving the address where to return the MXCSR value.
4480; @param 2 Expression giving the address of the FXSTATE of the guest.
4481;
4482; @note Restores the stack pointer.
4483;
%macro SSE_ST_FXSTATE_MXCSR 2
        ; Read the current MXCSR into T0 through a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Result MXCSR = guest MXCSR with the new exception flags ORed in.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ; Reload MXCSR from the slot at the top of the stack and release it.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4498
4499
;;
; Initialize the MXCSR register for an AVX operation using the guest value
; partially to account for rounding mode.
;
; All exception mask bits are set in the loaded value so that a SIMD floating
; point exception raised by the emulated instruction does not trap here.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
; @note     The 4-byte slot holding the original MXCSR is left on the stack.
; @note     NOTE(review): the AVX*_ST_XSAVEAREA_MXCSR macros store the raw MXCSR,
;           so the value they return includes the mask bits set here - confirm
;           that callers only consume the exception flags.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK      ; never run with unmasked exceptions
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4518
4519
4520;;
4521; Restores the AVX128 MXCSR register with the original value.
4522;
4523; @param 1 Expression giving the address where to return the MXCSR value.
4524;
4525; @note Restores the stack pointer.
4526;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        ; Return the current MXCSR as-is in the result structure.
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]

        ; Reload MXCSR from the slot at the top of the stack and release it.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4533
4534
4535;;
4536; Restores the AVX256 MXCSR register with the original value.
4537;
4538; @param 1 Expression giving the address where to return the MXCSR value.
4539;
4540; @note Restores the stack pointer.
4541;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        ; Return the current MXCSR as-is in the result structure.
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]

        ; Reload MXCSR from the slot at the top of the stack and release it.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4548
4549
4550;;
4551; Floating point instruction working on two full sized registers.
4552;
4553; @param 1 The instruction
4554; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4555;
4556; @param A0 FPU context (FXSTATE or XSAVEAREA).
4557; @param A1 Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input).
4559; @param A3 Pointer to the second media register size operand (input).
4560;
;
; NOTE(review): every procedure below passes 12 to BEGINPROC_FASTCALL while using
; PROLOGUE_4_ARGS; other four-argument workers in this file pass 16.  Confirm
; against the C prototypes before changing it.
;
%macro IEMIMPL_FP_F2 2
        ; Legacy SSE flavour, always emitted.  The result and the merged MXCSR
        ; are written to the IEMSSERESULT at A1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; exit hook (was the prologue hook by mistake)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
        ; AVX flavours for instructions whose VEX form takes three operands.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
        ; AVX flavours for instructions whose VEX form takes two operands.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4673
4674
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Emits the SSE worker iemAImpl_<insn>_u128_r32 and the AVX worker
; iemAImpl_v<insn>_u128_r32.
;
; @param    1       The instruction (the AVX worker prefixes it with 'v').
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input; the
;                   result is returned via A1).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; only the 32-bit scalar is fetched
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm0, [A2]
        vmovd    xmm1, [A3]             ; only the 32-bit scalar is fetched
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; was a second IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
4725
4726
;;
; Scalar double precision template: a full sized media register combined with
; a 64-bit floating point operand.
;
; Two workers are generated per instruction: iemAImpl_<insn>_u128_r64 (SSE)
; and iemAImpl_v<insn>_u128_r64 (AVX).
;
; @param    1       The instruction (the AVX worker prefixes it with 'v').
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movq    xmm1, [A3]              ; 64-bit scalar source
        movdqu  xmm0, [A2]              ; full register operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovq    xmm1, [A3]             ; 64-bit scalar source
        vmovdqu  xmm0, [A2]             ; full register operand
        v %+ %1  xmm0, xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4777
4778
;;
; Template for the cvtpd2ps/cvtps2pd instructions.
;
; Generates the SSE u128 worker plus the AVX u128 and u256 workers.
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;                   With 0 the u256 worker converts ymm1 into xmm0, otherwise
;                   it converts xmm1 into ymm0.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm1, [A3]
        movdqu  xmm0, [A2]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  xmm1, [A3]
        vmovdqu  xmm0, [A2]
        v %+ %1  xmm0, xmm1
        vmovdqu  [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu  ymm1, [A3]
        vmovdqu  ymm0, [A2]
 %if %2 == 0
        v %+ %1  xmm0, ymm1             ; narrowing: 256-bit source, 128-bit result
 %else
        v %+ %1  ymm0, xmm1             ; widening: 128-bit source, 256-bit result
 %endif
        vmovdqu  [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4843
4844
;;
; shufps with an 8-bit immediate.
;
; The immediate cannot be encoded at run time, so a table of 256 stubs (one
; per immediate value, 6 bytes each) follows the function and the matching
; stub is called indirectly.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: (A2 * 3) *2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4878
4879
;;
; shufpd with an 8-bit immediate.
;
; Dispatches through a table of 256 stubs (6 bytes each: instruction + ret),
; one per immediate value.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: (A2 * 3) *2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4912
4913
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Each worker dispatches through a table of 256 stubs (6 bytes each:
; instruction + ret), one per immediate value.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate (the whole register is used as table index).
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded moves, like the u256 worker, so legacy SSE and VEX
        ; instructions are not mixed within this AVX worker.
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
4980
4981
;;
; One of the [p]blendv{b,ps,pd} variants
;
; The mask operand of these instructions is implicitly xmm0, so the mask is
; loaded there and the two explicit operands go through xmm1/xmm2.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; was a second IEMIMPL_SSE_PROLOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5010
5011
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Generates a 128-bit and a 256-bit worker.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]              ; mask
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was a second IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]              ; mask
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was a second IEMIMPL_AVX_PROLOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5054
5055
;;
; palignr mm1, mm2/m64 instruction.
;
; Dispatches through a table of 256 stubs (6 bytes each: instruction + ret),
; one per immediate value.
;
; @param    A0      Pointer to the first media register sized operand; it is
;                   loaded into mm0 before and written back after the
;                   instruction (input/output).
; @param    A1      The second register sized operand, passed by value (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, A1
        movq    mm0, [A0]
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: (A2 * 3) *2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5087
5088
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Together with the ret and the int3 padding each stub is 8 bytes, so the
; stub for a given immediate lives at .imm0 + imm8 * 8.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        lea     T0, [A2 + A2*3]         ; sizeof(insnX+ret) == 8: (A2 * 4) * 2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5135
5136
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Together with the ret and the int3 padding each stub is 8 bytes, so the
; stub for a given immediate lives at .imm0 + imm8 * 8.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate (the whole register is used as table index).
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded moves, like the u256 worker, so legacy SSE and VEX
        ; instructions are not mixed within this AVX worker.
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
  %assign bImm 0
  %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
   %assign bImm bImm + 1
  %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5213
5214
;;
; Source operand pair handed to the pcmpistri worker; two 128-bit values
; (4 dwords each) laid out back to back.
;
; @todo Need to move this as well somewhere better?
;
struc IEMPCMPISTRISRC
    .uSrc1              resd 4          ; first source operand
    .uSrc2              resd 4          ; second source operand
endstruc
5222
;;
; The pcmpistri instruction.
;
; The instruction delivers its result in ECX and the status flags; both are
; captured after the stub for the requested immediate has run.  Each stub is
; 8 bytes (instruction + ret + int3).
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPISTRISRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRISRC.uSrc2]
        ; A0 can be ecx/rcx in some calling conventions which gets overwritten
        ; later, so keep the pointer in T2 (T2 only available on AMD64).
        mov     T2, A0
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 8
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; index result produced by pcmpistri

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*8 == 0x800
dw 0xf7ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5260
5261
;;
; pinsrw instruction.
;
; The MMX form encodes in 4 bytes, so each stub (instruction + ret) is 5 bytes
; and the table is indexed with imm8 * 5.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5
        lea     T1, [T1 + T0]           ; T1 = .imm0 + imm8 * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5293
;
; pinsrw, 128-bit (xmm) form.  Same arguments as the 64-bit worker: A0 points
; to the media register operand (input/output), A1 holds the 16 bit value and
; A2 the 8-bit immediate.  Each stub (instruction + ret) is 6 bytes here.
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: (A2 * 3) *2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5318
;;
; vpinsrw instruction.
;
; Dispatches through a table of 256 stubs (6 bytes each: instruction + ret).
; This is a VEX encoded worker, so it uses the AVX prologue/epilogue and VEX
; encoded moves like the other AVX workers in this file.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 16 bit input operand (input).
; @param    A3      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5352
5353
;;
; pextrw instruction.
;
; The MMX form encodes in 4 bytes, so each stub (instruction + ret) is 5 bytes
; and the table is indexed with imm8 * 5.  The stub leaves the extracted word
; in T0, which is why T0 may be used for the index computation beforehand.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The media register size operand; this 64-bit worker takes
;                   it by value (movq mm0, A1), the 128-bit worker by pointer (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, A1
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5
        lea     T1, [T1 + T0]           ; T1 = .imm0 + imm8 * 5
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5385
;
; pextrw, 128-bit (xmm) form.  A0 points to the 16-bit output, A1 points to
; the source media register operand and A2 holds the 8-bit immediate.  Each
; stub (instruction + ret) is 6 bytes; the extracted word comes back in T0.
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * 6
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; Stub table, indexed by the immediate.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5410
;;
; vpextrw instruction.
;
; Dispatches through a table of 256 stubs (6 bytes each: instruction + ret);
; the stub leaves the extracted word in T0.  This is a VEX encoded worker, so
; it uses the AVX prologue/epilogue and a VEX encoded load like the other AVX
; workers in this file.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate (the whole register is used as table index).
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5442
5443
;;
; movmskp{s,d} SSE instruction template
;
; Generates the SSE u128 worker and the AVX u128/u256 workers.  Only the low
; byte of the mask produced in T0 is stored.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded load, like the u256 worker, so legacy SSE and VEX
        ; instructions are not mixed within this AVX worker.
        vmovdqu xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5493
5494
;;
; Returns the guest MXCSR merged with the freshly raised exception flags and
; reloads MXCSR from the dword at the top of the stack.
;
; The current MXCSR is read through a temporary stack slot, reduced to its
; exception flag bits and OR'ed with the MXCSR value of the guest FXSTATE.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer: the final ldmxcsr/add pair consumes a
;           dword this macro did not push - presumably the value saved by the
;           matching load macro; verify against SSE_LD_FXSTATE_MXCSR.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; Fetch the current MXCSR via a scratch stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload MXCSR from the stack and drop that slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5518
5519
;;
; cvttsd2si instruction - 32-bit variant.
;
; Converts the double precision value at [A3] into a 32-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]
        mov     dword [A2], T0_32       ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5540
;;
; cvttsd2si instruction - 64-bit variant.
;
; Converts the double precision value at [A3] into a 64-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     qword [A2], T0          ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5561
5562
;;
; cvtsd2si instruction - 32-bit variant.
;
; Converts the double precision value at [A3] into a 32-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32       ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5583
;;
; cvtsd2si instruction - 64-bit variant.
;
; Converts the double precision value at [A3] into a 64-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     qword [A2], T0          ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5604
5605
;;
; cvttss2si instruction - 32-bit variant.
;
; Converts the single precision value at [A3] into a 32-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0_32, [A3]
        mov     dword [A2], T0_32       ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
5626
;;
; cvttss2si instruction - 64-bit variant.
;
; Converts the single precision value at [A3] into a 64-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]
        mov     qword [A2], T0          ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5647
5648
;;
; cvtss2si instruction - 32-bit variant.
;
; Converts the single precision value at [A3] into a 32-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0_32, [A3]
        mov     dword [A2], T0_32       ; 32-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
5669
;;
; cvtss2si instruction - 64-bit variant.
;
; Converts the single precision value at [A3] into a 64-bit integer written
; to [A2].  The result is stored before the MXCSR is collected because
; SSE_ST_FXSTATE_MXCSR_ONLY uses T0 and T1.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]
        mov     qword [A2], T0          ; 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5690
5691
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts the 32-bit integer at [A3] into a single precision value; the low
; dword of xmm0 is written to [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, dword [A3]
        movd    dword [A2], xmm0        ; single precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
5712
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts the 64-bit integer at [A3] into a single precision value; the low
; dword of xmm0 is written to [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0        ; single precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5733
5734
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts the 32-bit integer at [A3] into a double precision value; the low
; qword of xmm0 is written to [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0              ; double precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
5755
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts the 64-bit integer at [A3] into a double precision value; the low
; qword of xmm0 is written to [A2].
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0              ; double precision result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5776
5777
5778;;
5779; Initialize the SSE MXCSR register using the guest value partially to
5780; account for rounding mode.
5781;
; Only the FZ, RC and DAZ bits of the guest value are kept; X86_MXCSR_XCPT_MASK
; is ORed in before the value is loaded (presumably so that no SSE exception
; is delivered while executing on behalf of the guest - confirm against the
; constant's definition).
;
; The MXCSR value that was active on entry is left in a 4 byte slot at [xSP],
; so xSP is 4 bytes lower after this macro than before it.  The slot is read
; and released again by SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE.
;
5782; @uses 4 bytes of stack to save the original value, T0.
5783; @param 1 Expression giving the address of the MXCSR register of the guest.
5784;
5785%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
5786 sub xSP, 4 ; Slot for the current MXCSR; stays allocated when the macro ends.
5787
5788 stmxcsr [xSP] ; Save the MXCSR value active on entry.
5789 mov T0_32, [%1] ; Fetch the guest MXCSR value.
5790 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
5791 or T0_32, X86_MXCSR_XCPT_MASK
5792 sub xSP, 4 ; Temporary slot, ldmxcsr only takes a memory operand.
5793 mov [xSP], T0_32
5794 ldmxcsr [xSP]
5795 add xSP, 4 ; Drop the temporary slot only; the saved value is at [xSP] again.
5796%endmacro
5797
5798
5799;;
5800; Restores the SSE MXCSR register with the original value.
5801;
; Before restoring, the current MXCSR is read, reduced to the
; X86_MXCSR_XCPT_FLAGS bits and ORed into the value stored at the address
; given by parameter 1.
;
; Expects the MXCSR value to restore in the 4 byte slot at [xSP] on entry,
; which is how SSE_LD_FXSTATE_MXCSR_ONLY leaves the stack.  That slot is
; released here, so xSP is 4 bytes higher afterwards.
;
5802; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5803; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5804;
5805; @note Restores the stack pointer.
5806;
5807%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
5808 sub xSP, 4 ; Temporary slot, stmxcsr only takes a memory operand.
5809 stmxcsr [xSP]
5810 mov T0_32, [xSP] ; T0 = MXCSR as left by the executed instruction.
5811 add xSP, 4 ; Temporary slot gone, [xSP] is the saved value again.
5812 ; Merge the status bits into the original MXCSR value.
5813 mov T1_32, [%1]
5814 and T0_32, X86_MXCSR_XCPT_FLAGS
5815 or T0_32, T1_32
5816 mov [%1], T0_32
5817
5818 ldmxcsr [xSP] ; Reload the value saved on the stack before the operation.
5819 add xSP, 4 ; Release the save slot.
5820%endmacro
5821
5822
;;
; UCOMISS (SSE) - compares the two source operands and hands the resulting
; EFLAGS status bits to IEM_SAVE_FLAGS for the value A1 points to.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        ucomiss xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
5845
;;
; VUCOMISS - same as iemAImpl_ucomiss_u128, but executing the VEX encoded
; instruction.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        vucomiss xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
5860
5861
;;
; UCOMISD (SSE) - compares the two source operands and hands the resulting
; EFLAGS status bits to IEM_SAVE_FLAGS for the value A1 points to.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        ucomisd xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
5884
;;
; VUCOMISD - same as iemAImpl_ucomisd_u128, but executing the VEX encoded
; instruction.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        vucomisd xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
5899
;;
; COMISS (SSE) - compares the two source operands and hands the resulting
; EFLAGS status bits to IEM_SAVE_FLAGS for the value A1 points to.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        comiss  xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
5922
;;
; VCOMISS - same as iemAImpl_comiss_u128, but executing the VEX encoded
; instruction.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        vcomiss xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
5937
5938
;;
; COMISD (SSE) - compares the two source operands and hands the resulting
; EFLAGS status bits to IEM_SAVE_FLAGS for the value A1 points to.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        comisd  xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
5961
;;
; VCOMISD - same as iemAImpl_comisd_u128, but executing the VEX encoded
; instruction.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm1, [A3]              ; Second source.
        movdqu  xmm0, [A2]              ; First source.
        vcomisd xmm0, xmm1
        ; Capture EFLAGS right away; the MXCSR restore below uses flag modifying instructions.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
5976
5977
5978;;
5979; Need to move this as well somewhere better?
5980;
; Two 16 byte (4 dwords each) source operands laid out back to back; the
; workers below take a pointer to this and load .uSrc1 into xmm0 and .uSrc2
; into xmm1.
; NOTE(review): presumably has to match a C structure of the same name -
; confirm before changing the layout.
;
5981struc IEMMEDIAF2XMMSRC
5982 .uSrc1 resd 4 ; First source operand, offset 0.
5983 .uSrc2 resd 4 ; Second source operand, offset 16.
5984endstruc
5985
5986
;;
; CMPPS (SSE)
;
; The immediate cannot be supplied at runtime, so the worker calls into a
; table of 256 stubs, one 'cmpps xmm0, xmm1, imm8' + 'ret' pair for every
; possible immediate value.  Each stub is 5 bytes, which is what the index
; scaling below and the size checks after the table rely on.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).  Only the low 8 bits are used.
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        ; A3 carries an 8-bit value; the register bits above it are not
        ; guaranteed to be zero, so mask before using it as a table index.
        mov     T0, A3
        and     T0, 0ffh
        lea     T0, [T0 + T0*4]         ; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6022
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; The immediate cannot be supplied at runtime, so the worker calls into a
; table of 256 stubs, one 'xxx xmm0, xmm1, imm8' + 'ret' pair for every
; possible immediate value.  With the 'ret' each stub is 6 bytes, which is
; what the index scaling below and the size checks after the table rely on.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).  Only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        ; A3 carries an 8-bit value; the register bits above it are not
        ; guaranteed to be zero, so mask before using it as a table index.
        mov     T0, A3
        and     T0, 0ffh
        lea     T0, [T0 + T0*2]         ; sizeof(insn+ret) == 6: (imm8 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6069
;;
; Worker template for SSE instructions of the form
;    xxx mm, xmm.
; which need the guest MXCSR loaded before and the result merged back after
; executing the instruction.
;
; Generates iemAImpl_<instruction>_u128.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]              ; 128-bit source.
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6099
;;
; Worker template for SSE instructions of the form
;    xxx xmm, mm/m64.
; which need the guest MXCSR loaded before and the result merged back after
; executing the instruction.
;
; Generates iemAImpl_<instruction>_u128.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    mm0, A2                 ; Source is passed by value.
        movdqu  xmm0, [A1]              ; Current destination content; the instruction may keep part of it.
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6130
;;
; Worker template for SSE instructions of the form
;    xxx mm, xmm/m64.
; which need the guest MXCSR loaded before and the result merged back after
; executing the instruction.
;
; Generates iemAImpl_<instruction>_u128.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2                ; Source is passed by value.
        %1      mm0, xmm0
        movq    [A1], mm0               ; 64-bit result.

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette