VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 95308

Last change: r95308, checked in by vboxsync, 2 years ago

VMM/IEM: Implemented ANDN, BEXTR, SHLX, SARX, SHRX, RORX, TZCNT, and LZCNT. Fixed long-mode bug in 32-bit version of BSR and BSF (would clear the upper 32 bits of the destination register when ZF=1). bugref:9898

; $Id: IEMAllAImpl.asm 95308 2022-06-19 20:40:26Z vboxsync $
;; @file
; IEM - Instruction Implementation in Assembly.
;

;
; Copyright (C) 2011-2022 Oracle Corporation
;
; This file is part of VirtualBox Open Source Edition (OSE), as
; available from http://www.virtualbox.org. This file is free software;
; you can redistribute it and/or modify it under the terms of the GNU
; General Public License (GPL) as published by the Free Software
; Foundation, in version 2 as it comes in the "COPYING" file of the
; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
;


;*********************************************************************************************************************************
;*  Header Files                                                                                                                 *
;*********************************************************************************************************************************
%include "VBox/asmdefs.mac"
%include "VBox/err.mac"
%include "iprt/x86.mac"


;*********************************************************************************************************************************
;*  Defined Constants And Macros                                                                                                 *
;*********************************************************************************************************************************

;;
; RET XX / RET wrapper for fastcall.
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1
 %else
        ret
 %endif
%else
        ret
%endif
%endmacro
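;
; Illustrative expansion (comment only, a sketch): for a function taking 12
; bytes of arguments, 'RET_FASTCALL 12' yields 'ret 12' on 32-bit Windows
; (fastcall callee cleans the stack) and a plain 'ret' on all other targets.
;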

;;
; NAME for fastcall functions.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
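;
; Example (comment only): on 32-bit Windows, NAME_FASTCALL(iemAImpl_add_u8, 12, @)
; produces the fastcall-decorated symbol '@iemAImpl_add_u8@12'; on all other
; targets it degenerates to plain NAME(iemAImpl_add_u8).
;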

;;
; BEGINPROC for fastcall functions.
;
; @param 1        The function name (C).
; @param 2        The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
   export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro


;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0]
        mov     esi, [esp + 12 + 4 + 4]
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
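
;
; Quick-reference summary of the argument/temporary register mappings above:
;
;                 A0    A1    A2    A3      T0    T1    T2
;   AMD64/GCC     rdi   rsi   rdx   rcx     rax   r11   r10
;   AMD64/MSC     rcx   rdx   r8    r9      rax   r11   r10
;   x86           ecx   edx   ebx   esi     eax   edi   -
;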


;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1        The parameter (A0..A3) pointing to the eflags.
; @param 2        The set of modified flags.
; @param 3        The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
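;
; Net effect of IEM_MAYBE_LOAD_FLAGS, as a sketch:
;   EFLAGS = (EFLAGS & ~(%2 | %3)) | (*%1 & (%2 | %3))
; i.e. the host keeps its own flags except for the modified/undefined set,
; which is taken from the guest eflags.
;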

;;
; Update the flags.
;
; @remarks Clobbers T0, T1, stack.
; @param 1        The register pointing to the EFLAGS.
; @param 2        The mask of modified flags to save.
; @param 3        The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
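;
; Net effect of IEM_SAVE_FLAGS, as a sketch (the inverse of the load above):
;   *%1 = (*%1 & ~(%2 | %3)) | (EFLAGS & (%2 | %3))
;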

;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks Clobbers T0, T1, stack.
; @param 1        The register pointing to the EFLAGS.
; @param 2        The mask of modified flags to save.
; @param 3        Mask of additional flags to always clear.
; @param 4        Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro

;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where the result (%4 & %6) ends up in xAX,
; which is also T0.  So, we have to use T1 for the EFLAGS calculation and save
; T0/xAX while we extract the %2 flags from the CPU EFLAGS, or use T2 (AMD64 only).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1        The register pointing to the EFLAGS.
; @param 2        The mask of modified flags to save.
; @param 3        Mask of additional flags to always clear.
; @param 4        The result register to set SF by.
; @param 5        The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6        The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
 %else
        push    T0
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro

;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0.
; @param 1        The register pointing to the EFLAGS.
; @param 2        Mask of additional flags to always clear.
; @param 3        Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro

;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; @remarks Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
; @param 1        The register pointing to the EFLAGS.
; @param 2        Mask of additional flags to always clear.
; @param 3        Mask of additional flags to always set.
; @param 4        The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro


;*********************************************************************************************************************************
;*  External Symbols                                                                                                             *
;*********************************************************************************************************************************
extern NAME(g_afParity)


;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1        The instruction mnemonic.
; @param 2        Non-zero if there should be a locked version.
; @param 3        The modified flags.
; @param 4        The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
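
;
; Each instantiation below generates workers with a C prototype on this
; (assumed) form, shown here for 'add':
;   IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *pu8Dst, uint8_t u8Src, uint32_t *pEFlags));
; plus the _u16/_u32/_u64 widths and, when requested, the _locked variants.
;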

; instr, lock, modified-flags, undefined-flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF


;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
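
;
; Illustrative expansion (comment only): with the GCC register mapping,
; IEMIMPL_VEX_BIN_OP andn generates iemAImpl_andn_u32 executing
; 'andn eax, esi, edx' (T0_32, A1_32, A2_32) and storing eax at [rdi] (A0),
; with EFLAGS merged via the IEM_*_FLAGS macros above.
;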


;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  No eflags are modified.
;
; @param 1        The instruction mnemonic.
; @param 2        The fallback instruction mnemonic, taking the count in cl.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
 %else
        xchg    A2, A0
        %2      A1, cl
        mov     [A2], A1                ; A2 holds the destination pointer after the xchg.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr
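;
; Note on the fallbacks (a sketch): the plain sar/shl/shr substitutes take the
; count in cl and clobber the host EFLAGS, but since nothing is saved back to
; the guest eflags this approximates the flag-preserving SARX/SHLX/SHRX
; semantics.
;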


;
; RORX uses an immediate byte for the shift count, so we only provide a
; fallback implementation for it.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
  %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1                ; A2 holds the destination pointer after the xchg.
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64


;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1        The instruction mnemonic.
; @param 2        Non-zero if there should be a locked version.
; @param 3        The modified flags.
; @param 4        The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)

;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems
; that both AMD and Intel just leave it as is.  The undefined EFLAGS differ
; between AMD and Intel, and according to https://www.sandpile.org/x86/flags.htm
; also between Intel microarchitectures.  We only implement the 'intel' and
; 'amd' variations with the behaviour of more recent CPUs (Intel 10980X and
; AMD 3990X).
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
; @param 4        Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
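
;
; Note: bsf/bsr pass 1 as the last argument (destination left untouched when
; ZF=1, matching the CPU behaviour described above), while tzcnt/lzcnt pass 0
; and always write the destination (the operand size when the source is zero).
;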


;
; IMUL is a similar yet different case (no lock, no memory destination).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param 1        EFLAGS that are modified.
; @param 2        Undefined EFLAGS.
; @param 3        Function suffix.
; @param 4        EFLAGS variation: 0 for native, 1 for intel (ignored),
;                 2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2


;
; XCHG for memory operands.  This implies locking.  No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register.  They all return void.
;
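; Assumed C prototype on the form:
;   IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *pu8Mem, uint8_t *pu8Reg));
;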
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        xchg    [A0], T0_8
        mov     [A1], T0_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif

; Unlocked variants for fDisregardLock mode.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        mov     T1_8, [A0]
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif


;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags.  They all return void.
;
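; Assumed C prototype on the form:
;   IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u8,(uint8_t *pu8Dst, uint8_t *pu8Reg, uint32_t *pEFlags));
;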
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        xadd    [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64


;
; CMPXCHG8B.
;
; These are tricky register-wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b

BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked

%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register-wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64


;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz      .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked

;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
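
; Assumed C prototype on the form (shown for 'inc'):
;   IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *pu8Dst, uint32_t *pEFlags));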


;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output). They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db      66h
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
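; Note: the 'db 66h' above forces an operand-size prefixed BSWAP, whose result
; is architecturally undefined; executing the very same encoding here simply
; reproduces whatever the host CPU does for a guest 16-bit BSWAP.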

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        mov     T1, [A0 + 4]
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0
        mov     [A0], T1
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64


;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
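
;
; Note on the xchg trick above (a sketch): the shift count must end up in cl.
; With the MSC and 32-bit x86 conventions A0 maps to rcx/ecx, so 'xchg A1, A0'
; moves the count into cl while the destination pointer moves over to A1.
;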

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)


;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The functions take the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_16, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
  %else
        xchg    A0, A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)


;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can use the same code path for
; div/idiv as well as for the mul/imul implementation.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
; @param 4        Name suffix.
; @param 5        EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
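
;
; Illustrative expansion (comment only): for 'mul' the _u8 worker loads al
; from [A0], executes 'mul A1_8' (ax = al * A1_8) and stores the full 16-bit
; product back at [A0]; rAX/rDX being fixed mul/div operands is also why T0
; (xAX) cannot serve as a temporary in these workers.
;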


BEGINCODE
;;
; Worker function for negating the 64-bit value held in T1:T0 (two 32-bit registers).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0
        push    0
        xchg    T0_32, [xSP]
        xchg    T1_32, [xSP + xCB]
        sub     T0_32, [xSP]
        sbb     T1_32, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
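; In effect: T1:T0 = 0 - T1:T0, computed with sub/sbb through two stack slots
; since no spare register is available for the borrow chain.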

%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit value held in T1:T0 (two 64-bit registers).
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0
        push    0
        xchg    T0, [xSP]
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif


;;
; Macro for implementing division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.
;
; @param 1        The instruction mnemonic.
; @param 2        The modified flags.
; @param 3        The undefined flags.
; @param 4        1 if signed, 0 if unsigned.
; @param 5        Function suffix.
; @param 6        EFLAGS variation: 0 for native, 1 for intel (ignored),
;                 2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, but we
        ; haven't found a simple way to check signed division yet, unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; Divide by zero check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, but we have
        ; not found a simple way to check signed division for overflow yet.
 %if %4 == 0
        cmp     [A1], A2_16
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; Divide by zero check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, but we have
        ; not found a simple way to check signed division for overflow yet.
 %if %4 == 0
        cmp     [A1], A2_32
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we can modify it.
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS A3, %2, %3
  %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2
  %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div,  0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0, 0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd,   2
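
;
; Sketch of what the six invocations above emit: for each of 'div' and 'idiv'
; there is a native, an _intel and an _amd EFLAGS variant of every width,
; e.g. for the unsigned 16-bit case:
;
;       iemAImpl_div_u16        ; native host flag behaviour
;       iemAImpl_div_u16_intel  ; Intel-style undefined flags
;       iemAImpl_div_u16_amd    ; AMD-style: AF set; PF, ZF and SF cleared
;
; All of them return 0 on success or -1 so the caller can raise #DE.
;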


;;
; Macro for implementing a memory fence operation.
;
; No return value, no operands or anything.
;
; @param 1 The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
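
;
; Sketch of the expansion for the first invocation above:
;
;       BEGINPROC_FASTCALL iemAImpl_lfence, 0
;               lfence
;               ret
;       ENDPROC iemAImpl_lfence
;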

;;
; Alternative for non-SSE2 hosts: XCHG with a memory operand has an implicit
; LOCK prefix and therefore acts as a full memory barrier.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]
        add     xSP, xCB
        ret
ENDPROC iemAImpl_alt_mem_fence


;;
; Initialize the FPU for the actual instruction being emulated; this means
; loading parts of the guest's control word and status word.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro


;;
; Initialize the FPU for the actual instruction being emulated; this means
; loading parts of the guest's control word and status word, and updating the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
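
;
; Note on the 0c000h constant above: the FTW image stored by FNSTENV uses two
; tag bits per register, with tag value 11b meaning 'empty'.  With TOP=7, ST0
; lives in physical register 7, whose tag occupies bits 15:14 of the tag word,
; so ORing in 0c000h marks ST0 as empty.
;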


;;
; @todo Move this somewhere better?
;
struc IEMFPURESULT
        .r80Result      resw 5
        .FSW            resw 1
endstruc


;;
; @todo Move this somewhere better?
;
struc IEMFPURESULTTWO
        .r80Result1     resw 5
        .FSW            resw 1
        .r80Result2     resw 5
endstruc
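
;
; All the FPU wrappers below follow roughly the same pattern, sketched here:
;
;       fninit                          ; start from a clean FPU state
;       fld     tword [...]             ; load the operand(s) onto the stack
;       FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0  ; bring in the guest FCW/FSW bits
;       <instruction>                   ; the emulated instruction
;       fnstsw  word [A1 ...]           ; capture the resulting FSW
;       fnclex                          ; clear pending exceptions so that
;       fstp    tword [A1 ...]          ;  the result can be stored safely
;       fninit                          ; leave a clean state behind
;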


;
;---------------------- 16-bit signed integer operations ----------------------
;


;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16


;;
; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16


;;
; Store an 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16


;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
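
;
; Sketch of the expansion for the first invocation above:
;
;       BEGINPROC_FASTCALL iemAImpl_fiadd_r80_by_i16, 16
;               ...
;               fld     tword [A2]      ; ST0 = *pr80Val
;               FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
;               fiadd   word [A3]       ; ST0 += *pi16Val
;               ...
;
; (pr80Val and pi16Val are just illustrative names for the A2/A3 arguments.)
;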


;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom



;
;---------------------- 32-bit signed integer operations ----------------------
;


;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32


;;
; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32


;;
; Store an 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32


;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr


;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom



;
;---------------------- 64-bit signed integer operations ----------------------
;


;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64


;;
; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64


;;
; Store an 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64



;
;---------------------- 32-bit floating point operations ----------------------
;

;;
; Converts a 32-bit floating point value to an 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32


;;
; Store an 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32


;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr


;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom



;
;---------------------- 64-bit floating point operations ----------------------
;

;;
; Converts a 64-bit floating point value to an 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64


;;
; Store an 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64


;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr

;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom



;
;---------------------- 80-bit floating point operations ----------------------
;

;;
; Loads an 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80


;;
; Store an 80-bit floating point register to memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80


;;
; Loads an 80-bit packed BCD value from memory, converting it to an 80-bit
; floating point register value.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80


;;
; Store an 80-bit floating point register to memory as packed BCD.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80


;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The instruction's explicit operands, if any.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0).
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
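
;
; Note on the second macro argument above: fadd and friends take the explicit
; two-register form (e.g. 'fadd st0, st1'), while fprem, fprem1 and fscale
; have no explicit operands and implicitly operate on ST0 and ST1, so they
; are passed an empty operand list {}.
;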


;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1


;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom


;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1 The instruction
;
; @returns EFLAGS in EAX.
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word [A1]
        pushf
        pop     xAX

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
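
;
; Note: fcomi/fucomi report the comparison result directly in the host
; EFLAGS (setting ZF, PF and CF; clearing OF, SF and AF); the pushf/pop xAX
; pair above captures those flags so the caller can merge them into the
; guest EFLAGS.
;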


;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos


;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1 The instruction
; @param 2 Non-zero to also restore FTW.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.



;;
; FPU instruction loading an 80-bit floating point constant.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz


;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to an IEMFPURESULTTWO for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos




;---------------------- SSE and MMX Operations ----------------------

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;;
; Media instruction working on two full sized registers.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first media register size operand (input/output).
; @param A2 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
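
;
; Sketch of the 128-bit expansion for the first invocation above:
;
;       BEGINPROC_FASTCALL iemAImpl_pxor_u128, 12
;               PROLOGUE_3_ARGS
;               IEMIMPL_SSE_PROLOGUE
;               movdqu  xmm0, [A1]      ; load the destination operand
;               movdqu  xmm1, [A2]      ; load the source operand
;               pxor    xmm0, xmm1
;               movdqu  [A1], xmm0      ; write back the result
;               IEMIMPL_SSE_EPILOGUE
;               EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_pxor_u128
;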


;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first full sized media register operand (input/output).
; @param A2 Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movd    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movq    xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
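
;
; Note: punpcklqdq (and punpckhqdq below) only exist as SSE2 instructions -
; there is no MMX counterpart - so their MMX inclusion flag is 0 and no
; _u64 worker is generated for them.
;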


;;
; Media instruction working on one full sized and one half sized register (high half).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first full sized media register operand (input/output).
; @param A2 Pointer to the second full sized media register operand, where we
;           will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0


;
; Shufflers with evil 8-bit immediates.
;

BEGINPROC_FASTCALL iemAImpl_pshufw, 16
        PROLOGUE_4_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        lea     T0, [A3 + A3*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_4_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw
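
;
; Sketch of the immediate dispatch used above: the 256 stubs following .imm0
; are each exactly 5 bytes (pshufw + ret), so the stub for immediate N lives
; at .imm0 + N * 5, computed as:
;
;       lea     T0, [A3 + A3*4]         ; T0 = N * 5
;       lea     T1, [.imm0 xWrtRIP]     ; T1 = &.imm0
;       lea     T1, [T1 + T0]           ; T1 = &.imm0 + N * 5
;       call    T1
;
; The two 'dw' lines after .immEnd assemble to out-of-range word values if
; the stub array size drifts from 256*5 bytes, producing a build warning.
;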


%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd


;
; Move byte mask.
;

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]
        pmovmskb T0, mm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64

BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]
        pmovmskb T0, xmm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128