IEMAllAImpl.asm@ 93725

Last change on this file since 93725 was 93115, checked in by vboxsync, 3 years ago
scm --update-copyright-year
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 82.8 KB

Line
1	; $Id: IEMAllAImpl.asm 93115 2022-01-01 11:31:46Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	;
7	; Copyright (C) 2011-2022 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17
18
19	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20	; Header Files ;
21	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22	%include "VBox/asmdefs.mac"
23	%include "VBox/err.mac"
24	%include "iprt/x86.mac"
25
26
27	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
28	; Defined Constants And Macros ;
29	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30
31	;;
32	; RET XX / RET wrapper for fastcall.
33	;
; Expands to 'ret %1' only when both RT_ARCH_X86 and RT_OS_WINDOWS are
; defined; every other configuration gets a plain 'ret'.
;
; @param 1 The immediate operand for 'ret' (bytes of stack to release).
;
34	%macro RET_FASTCALL 1
35	%ifdef RT_ARCH_X86
36	%ifdef RT_OS_WINDOWS
37	ret %1 ; 32-bit Windows: return and release %1 bytes of stack arguments
38	%else
39	ret ; other 32-bit hosts: the operand is ignored
40	%endif
41	%else
42	ret ; RT_ARCH_X86 not defined: the operand is ignored
43	%endif
44	%endmacro
45
46	;;
47	; NAME for fastcall functions.
48	;
49	;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
50	; escaping (or whatever the dollar is good for here). Thus the ugly
51	; prefix argument.
52	;
; @param a_Name   The function name (C).
; @param a_cbArgs The argument size; only used by the x86 Windows form.
; @param a_Prefix Text pasted in front of the name by the x86 Windows form
;                 (the callers in this file pass '@' or '$@').
;
; Default form: simply defer to NAME().
53	%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
54	%ifdef RT_ARCH_X86
55	%ifdef RT_OS_WINDOWS
; x86 Windows form: <prefix><name>@<cbArgs>, built by token pasting.
56	%undef NAME_FASTCALL
57	%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
58	%endif
59	%endif
60
61	;;
62	; BEGINPROC for fastcall functions.
63	;
; Emits an 'export' directive for PE output (and for OMF output when
; assembling with NASM), declares the symbol 'global' unless the output
; format is flat binary, and finally emits the label that starts the function.
;
64	; @param 1 The function name (C).
65	; @param 2 The argument size on x86.
66	;
67	%macro BEGINPROC_FASTCALL 2
68	%ifdef ASM_FORMAT_PE
69	export %1=NAME_FASTCALL(%1,%2,$@)
70	%endif
71	%ifdef __NASM__
72	%ifdef ASM_FORMAT_OMF
73	export NAME(%1) NAME_FASTCALL(%1,%2,$@)
74	%endif
75	%endif
76	%ifndef ASM_FORMAT_BIN
77	global NAME_FASTCALL(%1,%2,$@)
78	%endif
; The label itself is written with the plain '@' prefix, while the directives
; above use the '$@' form (see the yasm @todo on NAME_FASTCALL).
79	NAME_FASTCALL(%1,%2,@):
80	%endmacro
81
82
83	;
84	; We employ some macro assembly here to hide the calling convention differences.
85	;
86	%ifdef RT_ARCH_AMD64
;
; AMD64 prologue/epilogue macros.
;
; On AMD64 the A0..A3 aliases all name registers and the T0/T1 temporaries
; are rax/r11, so nothing has to be saved or fetched from the stack: every
; prologue is empty and every epilogue is a plain 'ret'.
;
; The *_EX variants take the number of stack bytes the 32-bit code has to
; release on return.  The value is ignored here, but it must be accepted so
; the same invocation assembles on both architectures.  EPILOGUE_1_ARGS_EX
; used to be declared with zero parameters on AMD64 while the x86 variant
; requires exactly one; it now accepts zero or one.
;
%macro PROLOGUE_1_ARGS 0
%endmacro
%macro EPILOGUE_1_ARGS 0
        ret
%endmacro
%macro EPILOGUE_1_ARGS_EX 0-1           ; %1 (optional): x86 stack byte count, unused here
        ret
%endmacro

%macro PROLOGUE_2_ARGS 0
%endmacro
%macro EPILOGUE_2_ARGS 0
        ret
%endmacro
%macro EPILOGUE_2_ARGS_EX 1             ; %1: x86 stack byte count, unused here
        ret
%endmacro

%macro PROLOGUE_3_ARGS 0
%endmacro
%macro EPILOGUE_3_ARGS 0
        ret
%endmacro
%macro EPILOGUE_3_ARGS_EX 1             ; %1: x86 stack byte count, unused here
        ret
%endmacro

%macro PROLOGUE_4_ARGS 0
%endmacro
%macro EPILOGUE_4_ARGS 0
        ret
%endmacro
%macro EPILOGUE_4_ARGS_EX 1             ; %1: x86 stack byte count, unused here
        ret
%endmacro
122
; Register aliases for the AMD64 build.  A0..A3 are the four argument
; registers, each with _32/_16/_8 sub-register aliases (no A3_8 is defined).
123	%ifdef ASM_CALL64_GCC
; ASM_CALL64_GCC: arguments arrive in rdi, rsi, rdx, rcx.
124	%define A0 rdi
125	%define A0_32 edi
126	%define A0_16 di
127	%define A0_8 dil
128
129	%define A1 rsi
130	%define A1_32 esi
131	%define A1_16 si
132	%define A1_8 sil
133
134	%define A2 rdx
135	%define A2_32 edx
136	%define A2_16 dx
137	%define A2_8 dl
138
139	%define A3 rcx
140	%define A3_32 ecx
141	%define A3_16 cx
142	%endif
143
144	%ifdef ASM_CALL64_MSC
; ASM_CALL64_MSC: arguments arrive in rcx, rdx, r8, r9.
145	%define A0 rcx
146	%define A0_32 ecx
147	%define A0_16 cx
148	%define A0_8 cl
149
150	%define A1 rdx
151	%define A1_32 edx
152	%define A1_16 dx
153	%define A1_8 dl
154
155	%define A2 r8
156	%define A2_32 r8d
157	%define A2_16 r8w
158	%define A2_8 r8b
159
160	%define A3 r9
161	%define A3_32 r9d
162	%define A3_16 r9w
163	%endif
164
; Temporaries shared by both 64-bit conventions: T0 is rax, T1 is r11.
165	%define T0 rax
166	%define T0_32 eax
167	%define T0_16 ax
168	%define T0_8 al
169
170	%define T1 r11
171	%define T1_32 r11d
172	%define T1_16 r11w
173	%define T1_8 r11b
174
175	%else
176	; x86
; 32-bit build: A0 and A1 are ecx and edx.  The 3- and 4-argument prologues
; fetch the remaining arguments from the stack into ebx (A2) and esi (A3).
; Every prologue pushes edi, which serves as the T1 temporary, and the
; matching epilogue pops the saved registers in reverse order.
177	%macro PROLOGUE_1_ARGS 0
178	push edi
179	%endmacro
180	%macro EPILOGUE_1_ARGS 0
181	pop edi
182	ret 0
183	%endmacro
184	%macro EPILOGUE_1_ARGS_EX 1
185	pop edi
186	ret %1
187	%endmacro
188
189	%macro PROLOGUE_2_ARGS 0
190	push edi
191	%endmacro
192	%macro EPILOGUE_2_ARGS 0
193	pop edi
194	ret 0
195	%endmacro
196	%macro EPILOGUE_2_ARGS_EX 1
197	pop edi
198	ret %1
199	%endmacro
200
201	%macro PROLOGUE_3_ARGS 0
202	push ebx
; After the push: [esp] = saved ebx, [esp + 4] = return address, so
; [esp + 4 + 4] is the first stack slot, loaded here as A2.
203	mov ebx, [esp + 4 + 4]
204	push edi
205	%endmacro
; @param 1 Number of stack bytes to release on return; at least 4, enforced
;          by the %error below.
206	%macro EPILOGUE_3_ARGS_EX 1
207	%if (%1) < 4
208	%error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
209	%endif
210	pop edi
211	pop ebx
212	ret %1
213	%endmacro
214	%macro EPILOGUE_3_ARGS 0
215	EPILOGUE_3_ARGS_EX 4
216	%endmacro
217
218	%macro PROLOGUE_4_ARGS 0
219	push ebx
220	push edi
221	push esi
; Three pushes (12 bytes) plus the return address (4 bytes) sit below the
; stack arguments: the first slot becomes A2 (ebx), the second A3 (esi).
222	mov ebx, [esp + 12 + 4 + 0]
223	mov esi, [esp + 12 + 4 + 4]
224	%endmacro
; @param 1 Number of stack bytes to release on return; at least 8, enforced
;          by the %error below.
225	%macro EPILOGUE_4_ARGS_EX 1
226	%if (%1) < 8
227	%error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
228	%endif
229	pop esi
230	pop edi
231	pop ebx
232	ret %1
233	%endmacro
234	%macro EPILOGUE_4_ARGS 0
235	EPILOGUE_4_ARGS_EX 8
236	%endmacro
237
; Register aliases for the 32-bit build.  The unsuffixed and _32 names are
; the same register here.  No A3_8 or T1_8 alias is defined in this branch.
238	%define A0 ecx
239	%define A0_32 ecx
240	%define A0_16 cx
241	%define A0_8 cl
242
243	%define A1 edx
244	%define A1_32 edx
245	%define A1_16 dx
246	%define A1_8 dl
247
248	%define A2 ebx
249	%define A2_32 ebx
250	%define A2_16 bx
251	%define A2_8 bl
252
253	%define A3 esi
254	%define A3_32 esi
255	%define A3_16 si
256
257	%define T0 eax
258	%define T0_32 eax
259	%define T0_16 ax
260	%define T0_8 al
261
262	%define T1 edi
263	%define T1_32 edi
264	%define T1_16 di
265	%endif
266
267
268	;;
269	; Load the relevant guest flags from [%1] into the host EFLAGS.
270	;
; The '%if (%3) != 0' guard in the body is commented out, so the flags are
; loaded unconditionally, whether or not any flags are undefined.
;
271	; @remarks Clobbers T0, stack. Changes EFLAGS.
272	; @note Call sites in this file pass A1..A3 or a scratch register (r9, r10, ebp) as parameter 1.
273	; @param 1 The register pointing to the guest eflags.
274	; @param 2 The set of modified flags.
275	; @param 3 The set of undefined flags.
276	;
277	%macro IEM_MAYBE_LOAD_FLAGS 3
278	;%if (%3) != 0
279	pushf ; store current flags
280	mov T0_32, [%1] ; load the guest flags
281	and dword [xSP], ~(%2 \| %3) ; mask out the modified and undefined flags
282	and T0_32, (%2 \| %3) ; select the modified and undefined flags.
; The OR uses the full-width T0 although only T0_32 was written above; on a
; 64-bit host a 32-bit register write zeroes the upper half, so no stray bits
; are merged in.
283	or [xSP], T0 ; merge guest flags with host flags.
284	popf ; load the mixed flags.
285	;%endif
286	%endmacro
287
288	;;
289	; Update the guest flags at [%1] from the current host EFLAGS.
290	;
; Only the bits in (%2 | %3) are replaced; all other bits of the stored flags
; are kept.  Expands to nothing when both masks are zero.
;
291	; @remarks Clobbers T0, T1, stack.
292	; @param 1 The register pointing to the EFLAGS.
293	; @param 2 The mask of modified flags to save.
294	; @param 3 The mask of undefined flags to (maybe) save.
295	;
296	%macro IEM_SAVE_FLAGS 3
297	%if (%2 \| %3) != 0
298	pushf
299	pop T1 ; T1 = host flags after the emulated instruction
300	mov T0_32, [%1] ; flags
301	and T0_32, ~(%2 \| %3) ; clear the modified & undefined flags.
302	and T1_32, (%2 \| %3) ; select the modified and undefined flags.
303	or T0_32, T1_32 ; combine the flags.
304	mov [%1], T0_32 ; save the flags.
305	%endif
306	%endmacro
307
308
309	;;
310	; Macro for implementing a binary operator.
311	;
312	; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
313	; variants, except on 32-bit system where the 64-bit accesses requires hand
314	; coding.
315	;
316	; All the functions takes a pointer to the destination memory operand in A0,
317	; the source register operand in A1 and a pointer to eflags in A2.
318	;
; Each generated function is named iemAImpl_<mnemonic>_u<bits>[_locked] and
; follows the same pattern: load the guest flags, execute the instruction on
; [A0], save the resulting flags.
;
319	; @param 1 The instruction mnemonic.
320	; @param 2 Non-zero if there should be a locked version.
321	; @param 3 The modified flags.
322	; @param 4 The undefined flags.
323	;
324	%macro IEMIMPL_BIN_OP 4
325	BEGINCODE
326	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
327	PROLOGUE_3_ARGS
328	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
329	%1 byte [A0], A1_8
330	IEM_SAVE_FLAGS A2, %3, %4
331	EPILOGUE_3_ARGS
332	ENDPROC iemAImpl_ %+ %1 %+ _u8
333
334	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
335	PROLOGUE_3_ARGS
336	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
337	%1 word [A0], A1_16
338	IEM_SAVE_FLAGS A2, %3, %4
339	EPILOGUE_3_ARGS
340	ENDPROC iemAImpl_ %+ %1 %+ _u16
341
342	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
343	PROLOGUE_3_ARGS
344	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
345	%1 dword [A0], A1_32
346	IEM_SAVE_FLAGS A2, %3, %4
347	EPILOGUE_3_ARGS
348	ENDPROC iemAImpl_ %+ %1 %+ _u32
349
; The 64-bit variants are only generated for AMD64 hosts.
350	%ifdef RT_ARCH_AMD64
351	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
352	PROLOGUE_3_ARGS
353	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
354	%1 qword [A0], A1
355	IEM_SAVE_FLAGS A2, %3, %4
356	EPILOGUE_3_ARGS_EX 8
357	ENDPROC iemAImpl_ %+ %1 %+ _u64
358	%endif ; RT_ARCH_AMD64
359
360	%if %2 != 0 ; locked versions requested?
361
; Same bodies as above with a 'lock' prefix on the instruction.
362	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
363	PROLOGUE_3_ARGS
364	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
365	lock %1 byte [A0], A1_8
366	IEM_SAVE_FLAGS A2, %3, %4
367	EPILOGUE_3_ARGS
368	ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
369
370	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
371	PROLOGUE_3_ARGS
372	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
373	lock %1 word [A0], A1_16
374	IEM_SAVE_FLAGS A2, %3, %4
375	EPILOGUE_3_ARGS
376	ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
377
378	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
379	PROLOGUE_3_ARGS
380	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
381	lock %1 dword [A0], A1_32
382	IEM_SAVE_FLAGS A2, %3, %4
383	EPILOGUE_3_ARGS
384	ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
385
386	%ifdef RT_ARCH_AMD64
387	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
388	PROLOGUE_3_ARGS
389	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
390	lock %1 qword [A0], A1
391	IEM_SAVE_FLAGS A2, %3, %4
392	EPILOGUE_3_ARGS_EX 8
393	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
394	%endif ; RT_ARCH_AMD64
395	%endif ; locked
396	%endmacro
397
398	; instr,lock,modified-flags,undefined-flags.
; cmp and test are instantiated without locked variants (second parameter 0).
399	IEMIMPL_BIN_OP add, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
400	IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
401	IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
402	IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
403	IEMIMPL_BIN_OP or, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
404	IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
405	IEMIMPL_BIN_OP and, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
406	IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
407	IEMIMPL_BIN_OP test, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
408
409
410	;;
411	; Macro for implementing a bit operator.
412	;
413	; This will generate code for the 16, 32 and 64 bit accesses with locked
414	; variants, except on 32-bit system where the 64-bit accesses requires hand
415	; coding.
416	;
417	; All the functions takes a pointer to the destination memory operand in A0,
418	; the source register operand in A1 and a pointer to eflags in A2.
419	;
; This is the four-parameter form of IEMIMPL_BIT_OP; there is no 8-bit
; variant.  Generated names are iemAImpl_<mnemonic>_u<bits>[_locked].
;
420	; @param 1 The instruction mnemonic.
421	; @param 2 Non-zero if there should be a locked version.
422	; @param 3 The modified flags.
423	; @param 4 The undefined flags.
424	;
425	%macro IEMIMPL_BIT_OP 4
426	BEGINCODE
427	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
428	PROLOGUE_3_ARGS
429	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
430	%1 word [A0], A1_16
431	IEM_SAVE_FLAGS A2, %3, %4
432	EPILOGUE_3_ARGS
433	ENDPROC iemAImpl_ %+ %1 %+ _u16
434
435	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
436	PROLOGUE_3_ARGS
437	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
438	%1 dword [A0], A1_32
439	IEM_SAVE_FLAGS A2, %3, %4
440	EPILOGUE_3_ARGS
441	ENDPROC iemAImpl_ %+ %1 %+ _u32
442
; The 64-bit variants are only generated for AMD64 hosts.
443	%ifdef RT_ARCH_AMD64
444	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
445	PROLOGUE_3_ARGS
446	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
447	%1 qword [A0], A1
448	IEM_SAVE_FLAGS A2, %3, %4
449	EPILOGUE_3_ARGS_EX 8
450	ENDPROC iemAImpl_ %+ %1 %+ _u64
451	%endif ; RT_ARCH_AMD64
452
453	%if %2 != 0 ; locked versions requested?
454
; Same bodies as above with a 'lock' prefix on the instruction.
455	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
456	PROLOGUE_3_ARGS
457	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
458	lock %1 word [A0], A1_16
459	IEM_SAVE_FLAGS A2, %3, %4
460	EPILOGUE_3_ARGS
461	ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
462
463	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
464	PROLOGUE_3_ARGS
465	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
466	lock %1 dword [A0], A1_32
467	IEM_SAVE_FLAGS A2, %3, %4
468	EPILOGUE_3_ARGS
469	ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
470
471	%ifdef RT_ARCH_AMD64
472	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
473	PROLOGUE_3_ARGS
474	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
475	lock %1 qword [A0], A1
476	IEM_SAVE_FLAGS A2, %3, %4
477	EPILOGUE_3_ARGS_EX 8
478	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
479	%endif ; RT_ARCH_AMD64
480	%endif ; locked
481	%endmacro
; bt is instantiated without locked variants (second parameter 0).
482	IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
483	IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
484	IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
485	IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
486
487	;;
488	; Macro for implementing a bit search operator.
489	;
490	; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
491	; system where the 64-bit accesses requires hand coding.
492	;
493	; All the functions takes a pointer to the destination memory operand in A0,
494	; the source register operand in A1 and a pointer to eflags in A2.
495	;
; This is the three-parameter form of IEMIMPL_BIT_OP; it shares its name with
; the four-parameter bit operator macro and differs only in parameter count.
; The result is computed into T0 and written to [A0] only when the
; instruction leaves ZF clear.
;
496	; @param 1 The instruction mnemonic.
497	; @param 2 The modified flags.
498	; @param 3 The undefined flags.
499	;
500	%macro IEMIMPL_BIT_OP 3
501	BEGINCODE
502	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
503	PROLOGUE_3_ARGS
504	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
505	%1 T0_16, A1_16
506	jz .unchanged_dst ; ZF set: skip the store
507	mov [A0], T0_16
508	.unchanged_dst:
509	IEM_SAVE_FLAGS A2, %2, %3
510	EPILOGUE_3_ARGS
511	ENDPROC iemAImpl_ %+ %1 %+ _u16
512
513	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
514	PROLOGUE_3_ARGS
515	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
516	%1 T0_32, A1_32
517	jz .unchanged_dst ; ZF set: skip the store
518	mov [A0], T0_32
519	.unchanged_dst:
520	IEM_SAVE_FLAGS A2, %2, %3
521	EPILOGUE_3_ARGS
522	ENDPROC iemAImpl_ %+ %1 %+ _u32
523
; The 64-bit variant is only generated for AMD64 hosts.
524	%ifdef RT_ARCH_AMD64
525	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
526	PROLOGUE_3_ARGS
527	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
528	%1 T0, A1
529	jz .unchanged_dst ; ZF set: skip the store
530	mov [A0], T0
531	.unchanged_dst:
532	IEM_SAVE_FLAGS A2, %2, %3
533	EPILOGUE_3_ARGS_EX 8
534	ENDPROC iemAImpl_ %+ %1 %+ _u64
535	%endif ; RT_ARCH_AMD64
536	%endmacro
537	IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF)
538	IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF)
539
540
541	;
542	; IMUL is also a similar but yet different case (no lock, no mem dst).
543	; The rDX:rAX variant of imul is handled together with mul further down.
544	;
545	BEGINCODE
; 16-bit two-operand IMUL: multiplies A1_16 by the word at [A0] and writes the
; product back to [A0].  A2 points to the guest eflags.
546	BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
547	PROLOGUE_3_ARGS
548	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
549	imul A1_16, word [A0] ; A1_16 = A1_16 * [A0]
550	mov [A0], A1_16 ; store the product over the first operand
551	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
552	EPILOGUE_3_ARGS
553	ENDPROC iemAImpl_imul_two_u16
554
; 32-bit two-operand IMUL: multiplies A1_32 by the dword at [A0] and writes
; the product back to [A0].  A2 points to the guest eflags.
555	BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
556	PROLOGUE_3_ARGS
557	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
558	imul A1_32, dword [A0] ; A1_32 = A1_32 * [A0]
559	mov [A0], A1_32 ; store the product over the first operand
560	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
561	EPILOGUE_3_ARGS
562	ENDPROC iemAImpl_imul_two_u32
563
; 64-bit two-operand IMUL, AMD64 hosts only: multiplies A1 by the qword at
; [A0] and writes the product back to [A0].  A2 points to the guest eflags.
564	%ifdef RT_ARCH_AMD64
565	BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
566	PROLOGUE_3_ARGS
567	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
568	imul A1, qword [A0] ; A1 = A1 * [A0]
569	mov [A0], A1 ; store the product over the first operand
570	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
571	EPILOGUE_3_ARGS_EX 8
572	ENDPROC iemAImpl_imul_two_u64
573	%endif ; RT_ARCH_AMD64
574
575
576	;
577	; XCHG for memory operands. This implies locking. No flag changes.
578	;
579	; Each function takes two arguments, first the pointer to the memory,
580	; then the pointer to the register. They all return void.
581	;
582	BEGINCODE
; 8-bit variant: swaps the byte at [A0] with the byte at [A1].
583	BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
584	PROLOGUE_2_ARGS
585	mov T0_8, [A1] ; T0 = value from the register operand
586	xchg [A0], T0_8 ; swap with memory; T0 = previous memory value
587	mov [A1], T0_8 ; hand the previous memory value back
588	EPILOGUE_2_ARGS
589	ENDPROC iemAImpl_xchg_u8_locked
590
; 16-bit XCHG: swaps the word at [A0] with the word at [A1] using a single
; xchg with a memory operand.
591	BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
592	PROLOGUE_2_ARGS
593	mov T0_16, [A1] ; T0 = value from the register operand
594	xchg [A0], T0_16 ; swap with memory; T0 = previous memory value
595	mov [A1], T0_16 ; hand the previous memory value back
596	EPILOGUE_2_ARGS
597	ENDPROC iemAImpl_xchg_u16_locked
598
; 32-bit XCHG: swaps the dword at [A0] with the dword at [A1] using a single
; xchg with a memory operand.
599	BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
600	PROLOGUE_2_ARGS
601	mov T0_32, [A1] ; T0 = value from the register operand
602	xchg [A0], T0_32 ; swap with memory; T0 = previous memory value
603	mov [A1], T0_32 ; hand the previous memory value back
604	EPILOGUE_2_ARGS
605	ENDPROC iemAImpl_xchg_u32_locked
606
; 64-bit XCHG, AMD64 hosts only: swaps the qword at [A0] with the qword at
; [A1] using a single xchg with a memory operand.
607	%ifdef RT_ARCH_AMD64
608	BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
609	PROLOGUE_2_ARGS
610	mov T0, [A1] ; T0 = value from the register operand
611	xchg [A0], T0 ; swap with memory; T0 = previous memory value
612	mov [A1], T0 ; hand the previous memory value back
613	EPILOGUE_2_ARGS
614	ENDPROC iemAImpl_xchg_u64_locked
615	%endif
616
617	; Unlocked variants for fDisregardLock mode.
618
; 8-bit swap of [A0] and [A1] done with plain loads and stores instead of
; xchg.  Both values are read before either is written.
; NOTE(review): T1_8 is only defined in the AMD64 branch of the register
; aliases in this file - confirm how this assembles for 32-bit hosts.
619	BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
620	PROLOGUE_2_ARGS
621	mov T0_8, [A1] ; T0 = register operand value
622	mov T1_8, [A0] ; T1 = memory operand value
623	mov [A0], T0_8
624	mov [A1], T1_8
625	EPILOGUE_2_ARGS
626	ENDPROC iemAImpl_xchg_u8_unlocked
627
; 16-bit swap of [A0] and [A1] done with plain loads and stores instead of
; xchg.  Both values are read before either is written.
628	BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
629	PROLOGUE_2_ARGS
630	mov T0_16, [A1] ; T0 = register operand value
631	mov T1_16, [A0] ; T1 = memory operand value
632	mov [A0], T0_16
633	mov [A1], T1_16
634	EPILOGUE_2_ARGS
635	ENDPROC iemAImpl_xchg_u16_unlocked
636
; 32-bit swap of [A0] and [A1] done with plain loads and stores instead of
; xchg.  Both values are read before either is written.
637	BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
638	PROLOGUE_2_ARGS
639	mov T0_32, [A1] ; T0 = register operand value
640	mov T1_32, [A0] ; T1 = memory operand value
641	mov [A0], T0_32
642	mov [A1], T1_32
643	EPILOGUE_2_ARGS
644	ENDPROC iemAImpl_xchg_u32_unlocked
645
; 64-bit swap of [A0] and [A1], AMD64 hosts only, done with plain loads and
; stores instead of xchg.  Both values are read before either is written.
646	%ifdef RT_ARCH_AMD64
647	BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
648	PROLOGUE_2_ARGS
649	mov T0, [A1] ; T0 = register operand value
650	mov T1, [A0] ; T1 = memory operand value
651	mov [A0], T0
652	mov [A1], T1
653	EPILOGUE_2_ARGS
654	ENDPROC iemAImpl_xchg_u64_unlocked
655	%endif
656
657
658	;
659	; XADD for memory operands.
660	;
661	; Each function takes three arguments, first the pointer to the
662	; memory/register, then the pointer to the register, and finally a pointer to
663	; eflags. They all return void.
664	;
665	BEGINCODE
; 8-bit XADD without lock prefix: the value at [A1] is added to [A0] and the
; previous content of [A0] is written back to [A1].
666	BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
667	PROLOGUE_3_ARGS
668	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
669	mov T0_8, [A1] ; T0 = addend from the register operand
670	xadd [A0], T0_8 ; [A0] += T0; T0 = previous [A0]
671	mov [A1], T0_8 ; return the previous destination value
672	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
673	EPILOGUE_3_ARGS
674	ENDPROC iemAImpl_xadd_u8
675
; 16-bit XADD without lock prefix: the value at [A1] is added to [A0] and the
; previous content of [A0] is written back to [A1].  A2 points to eflags.
676	BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
677	PROLOGUE_3_ARGS
678	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
679	mov T0_16, [A1] ; T0 = addend from the register operand
680	xadd [A0], T0_16 ; [A0] += T0; T0 = previous [A0]
681	mov [A1], T0_16 ; return the previous destination value
682	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
683	EPILOGUE_3_ARGS
684	ENDPROC iemAImpl_xadd_u16
685
; 32-bit XADD without lock prefix: the value at [A1] is added to [A0] and the
; previous content of [A0] is written back to [A1].  A2 points to eflags.
686	BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
687	PROLOGUE_3_ARGS
688	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
689	mov T0_32, [A1] ; T0 = addend from the register operand
690	xadd [A0], T0_32 ; [A0] += T0; T0 = previous [A0]
691	mov [A1], T0_32 ; return the previous destination value
692	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
693	EPILOGUE_3_ARGS
694	ENDPROC iemAImpl_xadd_u32
695
; 64-bit XADD without lock prefix, AMD64 hosts only: the value at [A1] is
; added to [A0] and the previous content of [A0] is written back to [A1].
696	%ifdef RT_ARCH_AMD64
697	BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
698	PROLOGUE_3_ARGS
699	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
700	mov T0, [A1] ; T0 = addend from the register operand
701	xadd [A0], T0 ; [A0] += T0; T0 = previous [A0]
702	mov [A1], T0 ; return the previous destination value
703	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
704	EPILOGUE_3_ARGS
705	ENDPROC iemAImpl_xadd_u64
706	%endif ; RT_ARCH_AMD64
707
; 8-bit XADD with lock prefix: the value at [A1] is added to [A0] and the
; previous content of [A0] is written back to [A1].  A2 points to eflags.
708	BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
709	PROLOGUE_3_ARGS
710	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
711	mov T0_8, [A1] ; T0 = addend from the register operand
712	lock xadd [A0], T0_8 ; [A0] += T0; T0 = previous [A0]
713	mov [A1], T0_8 ; return the previous destination value
714	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
715	EPILOGUE_3_ARGS
716	ENDPROC iemAImpl_xadd_u8_locked
717
; 16-bit XADD with lock prefix: the value at [A1] is added to [A0] and the
; previous content of [A0] is written back to [A1].  A2 points to eflags.
718	BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
719	PROLOGUE_3_ARGS
720	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
721	mov T0_16, [A1] ; T0 = addend from the register operand
722	lock xadd [A0], T0_16 ; [A0] += T0; T0 = previous [A0]
723	mov [A1], T0_16 ; return the previous destination value
724	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
725	EPILOGUE_3_ARGS
726	ENDPROC iemAImpl_xadd_u16_locked
727
; 32-bit XADD with lock prefix: the value at [A1] is added to [A0] and the
; previous content of [A0] is written back to [A1].  A2 points to eflags.
728	BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
729	PROLOGUE_3_ARGS
730	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
731	mov T0_32, [A1] ; T0 = addend from the register operand
732	lock xadd [A0], T0_32 ; [A0] += T0; T0 = previous [A0]
733	mov [A1], T0_32 ; return the previous destination value
734	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
735	EPILOGUE_3_ARGS
736	ENDPROC iemAImpl_xadd_u32_locked
737
; 64-bit XADD with lock prefix, AMD64 hosts only: the value at [A1] is added
; to [A0] and the previous content of [A0] is written back to [A1].
738	%ifdef RT_ARCH_AMD64
739	BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
740	PROLOGUE_3_ARGS
741	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
742	mov T0, [A1] ; T0 = addend from the register operand
743	lock xadd [A0], T0 ; [A0] += T0; T0 = previous [A0]
744	mov [A1], T0 ; return the previous destination value
745	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
746	EPILOGUE_3_ARGS
747	ENDPROC iemAImpl_xadd_u64_locked
748	%endif ; RT_ARCH_AMD64
749
750
751	;
752	; CMPXCHG8B.
753	;
754	; These are tricky register wise, so the code is duplicated for each calling
755	; convention.
756	;
757	; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
758	;
759	; C-proto:
760	; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
761	; uint32_t *pEFlags));
762	;
763	; Note! Same structure as iemAImpl_cmpxchg16b, with 32-bit register halves.
764	;
; The instruction takes its operands in fixed registers (edx:eax and ecx:ebx),
; which collide with the argument registers; the pointers are therefore moved
; out of the way first.  The lock prefix is always applied.
765	BEGINCODE
766	BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
767	%ifdef RT_ARCH_AMD64
768	%ifdef ASM_CALL64_MSC
; MSC x64: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
769	push rbx
770
771	mov r11, rdx ; pu64EaxEdx (is also T1)
772	mov r10, rcx ; pu64Dst
773
774	mov ebx, [r8]
775	mov ecx, [r8 + 4]
776	IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
777	mov eax, [r11]
778	mov edx, [r11 + 4]
779
780	lock cmpxchg8b [r10]
781
; Write edx:eax back before IEM_SAVE_FLAGS overwrites r11 (T1).
782	mov [r11], eax
783	mov [r11 + 4], edx
784	IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
785
786	pop rbx
787	ret
788	%else
; Other 64-bit convention: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx,
; rcx = pEFlags.
789	push rbx
790
791	mov r10, rcx ; pEFlags
792	mov r11, rdx ; pu64EbxEcx (is also T1)
793
794	mov ebx, [r11]
795	mov ecx, [r11 + 4]
796	IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
797	mov eax, [rsi]
798	mov edx, [rsi + 4]
799
800	lock cmpxchg8b [rdi]
801
802	mov [rsi], eax
803	mov [rsi + 4], edx
804	IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
805
806	pop rbx
807	ret
808
809	%endif
810	%else
; 32-bit: ecx = pu64Dst, edx = pu64EaxEdx; pu64EbxEcx and pEFlags are on the
; stack above the four saved registers (16 bytes) and the return address.
811	push esi
812	push edi
813	push ebx
814	push ebp
815
816	mov edi, ecx ; pu64Dst
817	mov esi, edx ; pu64EaxEdx
818	mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
819	mov ebp, [esp + 16 + 4 + 4] ; pEFlags
820
821	mov ebx, [ecx]
822	mov ecx, [ecx + 4]
823	IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
824	mov eax, [esi]
825	mov edx, [esi + 4]
826
827	lock cmpxchg8b [edi]
828
829	mov [esi], eax
830	mov [esi + 4], edx
831	IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
832
833	pop ebp
834	pop ebx
835	pop edi
836	pop esi
837	ret 8 ; release the two stack arguments
838	%endif
839	ENDPROC iemAImpl_cmpxchg8b
840
; The locked entry point is just a jump to iemAImpl_cmpxchg8b, which already
; applies the lock prefix unconditionally.
841	BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
842	; Lazy bird always lock prefixes cmpxchg8b.
843	jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
844	ENDPROC iemAImpl_cmpxchg8b_locked
845
846	%ifdef RT_ARCH_AMD64
847
;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
;       IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                                    uint32_t *pEFlags));
;
; The instruction takes its operands in fixed registers (rdx:rax and rcx:rbx),
; which collide with the argument registers, so the pointers are moved out of
; the way first.  rbx is callee-saved and therefore preserved around the body.
; The lock prefix is always applied.  On return *pu128RaxRdx holds rdx:rax as
; left by the instruction and ZF has been merged into *pEFlags.
;
; Note! Same structure as iemAImpl_cmpxchg8b, but with full 64-bit halves.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
%ifdef ASM_CALL64_MSC
        ; MSC x64: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        ; Write rdx:rax back before IEM_SAVE_FLAGS overwrites r11 (T1).
        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
%else
        ; Other 64-bit convention: rdi = pu128Dst, rsi = pu128RaxRdx,
        ; rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        ; Store the full 64-bit halves.  Writing eax/edx here would leave the
        ; upper 32 bits of each half of *pu128RaxRdx stale whenever the
        ; compare fails and rdx:rax receives the memory value.
        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

%endif
ENDPROC iemAImpl_cmpxchg16b
907
; The locked entry point is just a jump to iemAImpl_cmpxchg16b, which already
; applies the lock prefix unconditionally.
908	BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
909	; Lazy bird always lock prefixes cmpxchg16b.
910	jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
911	ENDPROC iemAImpl_cmpxchg16b_locked
912
913	%endif ; RT_ARCH_AMD64
914
915
;
; CMPXCHG.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
;       IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; A0 points to the destination, A1 to the accumulator value (updated on
; return), A2 holds the replacement value and A3 points to the guest eflags.
; On 32-bit hosts the 64-bit variant receives a pointer to the replacement
; value instead and is built on cmpxchg8b.
;
; @param 1 Instruction prefix: empty or 'lock'.
; @param 2 Function name suffix: empty or '_locked'.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ; Note that the lock prefix is applied regardless of %1.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF clear means the compare failed: only then take the path that
        ; recomputes the flags from the two differing values.  (Branching on
        ; ZF set sent failures through 'cmp eax, eax', reporting ZF=1.)
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; edx:eax now holds the memory value while [esi] still holds the
        ; caller's accumulator value: compare high halves first and fall back
        ; to the low halves only when the high halves match.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG
1010
1011	IEMIMPL_CMPXCHG , ,
1012	IEMIMPL_CMPXCHG lock, _locked
1013
1014	;;
1015	; Macro for implementing a unary operator.
1016	;
1017	; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1018	; variants, except on 32-bit system where the 64-bit accesses requires hand
1019	; coding.
1020	;
1021	; All the functions takes a pointer to the destination memory operand in A0
1022	; and a pointer to eflags in A1 (there is no source operand).
1023	;
; Generated names are iemAImpl_<mnemonic>_u<bits>[_locked]; a locked variant
; is always emitted next to the plain one.
;
1024	; @param 1 The instruction mnemonic.
1025	; @param 2 The modified flags.
1026	; @param 3 The undefined flags.
1027	;
1028	%macro IEMIMPL_UNARY_OP 3
1029	BEGINCODE
1030	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1031	PROLOGUE_2_ARGS
1032	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1033	%1 byte [A0]
1034	IEM_SAVE_FLAGS A1, %2, %3
1035	EPILOGUE_2_ARGS
1036	ENDPROC iemAImpl_ %+ %1 %+ _u8
1037
1038	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1039	PROLOGUE_2_ARGS
1040	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1041	lock %1 byte [A0]
1042	IEM_SAVE_FLAGS A1, %2, %3
1043	EPILOGUE_2_ARGS
1044	ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1045
1046	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1047	PROLOGUE_2_ARGS
1048	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1049	%1 word [A0]
1050	IEM_SAVE_FLAGS A1, %2, %3
1051	EPILOGUE_2_ARGS
1052	ENDPROC iemAImpl_ %+ %1 %+ _u16
1053
1054	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1055	PROLOGUE_2_ARGS
1056	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1057	lock %1 word [A0]
1058	IEM_SAVE_FLAGS A1, %2, %3
1059	EPILOGUE_2_ARGS
1060	ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1061
1062	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1063	PROLOGUE_2_ARGS
1064	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1065	%1 dword [A0]
1066	IEM_SAVE_FLAGS A1, %2, %3
1067	EPILOGUE_2_ARGS
1068	ENDPROC iemAImpl_ %+ %1 %+ _u32
1069
1070	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1071	PROLOGUE_2_ARGS
1072	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1073	lock %1 dword [A0]
1074	IEM_SAVE_FLAGS A1, %2, %3
1075	EPILOGUE_2_ARGS
1076	ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1077
; The 64-bit variants are only generated for AMD64 hosts.
1078	%ifdef RT_ARCH_AMD64
1079	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1080	PROLOGUE_2_ARGS
1081	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1082	%1 qword [A0]
1083	IEM_SAVE_FLAGS A1, %2, %3
1084	EPILOGUE_2_ARGS
1085	ENDPROC iemAImpl_ %+ %1 %+ _u64
1086
1087	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1088	PROLOGUE_2_ARGS
1089	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1090	lock %1 qword [A0]
1091	IEM_SAVE_FLAGS A1, %2, %3
1092	EPILOGUE_2_ARGS
1093	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1094	%endif ; RT_ARCH_AMD64
1095
1096	%endmacro
1097
; 'not' passes zero for both masks, so IEM_SAVE_FLAGS expands to nothing for it.
1098	IEMIMPL_UNARY_OP inc, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF), 0
1099	IEMIMPL_UNARY_OP dec, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF), 0
1100	IEMIMPL_UNARY_OP neg, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
1101	IEMIMPL_UNARY_OP not, 0, 0
1102
1103
;;
; Macro generating a helper that executes a single memory fence instruction.
;
; The generated function is named iemAImpl_ followed by the mnemonic.  It takes
; no arguments and returns nothing: it executes the instruction and returns.
;
; @param        1       The fence instruction mnemonic.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; the fence itself
        ret                             ; zero argument bytes, so a plain ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
1122
;;
; Alternative memory fence for hosts without SSE2.
;
; Pushes xAX, exchanges it with the stack slot just written (xchg with a
; memory operand is implicitly locked on x86) and releases the slot again.
; xAX ends up with the value it had on entry; eflags are modified by the add.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; scratch slot holding the current xAX
        xchg    xAX, [xSP]              ; swaps xAX with its own copy - value unchanged
        add     xSP, xCB                ; drop the scratch slot
        ret
ENDPROC iemAImpl_alt_mem_fence
1132
1133
1134
;;
; Macro for implementing a shift or rotate operation.
;
; Generates the 8, 16, 32 and 64 bit variants; the 64-bit one is only emitted
; for 64-bit hosts (32-bit hosts need a hand coded version).
;
; Every generated function takes a pointer to the destination memory operand
; in A0, the shift count in A1 and a pointer to eflags in A2.
;
; The count has to be in CL for the instruction.  With ASM_CALL64_GCC it is
; copied from A1_8 into CL.  Otherwise A0 and A1 are swapped, so the count is
; then read from CL and the destination is addressed through A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE

; 8-bit destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> CL
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; destination pointer now in A1, count in CL
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

; 16-bit destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; 64-bit destination, 64-bit hosts only.
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1219
1220
;;
; Macro for implementing a double precision shift operation.
;
; Generates the 16, 32 and 64 bit variants; the 64-bit one is only emitted for
; 64-bit hosts (32-bit hosts need a hand coded version).
;
; The functions take a pointer to the destination operand (r/m) in A0, the
; source (reg) in A1, the shift count in A2 and a pointer to the eflags
; variable/register in A3.
;
; The count has to be in CL for the instruction.  With ASM_CALL64_GCC, A2 and
; A3 are swapped around the instruction and swapped back afterwards, so A3
; again points to eflags when the flags are saved.  Otherwise A0 and A2 are
; swapped, after which the destination is addressed through A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE

; 16-bit operands.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; bring the count into CL
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore the eflags pointer in A3
 %else
        xchg    A0, A2                  ; count in CL, destination pointer in A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit operands.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; 64-bit operands, 64-bit hosts only.
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
  %else
        xchg    A0, A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1289
1290
;;
; Macro for implementing the multiplication operations.
;
; Generates the 8, 16, 32 and 64 bit variants; the 64-bit one is only emitted
; for 64-bit hosts (the 32-bit host version lives in IEMAllAImplC.cpp).
;
; The 8-bit function only operates on AX, so it takes no DX pointer: A0 points
; to AX, A1 holds the operand and A2 points to eflags.  The other functions
; take a pointer to rAX in A0, a pointer to rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; All functions return 0 (in eax), so that callers can treat mul/imul and
; div/idiv helpers alike.
;
; Without ASM_CALL64_GCC the A1 pointer is copied to T1 before the instruction
; and the high half of the result is stored through T1 afterwards.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 3
BEGINCODE

; 8-bit: AL times operand, 16-bit result written back to the AX location.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax
        IEM_SAVE_FLAGS          A2, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

; 16-bit: AX times operand, result in DX:AX.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit: EAX times operand, result in EDX:EAX.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; 64-bit: RAX times operand, result in RDX:RAX.
 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
        IEM_SAVE_FLAGS          A3, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1386
1387
BEGINCODE
;;
; Worker function negating the 64-bit value held in the register pair
; T1_32:T0_32 (T1 = high half, T0 = low half).
;
; The two halves are swapped with zeros on the stack and then subtracted from
; those zeros with borrow, i.e. T1:T0 = 0 - T1:T0.
;
; @uses     T0, T1 (in/out) and eflags; two stack slots temporarily.
;
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; slot that will receive the high half
        push    0                       ; slot that will receive the low half
        xchg    T0_32, [xSP]            ; T0 = 0, stack = old low half
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, stack = old high half
        sub     T0_32, [xSP]            ; low  = 0 - low
        sbb     T1_32, [xSP + xCB]      ; high = 0 - high - borrow
        add     xSP, xCB*2              ; release both slots
        ret
ENDPROC   iemAImpl_negate_T0_T1_u32
1402
%ifdef RT_ARCH_AMD64
;;
; Worker function negating the 128-bit value held in the register pair T1:T0
; (T1 = high half, T0 = low half).
;
; Same scheme as the 32-bit worker: swap both halves with zeros on the stack
; and subtract them from those zeros with borrow.
;
; @uses     T0, T1 (in/out) and eflags; two stack slots temporarily.
;
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; slot that will receive the high half
        push    0                       ; slot that will receive the low half
        xchg    T0, [xSP]               ; T0 = 0, stack = old low half
        xchg    T1, [xSP + xCB]         ; T1 = 0, stack = old high half
        sub     T0, [xSP]               ; low  = 0 - low
        sbb     T1, [xSP + xCB]         ; high = 0 - high - borrow
        add     xSP, xCB*2              ; release both slots
        ret
ENDPROC   iemAImpl_negate_T0_T1_u64
%endif
1418
1419
;;
; Macro for implementing the division operations.
;
; Generates the 8, 16, 32 and 64 bit variants; the 64-bit one is only emitted
; for 64-bit hosts (the 32-bit host version lives in IEMAllAImplC.cpp).
;
; The 8-bit function only operates on AX, so it takes no DX pointer: A0 points
; to AX, A1 holds the divisor and A2 points to eflags.  The other functions
; take a pointer to rAX in A0, a pointer to rDX in A1, the divisor in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.  To make sure the host instruction never faults, the
; divisor is checked for zero and the quotient range is checked up front:
;
;   - Unsigned: the high half of the dividend must be below the divisor.
;   - Signed:   the check works on absolute values.  With N being the result
;               width, same signs require (|dividend| >> (N-1)) < |divisor|.
;               Different signs additionally allow the values where
;               (|dividend| >> (N-1)) == |divisor| and the low N-1 bits of
;               |dividend| are below |divisor| (quotient is exactly -2^(N-1)).
;
; The absolute value of the most negative dividend has its top bit set, so
; |dividend| >> (N-1) needs N+1 bits.  The 8 and 16 bit variants therefore
; compare at twice the result width against a zero extended divisor, and the
; 32 and 64 bit variants reject a set top bit before shifting.  Such a
; dividend always overflows, whatever the divisor is.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.  The AND sets ZF just like a test would and
        ; zero extends the divisor so A1_16 can be used in the range check.
        and     A1_32, 0xff
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare: 0x8000 >> 7 = 0x0100 does not fit 8 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8                    ; bits 8 thru 15 of A1 stay zero
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, see .one_of_each.
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
        IEM_SAVE_FLAGS       A2, %2, %3
        xor     eax, eax

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        movzx   T1_32, A2_16            ; T1 = divisor, zero extended for the 32-bit compares
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compare: 0x80000000 >> 15 = 0x00010000 does not fit 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16                   ; bits 16 thru 31 of T1 stay zero
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compare, see .one_of_each.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, T1_32            ; Top bit still set: the dividend was the most
        js      .div_overflow           ; negative value, which never fits the result.
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, T1_32            ; Most negative dividend, see .one_of_each.
        js      .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        ; Both branches below load eax themselves, so no load is done up here.
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; drop the divisor saved by the signed range check
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1, T1                  ; Top bit still set: the dividend was the most
        js      .div_overflow           ; negative value, which never fits the result.
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1, T1                  ; Most negative dividend, see .one_of_each.
        js      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        ; Both branches below load rax themselves, so no load is done up here.
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; drop the divisor saved by the signed range check
  %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags, signed indicator.
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1744
1745
1746	;
1747	; BSWAP. No flag changes.
1748	;
1749	; Each function takes one argument, pointer to the value to bswap
1750	; (input/output). They all return void.
1751	;
; 16-bit variant.  A0 = pointer to the value.  A full dword is loaded and
; stored; the instruction executed is bswap with a hand emitted 66h operand
; size prefix.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db      66h                     ; operand size prefix for the bswap below
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1760
; 32-bit variant.  A0 = pointer to the dword that is byte swapped in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; load
        bswap   T0_32                   ; reverse the byte order
        mov     [A0], T0_32             ; store back
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1768
; 64-bit variant.  A0 = pointer to the qword that is byte swapped in place.
; 64-bit hosts use a single bswap; other hosts swap each dword and store the
; two halves in exchanged positions.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
%else
        mov     T0, [A0]                ; low dword
        mov     T1, [A0 + 4]            ; high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the high one
        mov     [A0], T1                ; swapped high dword becomes the low one
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
1787
1788
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The macro stores the current FPU environment at [xSP] with fnstenv, patches
; the FCW and FSW fields of that image and reloads it with fldenv:
;   - FCW is replaced by the guest FCW masked down to the exception mask,
;     precision control and rounding control bits.
;   - FSW keeps the TOP field of the stored image and takes the condition code
;     bits (X86_FSW_C_MASK) from the guest FSW; all other bits are zero.
;
; The caller must have reserved the stack space at [xSP] beforehand.
;
; @uses     T0, T1, eflags and the environment image at [xSP].
;           NOTE(review): this was documented as 24 bytes of stack, but the
;           32-bit protected mode fnstenv image is 28 bytes - confirm.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; image of the current environment to patch

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK              ; guest condition codes only
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK            ; TOP from the stored image only
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; activate the patched environment
%endmacro
1814
1815
;;
; FPU result consisting of one 80-bit value and an output FSW.  The helpers in
; this file store into it via IEMFPURESULT.r80Result and IEMFPURESULT.FSW.
;
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 5 words = 10 bytes, the 80-bit result
    .FSW        resw 1                  ; the output FPU status word
endstruc
1823
1824
;;
; FPU result consisting of two 80-bit values with an output FSW in between.
;
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; 5 words = 10 bytes, first 80-bit result
    .FSW        resw 1                  ; the output FPU status word
    .r80Result2 resw 5                  ; 5 words = 10 bytes, second 80-bit result
endstruc
1833
1834
1835	;
1836	;---------------------- 16-bit signed integer operations ----------------------
1837	;
1838
1839
;;
; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
;
; The FPU is reset, the guest FCW and the safe FSW bits are loaded, and the
; integer is loaded with fild.  The resulting FSW is stored first, pending
; exceptions are cleared, and then the 80-bit value is stored.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i16_to_r80
1863
1864
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is converted and popped with fistp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
1888
1889
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is converted and popped with fisttp.
;
; The store uses a word sized operand so that exactly two bytes are written to
; A2 and the conversion is range checked against 16 bits.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; 16-bit destination, not dword

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
1914
1915
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; The generated function loads the 80-bit value onto a freshly reset FPU,
; applies the guest FCW and the safe FSW bits, executes the instruction with
; the 16-bit memory operand, and returns the FSW followed by the 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
1952
1953
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; Same setup as the value returning variant, but nothing is popped or stored
; apart from the status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
1984
1985
1986
1987	;
1988	;---------------------- 32-bit signed integer operations ----------------------
1989	;
1990
1991
;;
; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
;
; The FPU is reset, the guest FCW and the safe FSW bits are loaded, and the
; integer is loaded with fild.  The resulting FSW is stored first, pending
; exceptions are cleared, and then the 80-bit value is stored.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i32_to_r80
2015
2016
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is converted and popped with fistp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2040
2041
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is converted and popped with fisttp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2066
2067
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; The generated function loads the 80-bit value onto a freshly reset FPU,
; applies the guest FCW and the safe FSW bits, executes the instruction with
; the 32-bit memory operand, and returns the FSW followed by the 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2104
2105
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; Same setup as the value returning variant, but nothing is popped or stored
; apart from the status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2136
2137
2138
2139	;
2140	;---------------------- 64-bit signed integer operations ----------------------
2141	;
2142
2143
;;
; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
;
; The FPU is reset, the guest FCW and the safe FSW bits are loaded, and the
; integer is loaded with fild.  The resulting FSW is stored first, pending
; exceptions are cleared, and then the 80-bit value is stored.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i64_to_r80
2167
2168
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is converted and popped with fistp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2192
2193
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is converted and popped with fisttp.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2218
2219
2220
2221	;
2222	;---------------------- 32-bit floating point operations ----------------------
2223	;
2224
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; The FPU is reset, the guest FCW and the safe FSW bits are loaded, and the
; value is loaded with fld.  The resulting FSW is stored first, pending
; exceptions are cleared, and then the 80-bit value is stored.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r32_to_r80
2248
2249
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; The value is loaded onto a freshly reset FPU, the guest FCW and the safe FSW
; bits are applied, and the value is stored with fst (no pop).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2273
2274
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; The generated function loads the 80-bit value onto a freshly reset FPU,
; applies the guest FCW and the safe FSW bits, executes the instruction with
; the 32-bit memory operand, and returns the FSW followed by the 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions before storing the value
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2311
2312
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; Same setup as the value returning variant, but nothing is popped or stored
; apart from the status word.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its reset state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
2343
2344
2345
2346	;
2347	;---------------------- 64-bit floating point operations ----------------------
2348	;
2349
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        ; Start from an initialised x87 state like the sibling helpers do
        ; (compare iemAImpl_fld_r80_from_r80), so the fld below always has an
        ; empty register to push into instead of depending on whatever the
        ; host FPU stack/tag state happened to be.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        ; Save FSW, clear the exception flags, then pop the converted value.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r64_to_r80
2372
2373
;;
; Stores an 80-bit floating point value (register image) to memory as a
; 64-bit one.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        ; Input value into ST0 of a clean FPU, then apply the context's
        ; control/status words (macro defined elsewhere in this file).
        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        ; Convert ST0 to 64 bits and write it to the destination.
        fst     qword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
2397
2398
;;
; Template for x87 arithmetic taking ST0 (80-bit) and a 64-bit memory operand.
;
; Emits iemAImpl_<instruction>_r80_by_r64.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT receiving FSW and the 80-bit result.
; @param    A2      Pointer to the 80-bit value (loaded as ST0).
; @param    A3      Pointer to the 64-bit value (memory operand).
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        ; Clean FPU, first operand in ST0, then the context's FCW/FSW.
        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        ; Status word first, then clear exceptions and pop the result.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
2435
;;
; Template for x87 instructions taking ST0 (80-bit) and a 64-bit memory
; operand that only produce a status word.
;
; Emits iemAImpl_<instruction>_r80_by_r64.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value (loaded as ST0).
; @param    A3      Pointer to the 64-bit value (memory operand).
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        ; FSW is the only output of this helper.
        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
2466
2467
2468
2469	;
2470	;---------------------- 80-bit floating point operations ----------------------
2471	;
2472
;;
; Loads an 80-bit floating point value from memory and returns it together
; with the resulting FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        ; Here the context's control/status words are applied before the
        ; load, so the fld itself runs under them.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        ; Record FSW, clear the exception flags, then pop the loaded value.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
2496
2497
;;
; Stores an 80-bit floating point register value to memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        ; The 80-bit store is done with the popping form (fstp).
        fstp    tword [A2]

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
2521
2522
;;
; Template for x87 instructions operating on two 80-bit values, returning
; the result left in ST0.
;
; Emits iemAImpl_<instruction>_r80_by_r80.
;
; @param    1       The instruction mnemonic.
; @param    2       The operand list appended to the instruction, in braces;
;                   empty for instructions with implicit operands.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        ; Push the second value first so that it ends up in ST1 and the
        ; first value in ST0.
        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        ; Status word, exception flags cleared, then the value in ST0.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
2563
2564
;;
; Template for x87 instructions that combine ST1 and ST0, leave the result
; in ST1 and pop the stack.
;
; Emits iemAImpl_<instruction>_r80_by_r80.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        ; Load order is the reverse of the plain two-operand template:
        ; A2 is pushed first (becomes ST1), A3 second (becomes ST0).
        fninit
        fld     tword [A2]
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; After the instruction's pop the result sits on top of the stack.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
2600
2601
;;
; Template for x87 instructions taking two 80-bit values and producing
; nothing but a status word.
;
; Emits iemAImpl_<instruction>_r80_by_r80.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        ; Second value is pushed first (ST1), first value last (ST0).
        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
2634
2635
;;
; Template for x87 instructions taking two 80-bit values and reporting the
; outcome through both FSW and EFLAGS.
;
; Emits iemAImpl_<instruction>_r80_by_r80.
;
; @param    1       The instruction mnemonic.
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        ; Second value is pushed first (ST1), first value last (ST0).
        fninit
        fld     tword [A3]
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        ; Grab the flags right away: the add further down modifies them.
        fnstsw  word  [A1]
        pushf
        pop     xAX

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
2671
2672
;;
; Template for x87 instructions that transform a single 80-bit value in ST0.
;
; Emits iemAImpl_<instruction>_r80.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        ; Operand into ST0 of a clean FPU, then the context's FCW/FSW.
        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word, exception flags cleared, then pop the result.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
2709
2710
;;
; Template for x87 instructions that examine a single 80-bit value in ST0
; and only produce a status word.
;
; Emits iemAImpl_<instruction>_r80.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; FSW is the only output of this helper.
        fnstsw  word  [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst
IEMIMPL_FPU_R80_FSW fxam
2741
2742
2743
;;
; FPU instruction loading a 80-bit floating point constant.
;
; Emits iemAImpl_<instruction>.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        ; No input operand: the instruction pushes its constant onto the
        ; freshly initialised stack.
        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; Status word, exception flags cleared, then pop the constant.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
        ; The procedure name has no suffix, so the name expression ends at
        ; the mnemonic and matches the BEGINPROC_FASTCALL line above.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
2778
2779
;;
; Template for x87 instructions that take one 80-bit value and leave two
; values on the stack.
;
; Emits iemAImpl_<instruction>_r80_r80.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; The value on top of the stack is stored as r80Result2, the one
        ; below it as r80Result1; exception flags are cleared before each pop.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
2814
2815
2816
2817
2818	;---------------------- SSE and MMX Operations ----------------------
2819
;;
; Hooks expanded at the start and end of every MMX helper body.
; Both are currently empty.
;
;; @todo what do we need to do for MMX?
;
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;;
; Hooks expanded at the start and end of every SSE helper body.
; Both are currently empty.
;
;; @todo what do we need to do for SSE?
;
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro
2831
2832
;;
; Template for media instructions combining two full sized registers.
;
; Emits iemAImpl_<instruction>_u64 (MMX) and iemAImpl_<instruction>_u128 (SSE).
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 1
; 64-bit (MMX) variant: operate in mm0/mm1, write mm0 back to [A1].
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

; 128-bit (SSE) variant: unaligned loads/stores, operate in xmm0/xmm1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
2874
2875
;;
; Template for media instructions combining one full sized register with the
; lower half of a second one.
;
; Emits iemAImpl_<instruction>_u128 and, when requested, iemAImpl_<instruction>_u64.
;
; @param    1       The instruction mnemonic.
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first full sized media register operand (input/output).
; @param    A2      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit (MMX) variant: only 32 bits of the second operand are loaded (movd).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movd    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit (SSE) variant: only 64 bits of the second operand are loaded (movq).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movq    xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
2920
2921
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; Emits iemAImpl_<instruction>_u128 and, when requested, iemAImpl_<instruction>_u64.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first full sized media register operand (input/output).
; @param    A2      Pointer to the second full sized media register operand, where we
;                   will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
; 64-bit (MMX) variant: the whole second operand is loaded so that its
; upper half is available to the instruction.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit (SSE) variant: full unaligned load of the second operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

; The high-half unpack instructions must be generated from the F1H1 template:
; the low-half template (IEMIMPL_MEDIA_F1L1) loads only the lower half of the
; second operand, so the upper half these instructions consume would be zero.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
2967
2968
;
; Shufflers with evil 8-bit immediates.
;
; The immediate has to be encoded in the instruction, so each helper carries
; a table of 256 'shuffle + ret' stubs, one per immediate value, and calls
; the stub selected by A3.
;

;;
; MMX word shuffle with the immediate supplied at run time.
;
; A0 is not referenced by the body below.
; @param    A1      Pointer to the 64-bit destination operand (input/output).
; @param    A2      Pointer to the 64-bit source operand (input).
; @param    A3      The 8-bit immediate.
;                   NOTE(review): used unmasked as the table index, so this
;                   assumes the caller passes it zero extended - confirm.
;
BEGINPROC_FASTCALL iemAImpl_pshufw, 16
        PROLOGUE_4_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        lea     T0, [A3 + A3*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table
        lea     T1, [T1 + T0]           ; T1 = stub for this immediate
        call    T1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table: entry N performs the shuffle with immediate N and returns.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
        ; Build-time size check: both words evaluate to 0xffff only when the
        ; table is exactly 0x500 bytes long.
        dw 0xfaff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x104ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw
2998
2999
;;
; Template for the SSE shuffles taking an 8-bit immediate that is only known
; at run time.
;
; Emits iemAImpl_<instruction>.  The body calls into a table of 256
; 'shuffle + ret' stubs, one per immediate value, indexed by A3.
;
; @param    1       The instruction
;
; A0 is not referenced by the body below.
; @param    A1      Pointer to the 128-bit destination operand (input/output).
; @param    A2      Pointer to the 128-bit source operand (input).
; @param    A3      The 8-bit immediate.
;                   NOTE(review): used unmasked as the table index, so this
;                   assumes the caller passes it zero extended - confirm.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table
        ; Each stub is 6 bytes, which is not a valid address scale factor,
        ; so the offset is computed as (A3 * 3) * 2 in two steps.
        lea     T0, [A3 + A3*2]         ; T0 = A3 * 3
        lea     T1, [T1 + T0*2]         ; T1 = table + A3 * 6
        call    T1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; Stub table: entry N performs the shuffle with immediate N and returns.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        ; Build-time size check: both words evaluate to 0xffff only when the
        ; table is exactly 0x600 bytes long.
        dw 0xf9ff  + (.immEnd - .imm0)  ; will cause warning if entries are too big.
        dw 0x105ff - (.immEnd - .imm0)  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3031
3032
3033	;
3034	; Move byte mask.
3035	;
3036
;;
; Extracts the byte sign mask of a 64-bit MMX operand.
;
; A0 is not referenced by the body below.
; @param    A1      Pointer to the destination; receives the mask (output).
; @param    A2      Pointer to the 64-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        ; pmovmskb writes the whole destination register (the bits above the
        ; mask are zeroed), so the old value at A1 need not be loaded first.
        movq    mm1, [A2]
        pmovmskb T0, mm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        ; T0 only covers the low dword on 32-bit hosts; clear the upper one.
        mov     dword [A1 + 4], 0
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64
3051
;;
; Extracts the byte sign mask of a 128-bit SSE operand.
;
; A0 is not referenced by the body below.
; @param    A1      Pointer to the destination; receives the mask (output).
; @param    A2      Pointer to the 128-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; pmovmskb writes the whole destination register (the bits above the
        ; mask are zeroed), so the old value at A1 need not be loaded first.
        movdqu  xmm1, [A2]
        pmovmskb T0, xmm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        ; T0 only covers the low dword on 32-bit hosts; clear the upper one.
        mov     dword [A1 + 4], 0
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3066

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 93725

Download in other formats: