IEMAllAImpl.asm@ 66250

Last change on this file since 66250 was 65506, checked in by vboxsync, 8 years ago
IEM: cmpxchg16v implementation (disabled).
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 81.6 KB

Line
1	; $Id: IEMAllAImpl.asm 65506 2017-01-29 14:25:45Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	; Copyright (C) 2011-2016 Oracle Corporation
7	;
8	; This file is part of VirtualBox Open Source Edition (OSE), as
9	; available from http://www.virtualbox.org. This file is free software;
10	; you can redistribute it and/or modify it under the terms of the GNU
11	; General Public License (GPL) as published by the Free Software
12	; Foundation, in version 2 as it comes in the "COPYING" file of the
13	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15	;
16
17
18	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
19	; Header Files ;
20	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
21	%include "VBox/asmdefs.mac"
22	%include "VBox/err.mac"
23	%include "iprt/x86.mac"
24
25
26	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
27	; Defined Constants And Macros ;
28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
;;
; RET XX / RET wrapper for fastcall.
;
; Emits 'ret %1' (popping %1 bytes of stack arguments) only when building for
; 32-bit x86 on Windows; every other configuration gets a plain 'ret'.
;
; @param 1   Number of bytes to pop on return (32-bit Windows only).
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret                             ; not 32-bit x86
 %else
  %ifndef RT_OS_WINDOWS
        ret                             ; 32-bit x86, not Windows
  %else
        ret     %1                      ; 32-bit x86 on Windows
  %endif
 %endif
%endmacro
44
;;
; NAME for fastcall functions.
;
; On 32-bit Windows the symbol is built as <prefix><name>@<cbArgs>; everywhere
; else it is simply NAME(<name>).
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
; @param a_Name     The function name (C).
; @param a_cbArgs   The argument size (only used on 32-bit Windows).
; @param a_Prefix   The symbol prefix (only used on 32-bit Windows).
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
59
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global directives applicable to the current object format
; and then the label that opens the function.
;
; @param 1   The function name (C).
; @param 2   The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
  export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):                 ; the label itself uses the plain '@' prefix
%endmacro
80
81
82	;
83	; We employ some macro assembly here to hide the calling convention differences.
84	;
85	%ifdef RT_ARCH_AMD64
;
; AMD64, one argument: the argument is in a register (A0) and no callee-saved
; register is touched, so the prologue is empty and the epilogue is a bare ret.
;
%macro PROLOGUE_1_ARGS 0
%endmacro
%macro EPILOGUE_1_ARGS 0
        ret
%endmacro
; Accepts an optional byte count so that the invocation form used by the 32-bit
; variant and by the other EPILOGUE_n_ARGS_EX macros (one parameter) also
; assembles on AMD64.  The count is ignored here.
%macro EPILOGUE_1_ARGS_EX 0-1
        ret
%endmacro
94
;
; AMD64, two to four arguments: A0..A3 are all registers, so every prologue is
; empty and every epilogue is a bare ret.  The _EX variants take the x86 stack
; byte count for source compatibility and ignore it.
;

; -- two arguments --
%macro PROLOGUE_2_ARGS 0
%endmacro
%macro EPILOGUE_2_ARGS_EX 1
        ret
%endmacro
%macro EPILOGUE_2_ARGS 0
        ret
%endmacro

; -- three arguments --
%macro PROLOGUE_3_ARGS 0
%endmacro
%macro EPILOGUE_3_ARGS_EX 1
        ret
%endmacro
%macro EPILOGUE_3_ARGS 0
        ret
%endmacro

; -- four arguments --
%macro PROLOGUE_4_ARGS 0
%endmacro
%macro EPILOGUE_4_ARGS_EX 1
        ret
%endmacro
%macro EPILOGUE_4_ARGS 0
        ret
%endmacro
121
%ifdef ASM_CALL64_GCC
 ; Argument register aliases for the GCC 64-bit calling convention:
 ; A0..A3 = rdi, rsi, rdx, rcx.  No 8-bit alias is defined for A3.

 ; full width
 %define A0         rdi
 %define A1         rsi
 %define A2         rdx
 %define A3         rcx

 ; 32-bit views
 %define A0_32      edi
 %define A1_32      esi
 %define A2_32      edx
 %define A3_32      ecx

 ; 16-bit views
 %define A0_16      di
 %define A1_16      si
 %define A2_16      dx
 %define A3_16      cx

 ; 8-bit views
 %define A0_8       dil
 %define A1_8       sil
 %define A2_8       dl
%endif
142
%ifdef ASM_CALL64_MSC
 ; Argument register aliases for the Microsoft 64-bit calling convention:
 ; A0..A3 = rcx, rdx, r8, r9.  No 8-bit alias is defined for A3.

 ; full width
 %define A0         rcx
 %define A1         rdx
 %define A2         r8
 %define A3         r9

 ; 32-bit views
 %define A0_32      ecx
 %define A1_32      edx
 %define A2_32      r8d
 %define A3_32      r9d

 ; 16-bit views
 %define A0_16      cx
 %define A1_16      dx
 %define A2_16      r8w
 %define A3_16      r9w

 ; 8-bit views
 %define A0_8       cl
 %define A1_8       dl
 %define A2_8       r8b
%endif
163
; Temporary register aliases (AMD64): T0 = rax, T1 = r11.
%define T0          rax
%define T1          r11

%define T0_32       eax
%define T1_32       r11d

%define T0_16       ax
%define T1_16       r11w

%define T0_8        al
%define T1_8        r11b
173
174	%else
175	; x86
;
; 32-bit x86, one or two arguments.  Both arguments are in registers (A0/A1);
; the prologue only preserves edi, which serves as T1 on this architecture.
;

; -- one argument --
%macro PROLOGUE_1_ARGS 0
        push    edi
%endmacro
%macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro
%macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
%endmacro

; -- two arguments --
%macro PROLOGUE_2_ARGS 0
        push    edi
%endmacro
%macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro
%macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
%endmacro
199
;
; 32-bit x86, three arguments.  The third argument lives on the stack and is
; loaded into ebx (A2); ebx and edi (T1) are preserved around the function.
;
%macro PROLOGUE_3_ARGS 0
        push    ebx
        push    edi
        mov     ebx, [esp + 8 + 4]      ; 8 = the two pushes, 4 = return address
%endmacro
%macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
%endmacro
; @param 1   Bytes of stack arguments to pop on return (at least 4).
%macro EPILOGUE_3_ARGS_EX 1
 %if (%1) < 4
  %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     edi
        pop     ebx
        ret     %1
%endmacro
216
;
; 32-bit x86, four arguments.  Arguments three and four live on the stack and
; are loaded into ebx (A2) and esi (A3); ebx, edi (T1) and esi are preserved.
;
%macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        ; 12 = the three pushes, 4 = return address.
        mov     esi, [esp + 12 + 4 + 4] ; fourth argument
        mov     ebx, [esp + 12 + 4 + 0] ; third argument
%endmacro
%macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
%endmacro
; @param 1   Bytes of stack arguments to pop on return (at least 8).
%macro EPILOGUE_4_ARGS_EX 1
 %if (%1) < 8
  %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
%endmacro
236
; Argument register aliases (32-bit x86): A0/A1 = ecx/edx; A2/A3 = ebx/esi as
; loaded by PROLOGUE_3_ARGS / PROLOGUE_4_ARGS.  No 8-bit alias exists for A3.
%define A0          ecx
%define A1          edx
%define A2          ebx
%define A3          esi

%define A0_32       ecx
%define A1_32       edx
%define A2_32       ebx
%define A3_32       esi

%define A0_16       cx
%define A1_16       dx
%define A2_16       bx
%define A3_16       si

%define A0_8        cl
%define A1_8        dl
%define A2_8        bl

; Temporary register aliases (32-bit x86): T0 = eax, T1 = edi.
; No 8-bit alias exists for T1.
%define T0          eax
%define T0_32       eax
%define T0_16       ax
%define T0_8        al

%define T1          edi
%define T1_32       edi
%define T1_16       di
264	%endif
265
266
;;
; Load the guest's modified and undefined flag bits from [%1] into the host
; EFLAGS, leaving all other host flag bits as they were.
;
; Note: the '%if (%3) != 0' guard is commented out, so the load is currently
;       performed unconditionally.
;
; @remarks  Clobbers T0, stack. Changes EFLAGS.
; @param 1  The register (A0..A3 or similar) pointing to the eflags.
; @param 2  The set of modified flags.
; @param 3  The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                                   ; host flags -> top of stack
        and     dword [xSP], ~(%2 | %3)         ; clear the bits that come from the guest
        mov     T0_32, [%1]                     ; guest eflags
        and     T0_32, (%2 | %3)                ; keep only the modified + undefined bits
        or      [xSP], T0                       ; merge guest bits into the host image
        popf                                    ; activate the mixed flags
 ;%endif
%endmacro
286
;;
; Write the modified and undefined flag bits of the current host EFLAGS back
; into the guest eflags at [%1]; all other guest flag bits are preserved.
; Emits nothing when both masks are zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param 1  The register pointing to the EFLAGS.
; @param 2  The mask of modified flags to save.
; @param 3  The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                              ; T1 = host flags after the instruction
        and     T1_32, (%2 | %3)                ; the bits we are going to store
        mov     T0_32, [%1]                     ; current guest flags
        and     T0_32, ~(%2 | %3)               ; make room for the new bits
        or      T0_32, T1_32                    ; combine
        mov     [%1], T0_32                     ; store
 %endif
%endmacro
306
307
;;
; Macro for implementing a binary operator.
;
; Generates iemAImpl_<op>_u8/_u16/_u32 and, on AMD64 hosts, _u64.  When
; parameter 2 is non-zero, matching '_locked' functions using a lock prefix are
; generated as well.  64-bit accesses on 32-bit hosts need hand coding and are
; not produced here.
;
; Every generated function takes:
;   A0 - pointer to the destination memory operand,
;   A1 - the source register operand,
;   A2 - pointer to eflags.
;
; @param 1   The instruction mnemonic.
; @param 2   Non-zero if there should be a locked version.
; @param 3   The modified flags.
; @param 4   The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

;
; Locked variants (only when requested).
;
 %if %2 != 0

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags,                                                                        undefined-flags
IEMIMPL_BIN_OP add,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,    1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,   0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test,  0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
407
408
;;
; Macro for implementing a bit operator.
;
; Generates iemAImpl_<op>_u16/_u32 and, on AMD64 hosts, _u64.  When parameter 2
; is non-zero, matching '_locked' functions using a lock prefix are generated
; as well.  64-bit accesses on 32-bit hosts need hand coding and are not
; produced here.
;
; Every generated function takes:
;   A0 - pointer to the destination memory operand,
;   A1 - the source register operand,
;   A2 - pointer to eflags.
;
; @param 1   The instruction mnemonic.
; @param 2   Non-zero if there should be a locked version.
; @param 3   The modified flags.
; @param 4   The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

;
; Locked variants (only when requested).
;
 %if %2 != 0

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,    0,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc,   1,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts,   1,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr,   1,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
485
;;
; Macro for implementing a bit search operator.
;
; This is the three-parameter overload of IEMIMPL_BIT_OP (the assembler picks
; the macro by parameter count).  It generates iemAImpl_<op>_u16/_u32 and, on
; AMD64 hosts, _u64.  64-bit accesses on 32-bit hosts need hand coding.
;
; Every generated function takes:
;   A0 - pointer to the destination operand,
;   A1 - the source operand,
;   A2 - pointer to eflags.
; The destination is only written when the instruction leaves ZF clear.
;
; @param 1   The instruction mnemonic.
; @param 2   The modified flags.
; @param 3   The undefined flags.
;
%macro IEMIMPL_BIT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        jz      .dst_unchanged          ; ZF set: skip the store
        mov     [A0], T0_16
.dst_unchanged:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        jz      .dst_unchanged          ; ZF set: skip the store
        mov     [A0], T0_32
.dst_unchanged:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        jz      .dst_unchanged          ; ZF set: skip the store
        mov     [A0], T0
.dst_unchanged:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;              instr, modified-flags, undefined-flags
IEMIMPL_BIT_OP bsf,   (X86_EFL_ZF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
IEMIMPL_BIT_OP bsr,   (X86_EFL_ZF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
538
539
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Two-operand form: A0 points to the first factor and receives the product,
; A1 holds the second factor, A2 points to eflags.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1_16, word [A0]        ; the product lands in the register...
        mov     [A0], A1_16             ; ...and is then written back to the destination
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16
553
; 32-bit two-operand IMUL: [A0] = [A0] * A1, flags exchanged via [A2].
BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1_32, dword [A0]       ; the product lands in the register...
        mov     [A0], A1_32             ; ...and is then written back to the destination
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32
562
%ifdef RT_ARCH_AMD64
; 64-bit two-operand IMUL (AMD64 hosts only): [A0] = [A0] * A1, flags via [A2].
BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1, qword [A0]          ; the product lands in the register...
        mov     [A0], A1                ; ...and is then written back to the destination
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64
%endif ; RT_ARCH_AMD64
573
574
;
; XCHG for memory operands.  This implies locking.  No flag changes.
;
; Each function takes two arguments, first the pointer to the memory (A0),
; then the pointer to the register value (A1).  They all return void.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; fetch the register value
        xchg    T0_8, [A0]              ; swap it with memory
        mov     [A1], T0_8              ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8
589
; 16-bit XCHG: swaps the word at [A0] with the word at [A1] via T0.
BEGINPROC_FASTCALL iemAImpl_xchg_u16, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; fetch the register value
        xchg    T0_16, [A0]             ; swap it with memory
        mov     [A1], T0_16             ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16
597
; 32-bit XCHG: swaps the dword at [A0] with the dword at [A1] via T0.
BEGINPROC_FASTCALL iemAImpl_xchg_u32, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; fetch the register value
        xchg    T0_32, [A0]             ; swap it with memory
        mov     [A1], T0_32             ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32
605
%ifdef RT_ARCH_AMD64
; 64-bit XCHG (AMD64 hosts only): swaps the qword at [A0] with the one at [A1].
BEGINPROC_FASTCALL iemAImpl_xchg_u64, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; fetch the register value
        xchg    T0, [A0]                ; swap it with memory
        mov     [A1], T0                ; hand the old memory value back
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64
%endif
615
616
;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register (A0), then the pointer to the register (A1), and finally a
; pointer to eflags (A2).  They all return void.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; loaded after the flags macro, which clobbers T0
        xadd    [A0], T0_8
        mov     [A1], T0_8              ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
634
; 16-bit XADD: A0 = destination pointer, A1 = register pointer, A2 = eflags pointer.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; loaded after the flags macro, which clobbers T0
        xadd    [A0], T0_16
        mov     [A1], T0_16             ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
644
; 32-bit XADD: A0 = destination pointer, A1 = register pointer, A2 = eflags pointer.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; loaded after the flags macro, which clobbers T0
        xadd    [A0], T0_32
        mov     [A1], T0_32             ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
654
%ifdef RT_ARCH_AMD64
; 64-bit XADD (AMD64 hosts only): A0 = destination pointer, A1 = register
; pointer, A2 = eflags pointer.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; loaded after the flags macro, which clobbers T0
        xadd    [A0], T0
        mov     [A1], T0                ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
666
; 8-bit XADD with lock prefix: same operands as iemAImpl_xadd_u8.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; loaded after the flags macro, which clobbers T0
        lock xadd [A0], T0_8
        mov     [A1], T0_8              ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
676
; 16-bit XADD with lock prefix: same operands as iemAImpl_xadd_u16.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; loaded after the flags macro, which clobbers T0
        lock xadd [A0], T0_16
        mov     [A1], T0_16             ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
686
; 32-bit XADD with lock prefix: same operands as iemAImpl_xadd_u32.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; loaded after the flags macro, which clobbers T0
        lock xadd [A0], T0_32
        mov     [A1], T0_32             ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
696
%ifdef RT_ARCH_AMD64
; 64-bit XADD with lock prefix (AMD64 hosts only): same operands as
; iemAImpl_xadd_u64.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; loaded after the flags macro, which clobbers T0
        lock xadd [A0], T0
        mov     [A1], T0                ; write the value xadd left in T0 back to *A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
708
709
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; In every variant the sequence is: load EBX:ECX from pu64EbxEcx, load the
; guest ZF, load EAX:EDX from pu64EaxEdx (after the flag load, which clobbers
; eax), execute a locked cmpxchg8b on pu64Dst, write EAX:EDX back and save ZF.
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; In: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        push    rbx

        mov     r10, rcx                ; pu64Dst    (rcx is needed for the instruction)
        mov     r11, rdx                ; pu64EaxEdx (is also T1)

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; In: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        push    rbx

        mov     r11, rdx                ; pu64EbxEcx (is also T1)
        mov     r10, rcx                ; pEFlags

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; In: ecx = pu64Dst, edx = pu64EaxEdx, stack = pu64EbxEcx, pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     esi, edx                    ; pu64EaxEdx
        mov     edi, ecx                    ; pu64Dst
        ; 16 = the four pushes, 4 = return address.
        mov     ebp, [esp + 16 + 4 + 4]     ; pEFlags
        mov     ecx, [esp + 16 + 4 + 0]     ; pu64EbxEcx

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]              ; must come last, it overwrites the pointer
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b
799
; The plain implementation always uses the lock prefix already, so the locked
; entry point simply tail-jumps to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
804
805	%ifdef RT_ARCH_AMD64
806
;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; In both variants the sequence is: load RBX:RCX from pu128RbxRcx, load the
; guest ZF, load RAX:RDX from pu128RaxRdx (after the flag load, which clobbers
; eax), execute a locked cmpxchg16b on pu128Dst, write the full 64-bit RAX and
; RDX back and save ZF.
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; In: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; In: rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        ; Store all 64 bits of each half; 32-bit stores would leave the upper
        ; dwords of *pu128RaxRdx stale when the comparison fails.
        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
866
; The plain implementation always uses the lock prefix already, so the locked
; entry point simply tail-jumps to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
871
872	%endif ; RT_ARCH_AMD64
873
874
;
; CMPXCHG.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; Operands as used by the code:
;   A0 - pointer to the destination memory operand.
;   A1 - pointer to the accumulator value; read before and written back after
;        the instruction.
;   A2 - the register operand.  On 32-bit hosts the 64-bit variant receives a
;        pointer to it instead.
;   A3 - pointer to eflags.
;
; @param 1   Instruction prefix: empty or 'lock'.
; @param 2   Function name suffix: empty or '_locked'.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
 %ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
 %else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ; The lock prefix is always applied, regardless of parameter 1.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                    ; pu64Dst
        mov     esi, edx                    ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0]     ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4]     ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF clear means the comparison failed and edx:eax now holds the
        ; destination value; only then is the manual compare needed.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                    ; equal: just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; Compare the old accumulator (still at [esi]) against the destination
        ; value now in edx:eax, high dword first.
        cmp     [esi + 4], edx              ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

 %endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
972
;;
; Macro for implementing a unary operator.
;
; Generates iemAImpl_<op>_u8/_u16/_u32 and, on AMD64 hosts, _u64, each together
; with a '_locked' twin that uses a lock prefix.  64-bit accesses on 32-bit
; hosts need hand coding and are not produced here.
;
; Every generated function takes:
;   A0 - pointer to the destination memory operand,
;   A1 - pointer to eflags.
;
; @param 1   The instruction mnemonic.
; @param 2   The modified flags.
; @param 3   The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
;
; 8-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

;
; 16-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

;
; 32-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

;
; 64-bit (AMD64 hosts only)
;
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

;                instr, modified-flags, undefined-flags
IEMIMPL_UNARY_OP inc,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not,   0, 0
1061
1062
;;
; Macro for implementing memory fence operation.
;
; Generates iemAImpl_<instr>, a function that executes the given instruction
; and returns.  No return value, no operands or anything.
;
; @param 1   The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
1081
;;
; Alternative for non-SSE2 host.
;
; Performs an xchg between xAX and a scratch stack slot in place of a fence
; instruction.  xAX holds its original value again afterwards because the slot
; was filled with xAX just before the exchange.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; scratch slot = xAX
        xchg    xAX, [xSP]              ; register <-> memory exchange
        add     xSP, xCB                ; drop the scratch slot
        ret
ENDPROC iemAImpl_alt_mem_fence
1091
1092
1093
;;
; Macro for implementing a shift operation.
;
; Generates iemAImpl_<op>_u8/_u16/_u32 and, on AMD64 hosts, _u64.  64-bit
; accesses on 32-bit hosts need hand coding and are not produced here.
;
; Every generated function takes:
;   A0 - pointer to the destination memory operand,
;   A1 - the shift count,
;   A2 - pointer to eflags.
;
; The count has to end up in cl.  With the GCC convention cl is free (rcx is
; A3) and is loaded from A1.  Otherwise A0 is xCX, so A0 and A1 are swapped:
; the count lands in cl and the destination pointer in A1.
;
; @param 1   The instruction mnemonic.
; @param 2   The modified flags.
; @param 3   The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
1170
1171	IEMIMPL_SHIFT_OP rol, (X86_EFL_OF \| X86_EFL_CF), 0
1172	IEMIMPL_SHIFT_OP ror, (X86_EFL_OF \| X86_EFL_CF), 0
1173	IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF \| X86_EFL_CF), 0
1174	IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF \| X86_EFL_CF), 0
1175	IEMIMPL_SHIFT_OP shl, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1176	IEMIMPL_SHIFT_OP shr, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1177	IEMIMPL_SHIFT_OP sar, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1178
1179
1180	;;
1181	; Macro for implementing a double precision shift operation.
1182	;
1183	; This will generate code for the 16, 32 and 64 bit accesses, except on
1184	; 32-bit systems where the 64-bit accesses require hand coding.
1185	;
1186	; The functions take a pointer to the destination operand (r/m) in A0, the source
1187	; (reg) value in A1, the shift count in A2 and a pointer to the eflags variable in A3.
1188	;
1189	; @param 1 The instruction mnemonic.
1190	; @param 2 The modified flags.
1191	; @param 3 The undefined flags.
1192	;
1193	; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments (the count must end up in cl).
1194	;
1195	%macro IEMIMPL_SHIFT_DBL_OP 3
1196	BEGINCODE
1197	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16 ; 16-bit operands
1198	PROLOGUE_4_ARGS
1199	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1200	%ifdef ASM_CALL64_GCC
1201	xchg A3, A2 ; bring the count into cl (presumably A3 is xCX here)
1202	%1 [A0], A1_16, cl
1203	xchg A3, A2 ; swap back so A3 is the eflags pointer again for IEM_SAVE_FLAGS
1204	%else
1205	xchg A0, A2 ; count -> cl, destination pointer -> A2 (presumably A0 is xCX here)
1206	%1 [A2], A1_16, cl
1207	%endif
1208	IEM_SAVE_FLAGS A3, %2, %3
1209	EPILOGUE_4_ARGS
1210	ENDPROC iemAImpl_ %+ %1 %+ _u16
1211
1212	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16 ; 32-bit operands, same pattern as _u16
1213	PROLOGUE_4_ARGS
1214	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1215	%ifdef ASM_CALL64_GCC
1216	xchg A3, A2 ; count -> cl
1217	%1 [A0], A1_32, cl
1218	xchg A3, A2 ; restore A3 for IEM_SAVE_FLAGS
1219	%else
1220	xchg A0, A2 ; count -> cl, destination pointer -> A2
1221	%1 [A2], A1_32, cl
1222	%endif
1223	IEM_SAVE_FLAGS A3, %2, %3
1224	EPILOGUE_4_ARGS
1225	ENDPROC iemAImpl_ %+ %1 %+ _u32
1226
1227	%ifdef RT_ARCH_AMD64 ; the 64-bit variant is only assembled for 64-bit hosts
1228	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1229	PROLOGUE_4_ARGS
1230	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1231	%ifdef ASM_CALL64_GCC
1232	xchg A3, A2 ; count -> cl
1233	%1 [A0], A1, cl
1234	xchg A3, A2 ; restore A3 for IEM_SAVE_FLAGS
1235	%else
1236	xchg A0, A2 ; count -> cl, destination pointer -> A2
1237	%1 [A2], A1, cl
1238	%endif
1239	IEM_SAVE_FLAGS A3, %2, %3
1240	EPILOGUE_4_ARGS_EX 12
1241	ENDPROC iemAImpl_ %+ %1 %+ _u64
1242	%endif ; RT_ARCH_AMD64
1243
1244	%endmacro
1245
1246	IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF) ; AF is listed as undefined
1247	IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1248
1249
1250	;;
1251	; Macro for implementing multiplication operations.
1252	;
1253	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1254	; 32-bit systems where the 64-bit accesses require hand coding.
1255	;
1256	; The 8-bit function only operates on AX, so it takes no DX pointer (A0 = pointer to AX,
1257	; A1 = operand, A2 = pointer to eflags). The other functions take a pointer to rAX in A0,
1258	; a pointer to rDX in A1, the operand in A2 and a pointer to eflags in A3.
1259	;
1260	; The functions all return 0 so the caller can be used for div/idiv as well as
1261	; for the mul/imul implementation.
1262	;
1263	; @param 1 The instruction mnemonic.
1264	; @param 2 The modified flags.
1265	; @param 3 The undefined flags.
1266	;
1267	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1268	;
1269	%macro IEMIMPL_MUL_OP 3
1270	BEGINCODE
1271	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1272	PROLOGUE_3_ARGS
1273	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1274	mov al, [A0] ; al = low byte of the guest AX
1275	%1 A1_8
1276	mov [A0], ax ; the 16-bit product replaces the guest AX
1277	IEM_SAVE_FLAGS A2, %2, %3
1278	xor eax, eax ; return 0
1279	EPILOGUE_3_ARGS
1280	ENDPROC iemAImpl_ %+ %1 %+ _u8
1281
1282	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1283	PROLOGUE_4_ARGS
1284	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1285	mov ax, [A0]
1286	%ifdef ASM_CALL64_GCC
1287	%1 A2_16
1288	mov [A0], ax ; low half of the product
1289	mov [A1], dx ; high half of the product
1290	%else
1291	mov T1, A1 ; keep the rDX pointer in T1 (presumably A1 is xDX here and is overwritten by the product)
1292	%1 A2_16
1293	mov [A0], ax
1294	mov [T1], dx
1295	%endif
1296	IEM_SAVE_FLAGS A3, %2, %3
1297	xor eax, eax ; return 0
1298	EPILOGUE_4_ARGS
1299	ENDPROC iemAImpl_ %+ %1 %+ _u16
1300
1301	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1302	PROLOGUE_4_ARGS
1303	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1304	mov eax, [A0]
1305	%ifdef ASM_CALL64_GCC
1306	%1 A2_32
1307	mov [A0], eax ; low half of the product
1308	mov [A1], edx ; high half of the product
1309	%else
1310	mov T1, A1 ; keep the rDX pointer in T1 across the multiplication
1311	%1 A2_32
1312	mov [A0], eax
1313	mov [T1], edx
1314	%endif
1315	IEM_SAVE_FLAGS A3, %2, %3
1316	xor eax, eax ; return 0
1317	EPILOGUE_4_ARGS
1318	ENDPROC iemAImpl_ %+ %1 %+ _u32
1319
1320	%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1321	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1322	PROLOGUE_4_ARGS
1323	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1324	mov rax, [A0]
1325	%ifdef ASM_CALL64_GCC
1326	%1 A2
1327	mov [A0], rax ; low half of the product
1328	mov [A1], rdx ; high half of the product
1329	%else
1330	mov T1, A1 ; keep the rDX pointer in T1 across the multiplication
1331	%1 A2
1332	mov [A0], rax
1333	mov [T1], rdx
1334	%endif
1335	IEM_SAVE_FLAGS A3, %2, %3
1336	xor eax, eax ; return 0
1337	EPILOGUE_4_ARGS_EX 12
1338	ENDPROC iemAImpl_ %+ %1 %+ _u64
1339	%endif ; RT_ARCH_AMD64
1340
1341	%endmacro
1342
1343	IEMIMPL_MUL_OP mul, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF) ; only OF and CF are listed as modified
1344	IEMIMPL_MUL_OP imul, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
1345
1346
1347	BEGINCODE
1348	;;
1349	; Worker function for negating the 64-bit number held in T1:T0 (two 32-bit halves), computed as 0 - value.
1350	; @uses None (T0,T1)
1351	iemAImpl_negate_T0_T1_u32:
1352	push 0
1353	push 0 ; two zeroed stack slots
1354	xchg T0_32, [xSP] ; T0 = 0, first slot = low half
1355	xchg T1_32, [xSP + xCB] ; T1 = 0, second slot = high half
1356	sub T0_32, [xSP] ; T0 = 0 - low half
1357	sbb T1_32, [xSP + xCB] ; T1 = 0 - high half - borrow
1358	add xSP, xCB*2 ; release the two slots
1359	ret
1360
1361	%ifdef RT_ARCH_AMD64
1362	;;
1363	; Worker function for negating the 128-bit number held in T1:T0 (two 64-bit halves), computed as 0 - value.
1364	; @uses None (T0,T1)
1365	iemAImpl_negate_T0_T1_u64:
1366	push 0
1367	push 0 ; two zeroed stack slots
1368	xchg T0, [xSP] ; T0 = 0, first slot = low half
1369	xchg T1, [xSP + xCB] ; T1 = 0, second slot = high half
1370	sub T0, [xSP] ; T0 = 0 - low half
1371	sbb T1, [xSP + xCB] ; T1 = 0 - high half - borrow
1372	add xSP, xCB*2 ; release the two slots
1373	ret
1374	%endif
1375
1376
1377	;;
1378	; Macro for implementing division operations.
1379	;
1380	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1381	; 32-bit systems where the 64-bit accesses require hand coding.
1382	;
1383	; The 8-bit function only operates on AX, so it takes no DX pointer (A0 = pointer to AX,
1384	; A1 = divisor, A2 = pointer to eflags). The other functions take a pointer to rAX in A0,
1385	; a pointer to rDX in A1, the divisor in A2 and a pointer to eflags in A3.
1386	;
1387	; The functions all return 0 on success and -1 if a divide error should be
1388	; raised by the caller. The checks run before the real instruction is executed.
1389	;
1390	; @param 1 The instruction mnemonic.
1391	; @param 2 The modified flags.
1392	; @param 3 The undefined flags.
1393	; @param 4 1 if signed, 0 if unsigned.
1394	;
1395	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1396	;
1397	%macro IEMIMPL_DIV_OP 4
1398	BEGINCODE
1399	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1400	PROLOGUE_3_ARGS
1401
1402	; div by chainsaw check.
1403	test A1_8, A1_8
1404	jz .div_zero
1405
1406	; Overflow check - unsigned division is simple to verify, haven't
1407	; found a simple way to check signed division yet unfortunately.
1408	%if %4 == 0
1409	cmp [A0 + 1], A1_8 ; high byte of the dividend >= divisor: the quotient would not fit in 8 bits
1410	jae .div_overflow
1411	%else
1412	mov T0_16, [A0] ; T0 = dividend
1413	mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
1414	test A1_8, A1_8
1415	js .divisor_negative
1416	test T0_16, T0_16
1417	jns .both_positive
1418	neg T0_16 ; work on the magnitude of the dividend
1419	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1420	push T0 ; Start off like unsigned below.
1421	shr T0_16, 7
1422	cmp T0_8, A1_8
1423	pop T0 ; pop leaves the flags of the cmp intact
1424	jb .div_no_overflow
1425	ja .div_overflow
1426	and T0_8, 0x7f ; Special case for covering (divisor - 1).
1427	cmp T0_8, A1_8
1428	jae .div_overflow
1429	jmp .div_no_overflow
1430
1431	.divisor_negative:
1432	neg A1_8 ; work on the magnitude of the divisor
1433	test T0_16, T0_16
1434	jns .one_of_each
1435	neg T0_16
1436	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1437	shr T0_16, 7
1438	cmp T0_8, A1_8
1439	jae .div_overflow
1440	.div_no_overflow:
1441	mov A1, T1 ; restore divisor
1442	%endif
1443
1444	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1445	mov ax, [A0] ; ax = 16-bit dividend
1446	%1 A1_8
1447	mov [A0], ax ; store the result register pair back to the guest AX
1448	IEM_SAVE_FLAGS A2, %2, %3
1449	xor eax, eax ; return 0 = success
1450
1451	.return:
1452	EPILOGUE_3_ARGS
1453
1454	.div_zero:
1455	.div_overflow:
1456	mov eax, -1 ; tell the caller to raise a divide error
1457	jmp .return
1458	ENDPROC iemAImpl_ %+ %1 %+ _u8
1459
1460	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1461	PROLOGUE_4_ARGS
1462
1463	; div by chainsaw check.
1464	test A2_16, A2_16
1465	jz .div_zero
1466
1467	; Overflow check - unsigned division is simple to verify, haven't
1468	; found a simple way to check signed division yet unfortunately.
1469	%if %4 == 0
1470	cmp [A1], A2_16 ; high word of the dividend >= divisor: the quotient would not fit in 16 bits
1471	jae .div_overflow
1472	%else
1473	mov T0_16, [A1]
1474	shl T0_32, 16
1475	mov T0_16, [A0] ; T0 = dividend
1476	mov T1, A2 ; T1 = divisor
1477	test T1_16, T1_16
1478	js .divisor_negative
1479	test T0_32, T0_32
1480	jns .both_positive
1481	neg T0_32 ; work on the magnitude of the dividend
1482	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1483	push T0 ; Start off like unsigned below.
1484	shr T0_32, 15
1485	cmp T0_16, T1_16
1486	pop T0 ; pop leaves the flags of the cmp intact
1487	jb .div_no_overflow
1488	ja .div_overflow
1489	and T0_16, 0x7fff ; Special case for covering (divisor - 1).
1490	cmp T0_16, T1_16
1491	jae .div_overflow
1492	jmp .div_no_overflow
1493
1494	.divisor_negative:
1495	neg T1_16 ; only the T1 copy is negated, A2 keeps the original divisor
1496	test T0_32, T0_32
1497	jns .one_of_each
1498	neg T0_32
1499	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1500	shr T0_32, 15
1501	cmp T0_16, T1_16
1502	jae .div_overflow
1503	.div_no_overflow:
1504	%endif
1505
1506	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1507	%ifdef ASM_CALL64_GCC
1508	mov T1, A2 ; copy the divisor to T1 before dx is loaded (presumably A2 is xDX here)
1509	mov ax, [A0]
1510	mov dx, [A1]
1511	%1 T1_16
1512	mov [A0], ax
1513	mov [A1], dx
1514	%else
1515	mov T1, A1 ; keep the rDX pointer in T1 before dx is loaded (presumably A1 is xDX here)
1516	mov ax, [A0]
1517	mov dx, [T1]
1518	%1 A2_16
1519	mov [A0], ax
1520	mov [T1], dx
1521	%endif
1522	IEM_SAVE_FLAGS A3, %2, %3
1523	xor eax, eax ; return 0 = success
1524
1525	.return:
1526	EPILOGUE_4_ARGS
1527
1528	.div_zero:
1529	.div_overflow:
1530	mov eax, -1 ; tell the caller to raise a divide error
1531	jmp .return
1532	ENDPROC iemAImpl_ %+ %1 %+ _u16
1533
1534	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1535	PROLOGUE_4_ARGS
1536
1537	; div by chainsaw check.
1538	test A2_32, A2_32
1539	jz .div_zero
1540
1541	; Overflow check - unsigned division is simple to verify, haven't
1542	; found a simple way to check signed division yet unfortunately.
1543	%if %4 == 0
1544	cmp [A1], A2_32 ; high dword of the dividend >= divisor: the quotient would not fit in 32 bits
1545	jae .div_overflow
1546	%else
1547	push A2 ; save A2 so we can modify it (we are out of regs on x86).
1548	mov T0_32, [A0] ; T0 = dividend low
1549	mov T1_32, [A1] ; T1 = dividend high
1550	test A2_32, A2_32
1551	js .divisor_negative
1552	test T1_32, T1_32
1553	jns .both_positive
1554	call iemAImpl_negate_T0_T1_u32
1555	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1556	push T0 ; Start off like unsigned below.
1557	shl T1_32, 1
1558	shr T0_32, 31
1559	or T1_32, T0_32 ; T1 = magnitude of the dividend shifted right by 31 bits
1560	cmp T1_32, A2_32
1561	pop T0 ; pop leaves the flags of the cmp intact
1562	jb .div_no_overflow
1563	ja .div_overflow
1564	and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
1565	cmp T0_32, A2_32
1566	jae .div_overflow
1567	jmp .div_no_overflow
1568
1569	.divisor_negative:
1570	neg A2_32 ; work on the magnitude; the original divisor is still on the stack
1571	test T1_32, T1_32
1572	jns .one_of_each
1573	call iemAImpl_negate_T0_T1_u32
1574	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1575	shl T1_32, 1
1576	shr T0_32, 31
1577	or T1_32, T0_32
1578	cmp T1_32, A2_32
1579	jae .div_overflow
1580	.div_no_overflow:
1581	pop A2 ; restore the original divisor
1582	%endif
1583
1584	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1585	mov eax, [A0] ; NOTE(review): looks redundant, eax is loaded again in both branches below
1586	%ifdef ASM_CALL64_GCC
1587	mov T1, A2 ; copy the divisor to T1 before edx is loaded (presumably A2 is xDX here)
1588	mov eax, [A0]
1589	mov edx, [A1]
1590	%1 T1_32
1591	mov [A0], eax
1592	mov [A1], edx
1593	%else
1594	mov T1, A1 ; keep the rDX pointer in T1 before edx is loaded (presumably A1 is xDX here)
1595	mov eax, [A0]
1596	mov edx, [T1]
1597	%1 A2_32
1598	mov [A0], eax
1599	mov [T1], edx
1600	%endif
1601	IEM_SAVE_FLAGS A3, %2, %3
1602	xor eax, eax ; return 0 = success
1603
1604	.return:
1605	EPILOGUE_4_ARGS
1606
1607	.div_overflow:
1608	%if %4 != 0
1609	pop A2 ; the signed check jumps here with the saved divisor still on the stack
1610	%endif
1611	.div_zero:
1612	mov eax, -1 ; tell the caller to raise a divide error
1613	jmp .return
1614	ENDPROC iemAImpl_ %+ %1 %+ _u32
1615
1616	%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1617	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1618	PROLOGUE_4_ARGS
1619
1620	test A2, A2 ; division by zero check
1621	jz .div_zero
1622	%if %4 == 0
1623	cmp [A1], A2 ; high qword of the dividend >= divisor: the quotient would not fit in 64 bits
1624	jae .div_overflow
1625	%else
1626	push A2 ; save A2 so we can modify it (we are out of regs on x86).
1627	mov T0, [A0] ; T0 = dividend low
1628	mov T1, [A1] ; T1 = dividend high
1629	test A2, A2
1630	js .divisor_negative
1631	test T1, T1
1632	jns .both_positive
1633	call iemAImpl_negate_T0_T1_u64
1634	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1635	push T0 ; Start off like unsigned below.
1636	shl T1, 1
1637	shr T0, 63
1638	or T1, T0 ; T1 = magnitude of the dividend shifted right by 63 bits
1639	cmp T1, A2
1640	pop T0 ; pop leaves the flags of the cmp intact
1641	jb .div_no_overflow
1642	ja .div_overflow
1643	mov T1, 0x7fffffffffffffff ; the mask is loaded into a register first as it is a 64-bit constant
1644	and T0, T1 ; Special case for covering (divisor - 1).
1645	cmp T0, A2
1646	jae .div_overflow
1647	jmp .div_no_overflow
1648
1649	.divisor_negative:
1650	neg A2 ; work on the magnitude; the original divisor is still on the stack
1651	test T1, T1
1652	jns .one_of_each
1653	call iemAImpl_negate_T0_T1_u64
1654	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1655	shl T1, 1
1656	shr T0, 63
1657	or T1, T0
1658	cmp T1, A2
1659	jae .div_overflow
1660	.div_no_overflow:
1661	pop A2 ; restore the original divisor
1662	%endif
1663
1664	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1665	mov rax, [A0] ; NOTE(review): looks redundant, rax is loaded again in both branches below
1666	%ifdef ASM_CALL64_GCC
1667	mov T1, A2 ; copy the divisor to T1 before rdx is loaded (presumably A2 is xDX here)
1668	mov rax, [A0]
1669	mov rdx, [A1]
1670	%1 T1
1671	mov [A0], rax
1672	mov [A1], rdx
1673	%else
1674	mov T1, A1 ; keep the rDX pointer in T1 before rdx is loaded (presumably A1 is xDX here)
1675	mov rax, [A0]
1676	mov rdx, [T1]
1677	%1 A2
1678	mov [A0], rax
1679	mov [T1], rdx
1680	%endif
1681	IEM_SAVE_FLAGS A3, %2, %3
1682	xor eax, eax ; return 0 = success
1683
1684	.return:
1685	EPILOGUE_4_ARGS_EX 12
1686
1687	.div_overflow:
1688	%if %4 != 0
1689	pop A2 ; the signed check jumps here with the saved divisor still on the stack
1690	%endif
1691	.div_zero:
1692	mov eax, -1 ; tell the caller to raise a divide error
1693	jmp .return
1694	ENDPROC iemAImpl_ %+ %1 %+ _u64
1695	%endif ; RT_ARCH_AMD64
1696
1697	%endmacro
1698
1699	IEMIMPL_DIV_OP div, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0 ; unsigned; no modified flags, all six listed as undefined
1700	IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 1 ; signed variant
1701
1702
1703	;
1704	; BSWAP. No flag changes.
1705	;
1706	; Each function takes one argument, a pointer to the value to bswap
1707	; (input/output). They all return void.
1708	;
1709	BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1710	PROLOGUE_1_ARGS
1711	mov T0_32, [A0] ; just in case any of the upper bits are used.
1712	db 66h ; hand-emitted operand-size prefix so the bswap below executes in its 16-bit form
1713	bswap T0_32
1714	mov [A0], T0_32 ; the full 32 bits are written back
1715	EPILOGUE_1_ARGS
1716	ENDPROC iemAImpl_bswap_u16
1717
1718	BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4 ; A0 = pointer to the 32-bit value (input/output)
1719	PROLOGUE_1_ARGS
1720	mov T0_32, [A0] ; load the value
1721	bswap T0_32 ; reverse its byte order
1722	mov [A0], T0_32 ; store it back in place
1723	EPILOGUE_1_ARGS
1724	ENDPROC iemAImpl_bswap_u32
1725
1726	BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4 ; A0 = pointer to the 64-bit value (input/output)
1727	%ifdef RT_ARCH_AMD64
1728	PROLOGUE_1_ARGS
1729	mov T0, [A0]
1730	bswap T0 ; single 64-bit byte swap
1731	mov [A0], T0
1732	EPILOGUE_1_ARGS
1733	%else
1734	PROLOGUE_1_ARGS
1735	mov T0, [A0] ; T0 = low dword
1736	mov T1, [A0 + 4] ; T1 = high dword
1737	bswap T0
1738	bswap T1
1739	mov [A0 + 4], T0 ; the swapped low dword becomes the high dword
1740	mov [A0], T1 ; the swapped high dword becomes the low dword
1741	EPILOGUE_1_ARGS
1742	%endif
1743	ENDPROC iemAImpl_bswap_u64
1744
1745
1746	;;
1747	; Initialize the FPU for the actual instruction being emulated, this means
1748	; loading parts of the guest's control word and status word.
1749	;
1750	; @uses The FNSTENV image at [xSP] plus T0 and T1. NOTE(review): this said 24 bytes of stack, but the 32-bit protected-mode image should be 28 bytes - confirm.
1751	; @param 1 Expression giving the address of the FXSTATE of the guest.
1752	;
1753	%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
1754	fnstenv [xSP] ; snapshot the current FPU environment into the scratch area
1755
1756	; FCW - for exception, precision and rounding control.
1757	movzx T0, word [%1 + X86FXSTATE.FCW]
1758	and T0, X86_FCW_MASK_ALL \| X86_FCW_PC_MASK \| X86_FCW_RC_MASK
1759	mov [xSP + X86FSTENV32P.FCW], T0_16 ; patch the selected guest FCW bits into the image
1760
1761	; FSW - for undefined C0, C1, C2, and C3.
1762	movzx T1, word [%1 + X86FXSTATE.FSW]
1763	and T1, X86_FSW_C_MASK ; guest condition code bits only
1764	movzx T0, word [xSP + X86FSTENV32P.FSW]
1765	and T0, X86_FSW_TOP_MASK ; keep only TOP from the snapshot
1766	or T0, T1
1767	mov [xSP + X86FSTENV32P.FSW], T0_16
1768
1769	fldenv [xSP] ; activate the patched environment
1770	%endmacro
1771
1772
1773	;;
1774	; Layout of the FPU result (80-bit value followed by FSW). Need to move this as well somewhere better?
1775	;
1776	struc IEMFPURESULT
1777	.r80Result resw 5 ; 5 words = 10 bytes for the 80-bit result
1778	.FSW resw 1 ; the output FPU status word
1779	endstruc
1780
1781
1782	;;
1783	; Layout of an FPU result with two 80-bit values. Need to move this as well somewhere better?
1784	;
1785	struc IEMFPURESULTTWO
1786	.r80Result1 resw 5 ; 5 words = 10 bytes for the first 80-bit result
1787	.FSW resw 1 ; the output FPU status word sits between the two values
1788	.r80Result2 resw 5 ; 5 words = 10 bytes for the second 80-bit result
1789	endstruc
1790
1791
1792	;
1793	;---------------------- 16-bit signed integer operations ----------------------
1794	;
1795
1796
1797	;;
1798	; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
1799	;
1800	; @param A0 FPU context (fxsave).
1801	; @param A1 Pointer to a IEMFPURESULT for the output.
1802	; @param A2 Pointer to the 16-bit signed integer value to convert.
1803	;
1804	BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
1805	PROLOGUE_3_ARGS
1806	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1807
1808	fninit ; start from a freshly initialized FPU
1809	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1810	fild word [A2]
1811
1812	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
1813	fnclex ; clear pending exceptions ahead of the store below
1814	fstp tword [A1 + IEMFPURESULT.r80Result]
1815
1816	fninit ; reset the FPU again before returning
1817	add xSP, 20h
1818	EPILOGUE_3_ARGS
1819	ENDPROC iemAImpl_fild_i16_to_r80
1820
1821
1822	;;
1823	; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
1824	;
1825	; @param A0 FPU context (fxsave).
1826	; @param A1 Where to return the output FSW.
1827	; @param A2 Where to store the 16-bit signed integer value.
1828	; @param A3 Pointer to the 80-bit value.
1829	;
1830	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
1831	PROLOGUE_4_ARGS
1832	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1833
1834	fninit
1835	fld tword [A3] ; load the value before the guest control word is applied
1836	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1837	fistp word [A2] ; the popping form is used for the store
1838
1839	fnstsw word [A1] ; report the resulting status word
1840
1841	fninit ; reset the FPU again before returning
1842	add xSP, 20h
1843	EPILOGUE_4_ARGS
1844	ENDPROC iemAImpl_fist_r80_to_i16
1845
1846
1847	;;
1848	; Store a 80-bit floating point value (register) as a 16-bit signed integer
1849	; (memory) with truncation.
1850	;
1851	; @param A0 FPU context (fxsave).
1852	; @param A1 Where to return the output FSW.
1853	; @param A2 Where to store the 16-bit signed integer value.
1854	; @param A3 Pointer to the 80-bit value.
1855	;
1856	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
1857	PROLOGUE_4_ARGS
1858	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1859
1860	fninit
1861	fld tword [A3] ; load the value before the guest control word is applied
1862	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1863	fisttp word [A2] ; 16-bit store: a dword store would write 4 bytes into the 2-byte destination and use the 32-bit range
1864
1865	fnstsw word [A1] ; report the resulting status word
1866
1867	fninit ; reset the FPU again before returning
1868	add xSP, 20h
1869	EPILOGUE_4_ARGS
1870	ENDPROC iemAImpl_fistt_r80_to_i16
1871
1872
1873	;;
1874	; FPU instruction working on one 80-bit and one 16-bit signed integer value.
1875	;
1876	; @param 1 The instruction
1877	;
1878	; @param A0 FPU context (fxsave).
1879	; @param A1 Pointer to a IEMFPURESULT for the output.
1880	; @param A2 Pointer to the 80-bit value.
1881	; @param A3 Pointer to the 16-bit value.
1882	;
1883	%macro IEMIMPL_FPU_R80_BY_I16 1
1884	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1885	PROLOGUE_4_ARGS
1886	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1887
1888	fninit
1889	fld tword [A2] ; the 80-bit operand goes on the FPU stack first
1890	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1891	%1 word [A3]
1892
1893	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
1894	fnclex ; clear pending exceptions ahead of the store below
1895	fstp tword [A1 + IEMFPURESULT.r80Result]
1896
1897	fninit ; reset the FPU again before returning
1898	add xSP, 20h
1899	EPILOGUE_4_ARGS
1900	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1901	%endmacro
1902
1903	IEMIMPL_FPU_R80_BY_I16 fiadd ; generates iemAImpl_fiadd_r80_by_i16, and so on below
1904	IEMIMPL_FPU_R80_BY_I16 fimul
1905	IEMIMPL_FPU_R80_BY_I16 fisub
1906	IEMIMPL_FPU_R80_BY_I16 fisubr
1907	IEMIMPL_FPU_R80_BY_I16 fidiv
1908	IEMIMPL_FPU_R80_BY_I16 fidivr
1909
1910
1911	;;
1912	; FPU instruction working on one 80-bit and one 16-bit signed integer value,
1913	; only returning FSW.
1914	;
1915	; @param 1 The instruction
1916	;
1917	; @param A0 FPU context (fxsave).
1918	; @param A1 Where to store the output FSW.
1919	; @param A2 Pointer to the 80-bit value.
1920	; @param A3 Pointer to the 16-bit value.
1921	;
1922	%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
1923	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1924	PROLOGUE_4_ARGS
1925	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1926
1927	fninit
1928	fld tword [A2] ; the 80-bit operand goes on the FPU stack first
1929	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1930	%1 word [A3]
1931
1932	fnstsw word [A1] ; only the status word is reported, no value is stored
1933
1934	fninit ; reset the FPU again before returning
1935	add xSP, 20h
1936	EPILOGUE_4_ARGS
1937	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1938	%endmacro
1939
1940	IEMIMPL_FPU_R80_BY_I16_FSW ficom ; generates iemAImpl_ficom_r80_by_i16
1941
1942
1943
1944	;
1945	;---------------------- 32-bit signed integer operations ----------------------
1946	;
1947
1948
1949	;;
1950	; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
1951	;
1952	; @param A0 FPU context (fxsave).
1953	; @param A1 Pointer to a IEMFPURESULT for the output.
1954	; @param A2 Pointer to the 32-bit signed integer value to convert.
1955	;
1956	BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
1957	PROLOGUE_3_ARGS
1958	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1959
1960	fninit ; start from a freshly initialized FPU
1961	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1962	fild dword [A2]
1963
1964	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
1965	fnclex ; clear pending exceptions ahead of the store below
1966	fstp tword [A1 + IEMFPURESULT.r80Result]
1967
1968	fninit ; reset the FPU again before returning
1969	add xSP, 20h
1970	EPILOGUE_3_ARGS
1971	ENDPROC iemAImpl_fild_i32_to_r80
1972
1973
1974	;;
1975	; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
1976	;
1977	; @param A0 FPU context (fxsave).
1978	; @param A1 Where to return the output FSW.
1979	; @param A2 Where to store the 32-bit signed integer value.
1980	; @param A3 Pointer to the 80-bit value.
1981	;
1982	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
1983	PROLOGUE_4_ARGS
1984	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
1985
1986	fninit
1987	fld tword [A3] ; load the value before the guest control word is applied
1988	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1989	fistp dword [A2] ; the popping form is used for the store
1990
1991	fnstsw word [A1] ; report the resulting status word
1992
1993	fninit ; reset the FPU again before returning
1994	add xSP, 20h
1995	EPILOGUE_4_ARGS
1996	ENDPROC iemAImpl_fist_r80_to_i32
1997
1998
1999	;;
2000	; Store a 80-bit floating point value (register) as a 32-bit signed integer
2001	; (memory) with truncation.
2002	;
2003	; @param A0 FPU context (fxsave).
2004	; @param A1 Where to return the output FSW.
2005	; @param A2 Where to store the 32-bit signed integer value.
2006	; @param A3 Pointer to the 80-bit value.
2007	;
2008	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2009	PROLOGUE_4_ARGS
2010	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2011
2012	fninit
2013	fld tword [A3] ; load the value before the guest control word is applied
2014	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2015	fisttp dword [A2] ; truncating store of a 32-bit integer
2016
2017	fnstsw word [A1] ; report the resulting status word
2018
2019	fninit ; reset the FPU again before returning
2020	add xSP, 20h
2021	EPILOGUE_4_ARGS
2022	ENDPROC iemAImpl_fistt_r80_to_i32
2023
2024
2025	;;
2026	; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2027	;
2028	; @param 1 The instruction
2029	;
2030	; @param A0 FPU context (fxsave).
2031	; @param A1 Pointer to a IEMFPURESULT for the output.
2032	; @param A2 Pointer to the 80-bit value.
2033	; @param A3 Pointer to the 32-bit value.
2034	;
2035	%macro IEMIMPL_FPU_R80_BY_I32 1
2036	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2037	PROLOGUE_4_ARGS
2038	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2039
2040	fninit
2041	fld tword [A2] ; the 80-bit operand goes on the FPU stack first
2042	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2043	%1 dword [A3]
2044
2045	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
2046	fnclex ; clear pending exceptions ahead of the store below
2047	fstp tword [A1 + IEMFPURESULT.r80Result]
2048
2049	fninit ; reset the FPU again before returning
2050	add xSP, 20h
2051	EPILOGUE_4_ARGS
2052	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2053	%endmacro
2054
2055	IEMIMPL_FPU_R80_BY_I32 fiadd ; generates iemAImpl_fiadd_r80_by_i32, and so on below
2056	IEMIMPL_FPU_R80_BY_I32 fimul
2057	IEMIMPL_FPU_R80_BY_I32 fisub
2058	IEMIMPL_FPU_R80_BY_I32 fisubr
2059	IEMIMPL_FPU_R80_BY_I32 fidiv
2060	IEMIMPL_FPU_R80_BY_I32 fidivr
2061
2062
2063	;;
2064	; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2065	; only returning FSW.
2066	;
2067	; @param 1 The instruction
2068	;
2069	; @param A0 FPU context (fxsave).
2070	; @param A1 Where to store the output FSW.
2071	; @param A2 Pointer to the 80-bit value.
2072	; @param A3 Pointer to the 32-bit value.
2073	;
2074	%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2075	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2076	PROLOGUE_4_ARGS
2077	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2078
2079	fninit
2080	fld tword [A2] ; the 80-bit operand goes on the FPU stack first
2081	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2082	%1 dword [A3]
2083
2084	fnstsw word [A1] ; only the status word is reported, no value is stored
2085
2086	fninit ; reset the FPU again before returning
2087	add xSP, 20h
2088	EPILOGUE_4_ARGS
2089	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2090	%endmacro
2091
2092	IEMIMPL_FPU_R80_BY_I32_FSW ficom ; generates iemAImpl_ficom_r80_by_i32
2093
2094
2095
2096	;
2097	;---------------------- 64-bit signed integer operations ----------------------
2098	;
2099
2100
2101	;;
2102	; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
2103	;
2104	; @param A0 FPU context (fxsave).
2105	; @param A1 Pointer to a IEMFPURESULT for the output.
2106	; @param A2 Pointer to the 64-bit signed integer value to convert.
2107	;
2108	BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
2109	PROLOGUE_3_ARGS
2110	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2111
2112	fninit ; start from a freshly initialized FPU
2113	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2114	fild qword [A2]
2115
2116	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
2117	fnclex ; clear pending exceptions ahead of the store below
2118	fstp tword [A1 + IEMFPURESULT.r80Result]
2119
2120	fninit ; reset the FPU again before returning
2121	add xSP, 20h
2122	EPILOGUE_3_ARGS
2123	ENDPROC iemAImpl_fild_i64_to_r80
2124
2125
2126	;;
2127	; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2128	;
2129	; @param A0 FPU context (fxsave).
2130	; @param A1 Where to return the output FSW.
2131	; @param A2 Where to store the 64-bit signed integer value.
2132	; @param A3 Pointer to the 80-bit value.
2133	;
2134	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2135	PROLOGUE_4_ARGS
2136	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2137
2138	fninit
2139	fld tword [A3] ; load the value before the guest control word is applied
2140	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2141	fistp qword [A2] ; the popping form is used for the store
2142
2143	fnstsw word [A1] ; report the resulting status word
2144
2145	fninit ; reset the FPU again before returning
2146	add xSP, 20h
2147	EPILOGUE_4_ARGS
2148	ENDPROC iemAImpl_fist_r80_to_i64
2149
2150
2151	;;
2152	; Store a 80-bit floating point value (register) as a 64-bit signed integer
2153	; (memory) with truncation.
2154	;
2155	; @param A0 FPU context (fxsave).
2156	; @param A1 Where to return the output FSW.
2157	; @param A2 Where to store the 64-bit signed integer value.
2158	; @param A3 Pointer to the 80-bit value.
2159	;
2160	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2161	PROLOGUE_4_ARGS
2162	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2163
2164	fninit
2165	fld tword [A3] ; load the value before the guest control word is applied
2166	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2167	fisttp qword [A2] ; truncating store of a 64-bit integer
2168
2169	fnstsw word [A1] ; report the resulting status word
2170
2171	fninit ; reset the FPU again before returning
2172	add xSP, 20h
2173	EPILOGUE_4_ARGS
2174	ENDPROC iemAImpl_fistt_r80_to_i64
2175
2176
2177
2178	;
2179	;---------------------- 32-bit floating point operations ----------------------
2180	;
2181
2182	;;
2183	; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2184	;
2185	; @param A0 FPU context (fxsave).
2186	; @param A1 Pointer to a IEMFPURESULT for the output.
2187	; @param A2 Pointer to the 32-bit floating point value to convert.
2188	;
2189	BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
2190	PROLOGUE_3_ARGS
2191	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2192
2193	fninit ; start from a freshly initialized FPU
2194	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2195	fld dword [A2]
2196
2197	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
2198	fnclex ; clear pending exceptions ahead of the store below
2199	fstp tword [A1 + IEMFPURESULT.r80Result]
2200
2201	fninit ; reset the FPU again before returning
2202	add xSP, 20h
2203	EPILOGUE_3_ARGS
2204	ENDPROC iemAImpl_fld_r32_to_r80
2205
2206
2207	;;
2208	; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2209	;
2210	; @param A0 FPU context (fxsave).
2211	; @param A1 Where to return the output FSW.
2212	; @param A2 Where to store the 32-bit value.
2213	; @param A3 Pointer to the 80-bit value.
2214	;
2215	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2216	PROLOGUE_4_ARGS
2217	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2218
2219	fninit
2220	fld tword [A3] ; load the value before the guest control word is applied
2221	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2222	fst dword [A2] ; non-popping store; the fninit below resets the FPU stack anyway
2223
2224	fnstsw word [A1] ; report the resulting status word
2225
2226	fninit ; reset the FPU again before returning
2227	add xSP, 20h
2228	EPILOGUE_4_ARGS
2229	ENDPROC iemAImpl_fst_r80_to_r32
2230
2231
2232	;;
2233	; FPU instruction working on one 80-bit and one 32-bit floating point value.
2234	;
2235	; @param 1 The instruction
2236	;
2237	; @param A0 FPU context (fxsave).
2238	; @param A1 Pointer to a IEMFPURESULT for the output.
2239	; @param A2 Pointer to the 80-bit value.
2240	; @param A3 Pointer to the 32-bit value.
2241	;
2242	%macro IEMIMPL_FPU_R80_BY_R32 1
2243	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2244	PROLOGUE_4_ARGS
2245	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2246
2247	fninit
2248	fld tword [A2] ; the 80-bit operand goes on the FPU stack first
2249	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2250	%1 dword [A3]
2251
2252	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
2253	fnclex ; clear pending exceptions ahead of the store below
2254	fstp tword [A1 + IEMFPURESULT.r80Result]
2255
2256	fninit ; reset the FPU again before returning
2257	add xSP, 20h
2258	EPILOGUE_4_ARGS
2259	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2260	%endmacro
2261
2262	IEMIMPL_FPU_R80_BY_R32 fadd ; generates iemAImpl_fadd_r80_by_r32, and so on below
2263	IEMIMPL_FPU_R80_BY_R32 fmul
2264	IEMIMPL_FPU_R80_BY_R32 fsub
2265	IEMIMPL_FPU_R80_BY_R32 fsubr
2266	IEMIMPL_FPU_R80_BY_R32 fdiv
2267	IEMIMPL_FPU_R80_BY_R32 fdivr
2268
2269
2270	;;
2271	; FPU instruction working on one 80-bit and one 32-bit floating point value,
2272	; only returning FSW.
2273	;
2274	; @param 1 The instruction
2275	;
2276	; @param A0 FPU context (fxsave).
2277	; @param A1 Where to store the output FSW.
2278	; @param A2 Pointer to the 80-bit value.
2279	; @param A3 Pointer to the 32-bit value.
2280	;
2281	%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
2282	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2283	PROLOGUE_4_ARGS
2284	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2285
2286	fninit
2287	fld tword [A2] ; the 80-bit operand goes on the FPU stack first
2288	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2289	%1 dword [A3]
2290
2291	fnstsw word [A1] ; only the status word is reported, no value is stored
2292
2293	fninit ; reset the FPU again before returning
2294	add xSP, 20h
2295	EPILOGUE_4_ARGS
2296	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2297	%endmacro
2298
2299	IEMIMPL_FPU_R80_BY_R32_FSW fcom ; generates iemAImpl_fcom_r80_by_r32
2300
2301
2302
2303	;
2304	;---------------------- 64-bit floating point operations ----------------------
2305	;
2306
2307	;;
2308	; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2309	;
2310	; @param A0 FPU context (fxsave).
2311	; @param A1 Pointer to a IEMFPURESULT for the output.
2312	; @param A2 Pointer to the 64-bit floating point value to convert.
2313	;
2314	BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
2315	PROLOGUE_3_ARGS
2316	sub xSP, 20h ; scratch space for the FPU environment image used by the macro below
2317	fninit ; start from a freshly initialized FPU like the other converters, so stale tags or pending exceptions cannot affect the load
2318	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2319	fld qword [A2]
2320
2321	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared
2322	fnclex ; clear pending exceptions ahead of the store below
2323	fstp tword [A1 + IEMFPURESULT.r80Result]
2324
2325	fninit ; reset the FPU again before returning
2326	add xSP, 20h
2327	EPILOGUE_3_ARGS
2328	ENDPROC iemAImpl_fld_r64_to_r80
2329
2330
2331	;;
2332	; Store a 80-bit floating point value (register) as a 64-bit one (memory).
2333	;
2334	; @param A0 FPU context (fxsave).
2335	; @param A1 Where to return the output FSW.
2336	; @param A2 Where to store the 64-bit value.
2337	; @param A3 Pointer to the 80-bit value.
2338	;
2339	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
2340	PROLOGUE_4_ARGS
2341	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2342
2343	fninit ; reset the FPU to its initial state
2344	fld tword [A3] ; ST0 = the 80-bit source value
2345	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2346	fst qword [A2] ; store ST0 as a 64-bit value (no pop; the fninit below empties the stack)
2347
2348	fnstsw word [A1] ; return the status word produced by the store
2349
2350	fninit ; reset the FPU again before returning
2351	add xSP, 20h ; release the scratch space
2352	EPILOGUE_4_ARGS
2353	ENDPROC iemAImpl_fst_r80_to_r64
2354
2355
2356	;;
2357	; FPU instruction working on one 80-bit and one 64-bit floating point value.
2358	;
2359	; @param 1 The instruction
2360	;
2361	; @param A0 FPU context (fxsave).
2362	; @param A1 Pointer to a IEMFPURESULT for the output.
2363	; @param A2 Pointer to the 80-bit value.
2364	; @param A3 Pointer to the 64-bit value.
2365	;
2366	%macro IEMIMPL_FPU_R80_BY_R64 1
2367	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2368	PROLOGUE_4_ARGS
2369	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2370
2371	fninit ; reset the FPU to its initial state
2372	fld tword [A2] ; ST0 = the 80-bit operand
2373	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2374	%1 qword [A3] ; ST0 = ST0 <op> 64-bit memory operand
2375
2376	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
2377	fnclex ; clear exception flags before the store below
2378	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
2379
2380	fninit ; reset the FPU again before returning
2381	add xSP, 20h ; release the scratch space
2382	EPILOGUE_4_ARGS
2383	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2384	%endmacro
2385
2386	IEMIMPL_FPU_R80_BY_R64 fadd ; instantiates iemAImpl_fadd_r80_by_r64
2387	IEMIMPL_FPU_R80_BY_R64 fmul ; instantiates iemAImpl_fmul_r80_by_r64
2388	IEMIMPL_FPU_R80_BY_R64 fsub ; instantiates iemAImpl_fsub_r80_by_r64
2389	IEMIMPL_FPU_R80_BY_R64 fsubr ; instantiates iemAImpl_fsubr_r80_by_r64
2390	IEMIMPL_FPU_R80_BY_R64 fdiv ; instantiates iemAImpl_fdiv_r80_by_r64
2391	IEMIMPL_FPU_R80_BY_R64 fdivr ; instantiates iemAImpl_fdivr_r80_by_r64
2392
2393	;;
2394	; FPU instruction working on one 80-bit and one 64-bit floating point value,
2395	; only returning FSW.
2396	;
2397	; @param 1 The instruction
2398	;
2399	; @param A0 FPU context (fxsave).
2400	; @param A1 Where to store the output FSW.
2401	; @param A2 Pointer to the 80-bit value.
2402	; @param A3 Pointer to the 64-bit value.
2403	;
2404	%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
2405	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2406	PROLOGUE_4_ARGS
2407	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2408
2409	fninit ; reset the FPU to its initial state
2410	fld tword [A2] ; ST0 = the 80-bit operand
2411	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2412	%1 qword [A3] ; execute the instruction against the 64-bit memory operand
2413
2414	fnstsw word [A1] ; the status word is the only output
2415
2416	fninit ; reset the FPU again before returning
2417	add xSP, 20h ; release the scratch space
2418	EPILOGUE_4_ARGS
2419	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2420	%endmacro
2421
2422	IEMIMPL_FPU_R80_BY_R64_FSW fcom ; instantiates iemAImpl_fcom_r80_by_r64
2423
2424
2425
2426	;
2427	;---------------------- 80-bit floating point operations ----------------------
2428	;
2429
2430	;;
2431	; Loads a 80-bit floating point register value from memory.
2432	;
2433	; @param A0 FPU context (fxsave).
2434	; @param A1 Pointer to a IEMFPURESULT for the output.
2435	; @param A2 Pointer to the 80-bit floating point value to load.
2436	;
2437	BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
2438	PROLOGUE_3_ARGS
2439	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2440
2441	fninit ; reset the FPU to its initial state
2442	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2443	fld tword [A2] ; ST0 = the 80-bit value from memory
2444
2445	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word produced by the load
2446	fnclex ; clear exception flags before the store below
2447	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the loaded value
2448
2449	fninit ; reset the FPU again before returning
2450	add xSP, 20h ; release the scratch space
2451	EPILOGUE_3_ARGS
2452	ENDPROC iemAImpl_fld_r80_from_r80
2453
2454
2455	;;
2456	; Store a 80-bit floating point register to memory
2457	;
2458	; @param A0 FPU context (fxsave).
2459	; @param A1 Where to return the output FSW.
2460	; @param A2 Where to store the 80-bit value.
2461	; @param A3 Pointer to the 80-bit register value.
2462	;
2463	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
2464	PROLOGUE_4_ARGS
2465	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2466
2467	fninit ; reset the FPU to its initial state
2468	fld tword [A3] ; ST0 = the 80-bit register value
2469	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2470	fstp tword [A2] ; store ST0 to the 80-bit destination and pop
2471
2472	fnstsw word [A1] ; return the status word produced by the store
2473
2474	fninit ; reset the FPU again before returning
2475	add xSP, 20h ; release the scratch space
2476	EPILOGUE_4_ARGS
2477	ENDPROC iemAImpl_fst_r80_to_r80
2478
2479
2480	;;
2481	; FPU instruction working on two 80-bit floating point values.
2482	;
2483	; @param 1 The instruction
2484	; @param 2 The operand list to give the instruction ({} when it takes none).
2485	; @param A0 FPU context (fxsave).
2486	; @param A1 Pointer to a IEMFPURESULT for the output.
2487	; @param A2 Pointer to the first 80-bit value (ST0)
2488	; @param A3 Pointer to the second 80-bit value (STn).
2489	;
2490	%macro IEMIMPL_FPU_R80_BY_R80 2
2491	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2492	PROLOGUE_4_ARGS
2493	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2494
2495	fninit ; reset the FPU to its initial state
2496	fld tword [A3] ; pushed first, so it ends up in ST1
2497	fld tword [A2] ; pushed last, so it ends up in ST0
2498	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2499	%1 %2 ; execute the instruction with the supplied operand list
2500
2501	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
2502	fnclex ; clear exception flags before the store below
2503	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out ST0 as the result
2504
2505	fninit ; reset the FPU again before returning
2506	add xSP, 20h ; release the scratch space
2507	EPILOGUE_4_ARGS
2508	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2509	%endmacro
2510
2511	IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1} ; instantiates iemAImpl_fadd_r80_by_r80
2512	IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1} ; instantiates iemAImpl_fmul_r80_by_r80
2513	IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1} ; instantiates iemAImpl_fsub_r80_by_r80
2514	IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1} ; instantiates iemAImpl_fsubr_r80_by_r80
2515	IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1} ; instantiates iemAImpl_fdiv_r80_by_r80
2516	IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1} ; instantiates iemAImpl_fdivr_r80_by_r80
2517	IEMIMPL_FPU_R80_BY_R80 fprem, {} ; no explicit operands
2518	IEMIMPL_FPU_R80_BY_R80 fprem1, {} ; no explicit operands
2519	IEMIMPL_FPU_R80_BY_R80 fscale, {} ; no explicit operands
2520
2521
2522	;;
2523	; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
2524	; storing the result in ST1 and popping the stack.
2525	;
2526	; @param 1 The instruction
2527	;
2528	; @param A0 FPU context (fxsave).
2529	; @param A1 Pointer to a IEMFPURESULT for the output.
2530	; @param A2 Pointer to the first 80-bit value (ST1).
2531	; @param A3 Pointer to the second 80-bit value (ST0).
2532	;
2533	%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
2534	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2535	PROLOGUE_4_ARGS
2536	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2537
2538	fninit ; reset the FPU to its initial state
2539	fld tword [A2] ; pushed first, so it ends up in ST1
2540	fld tword [A3] ; pushed last, so it ends up in ST0
2541	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2542	%1 ; result goes to ST1 and the stack is popped, leaving it in ST0
2543
2544	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
2545	fnclex ; clear exception flags before the store below
2546	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the result
2547
2548	fninit ; reset the FPU again before returning
2549	add xSP, 20h ; release the scratch space
2550	EPILOGUE_4_ARGS
2551	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2552	%endmacro
2553
2554	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan ; instantiates iemAImpl_fpatan_r80_by_r80
2555	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x ; instantiates iemAImpl_fyl2x_r80_by_r80
2556	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1 ; instantiates iemAImpl_fyl2xp1_r80_by_r80
2557
2558
2559	;;
2560	; FPU instruction working on two 80-bit floating point values, only
2561	; returning FSW.
2562	;
2563	; @param 1 The instruction
2564	;
2565	; @param A0 FPU context (fxsave).
2566	; @param A1 Pointer to a uint16_t for the resulting FSW.
2567	; @param A2 Pointer to the first 80-bit value.
2568	; @param A3 Pointer to the second 80-bit value.
2569	;
2570	%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
2571	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2572	PROLOGUE_4_ARGS
2573	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2574
2575	fninit ; reset the FPU to its initial state
2576	fld tword [A3] ; second value, ends up in ST1
2577	fld tword [A2] ; first value, ends up in ST0
2578	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2579	%1 st0, st1 ; compare; the outcome is reported through FSW only
2580
2581	fnstsw word [A1] ; the status word is the only output
2582
2583	fninit ; reset the FPU again before returning
2584	add xSP, 20h ; release the scratch space
2585	EPILOGUE_4_ARGS
2586	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2587	%endmacro
2588
2589	IEMIMPL_FPU_R80_BY_R80_FSW fcom ; instantiates iemAImpl_fcom_r80_by_r80
2590	IEMIMPL_FPU_R80_BY_R80_FSW fucom ; instantiates iemAImpl_fucom_r80_by_r80
2591
2592
2593	;;
2594	; FPU instruction working on two 80-bit floating point values,
2595	; returning FSW and EFLAGS (eax).
2596	;
2597	; @param 1 The instruction
2598	;
2599	; @returns EFLAGS in EAX.
2600	; @param A0 FPU context (fxsave).
2601	; @param A1 Pointer to a uint16_t for the resulting FSW.
2602	; @param A2 Pointer to the first 80-bit value.
2603	; @param A3 Pointer to the second 80-bit value.
2604	;
2605	%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
2606	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2607	PROLOGUE_4_ARGS
2608	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2609
2610	fninit ; reset the FPU to its initial state
2611	fld tword [A3] ; second value, ends up in ST1
2612	fld tword [A2] ; first value, ends up in ST0
2613	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2614	%1 st1 ; compare ST0 with ST1; the outcome lands in the CPU flags
2615
2616	fnstsw word [A1] ; return the status word through A1
2617	pushf ; copy the flags register ...
2618	pop xAX ; ... into xAX, which is the function's return value
2619
2620	fninit ; reset the FPU again before returning (xAX is not touched)
2621	add xSP, 20h ; release the scratch space
2622	EPILOGUE_4_ARGS
2623	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2624	%endmacro
2625
2626	IEMIMPL_FPU_R80_BY_R80_EFL fcomi ; instantiates iemAImpl_fcomi_r80_by_r80
2627	IEMIMPL_FPU_R80_BY_R80_EFL fucomi ; instantiates iemAImpl_fucomi_r80_by_r80
2628
2629
2630	;;
2631	; FPU instruction working on one 80-bit floating point value.
2632	;
2633	; @param 1 The instruction
2634	;
2635	; @param A0 FPU context (fxsave).
2636	; @param A1 Pointer to a IEMFPURESULT for the output.
2637	; @param A2 Pointer to the 80-bit value.
2638	;
2639	%macro IEMIMPL_FPU_R80 1
2640	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
2641	PROLOGUE_3_ARGS
2642	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2643
2644	fninit ; reset the FPU to its initial state
2645	fld tword [A2] ; ST0 = the 80-bit operand
2646	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2647	%1 ; operate on ST0 in place
2648
2649	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
2650	fnclex ; clear exception flags before the store below
2651	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out ST0 as the result
2652
2653	fninit ; reset the FPU again before returning
2654	add xSP, 20h ; release the scratch space
2655	EPILOGUE_3_ARGS
2656	ENDPROC iemAImpl_ %+ %1 %+ _r80
2657	%endmacro
2658
2659	IEMIMPL_FPU_R80 fchs ; instantiates iemAImpl_fchs_r80
2660	IEMIMPL_FPU_R80 fabs ; instantiates iemAImpl_fabs_r80
2661	IEMIMPL_FPU_R80 f2xm1 ; instantiates iemAImpl_f2xm1_r80
2662	IEMIMPL_FPU_R80 fsqrt ; instantiates iemAImpl_fsqrt_r80
2663	IEMIMPL_FPU_R80 frndint ; instantiates iemAImpl_frndint_r80
2664	IEMIMPL_FPU_R80 fsin ; instantiates iemAImpl_fsin_r80
2665	IEMIMPL_FPU_R80 fcos ; instantiates iemAImpl_fcos_r80
2666
2667
2668	;;
2669	; FPU instruction working on one 80-bit floating point value, only
2670	; returning FSW.
2671	;
2672	; @param 1 The instruction
2673	;
2674	; @param A0 FPU context (fxsave).
2675	; @param A1 Pointer to a uint16_t for the resulting FSW.
2676	; @param A2 Pointer to the 80-bit value.
2677	;
2678	%macro IEMIMPL_FPU_R80_FSW 1
2679	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
2680	PROLOGUE_3_ARGS
2681	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2682
2683	fninit ; reset the FPU to its initial state
2684	fld tword [A2] ; ST0 = the 80-bit value to examine
2685	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2686	%1 ; examine ST0; the outcome is reported through FSW only
2687
2688	fnstsw word [A1] ; the status word is the only output
2689
2690	fninit ; reset the FPU again before returning
2691	add xSP, 20h ; release the scratch space
2692	EPILOGUE_3_ARGS
2693	ENDPROC iemAImpl_ %+ %1 %+ _r80
2694	%endmacro
2695
2696	IEMIMPL_FPU_R80_FSW ftst ; instantiates iemAImpl_ftst_r80
2697	IEMIMPL_FPU_R80_FSW fxam ; instantiates iemAImpl_fxam_r80
2698
2699
2700
2701	;;
2702	; FPU instruction loading a 80-bit floating point constant.
2703	;
2704	; @param 1 The instruction
2705	;
2706	; @param A0 FPU context (fxsave).
2707	; @param A1 Pointer to a IEMFPURESULT for the output.
2708	;
2709	%macro IEMIMPL_FPU_R80_CONST 1
2710	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
2711	PROLOGUE_2_ARGS
2712	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2713
2714	fninit ; reset the FPU to its initial state
2715	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2716	%1 ; push the constant onto the FPU stack (ST0)
2717
2718	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word
2719	fnclex ; clear exception flags before the store below
2720	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the constant
2721
2722	fninit ; reset the FPU again before returning
2723	add xSP, 20h ; release the scratch space
2724	EPILOGUE_2_ARGS
2725	ENDPROC iemAImpl_ %+ %1
2726	%endmacro
2727
2728	IEMIMPL_FPU_R80_CONST fld1 ; instantiates iemAImpl_fld1
2729	IEMIMPL_FPU_R80_CONST fldl2t ; instantiates iemAImpl_fldl2t
2730	IEMIMPL_FPU_R80_CONST fldl2e ; instantiates iemAImpl_fldl2e
2731	IEMIMPL_FPU_R80_CONST fldpi ; instantiates iemAImpl_fldpi
2732	IEMIMPL_FPU_R80_CONST fldlg2 ; instantiates iemAImpl_fldlg2
2733	IEMIMPL_FPU_R80_CONST fldln2 ; instantiates iemAImpl_fldln2
2734	IEMIMPL_FPU_R80_CONST fldz ; instantiates iemAImpl_fldz
2735
2736
2737	;;
2738	; FPU instruction working on one 80-bit floating point value, outputing two.
2739	;
2740	; @param 1 The instruction
2741	;
2742	; @param A0 FPU context (fxsave).
2743	; @param A1 Pointer to a IEMFPURESULTTWO for the output.
2744	; @param A2 Pointer to the 80-bit value.
2745	;
2746	%macro IEMIMPL_FPU_R80_R80 1
2747	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
2748	PROLOGUE_3_ARGS
2749	sub xSP, 20h ; reserve 32 bytes of stack scratch space
2750
2751	fninit ; reset the FPU to its initial state
2752	fld tword [A2] ; ST0 = the 80-bit input value
2753	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies FCW/FSW from the context in A0 - confirm
2754	%1 ; produces two values on the FPU stack
2755
2756	fnstsw word [A1 + IEMFPURESULTTWO.FSW] ; capture the resulting status word
2757	fnclex ; clear exception flags before the store below
2758	fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack is the second result
2759	fnclex ; clear exception flags again before the next store
2760	fstp tword [A1 + IEMFPURESULTTWO.r80Result1] ; the value beneath it is the first result
2761
2762	fninit ; reset the FPU again before returning
2763	add xSP, 20h ; release the scratch space
2764	EPILOGUE_3_ARGS
2765	ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
2766	%endmacro
2767
2768	IEMIMPL_FPU_R80_R80 fptan ; instantiates iemAImpl_fptan_r80_r80
2769	IEMIMPL_FPU_R80_R80 fxtract ; instantiates iemAImpl_fxtract_r80_r80
2770	IEMIMPL_FPU_R80_R80 fsincos ; instantiates iemAImpl_fsincos_r80_r80
2771
2772
2773
2774
2775	;---------------------- SSE and MMX Operations ----------------------
2776
2777	;; @todo what do we need to do for MMX? (Both MMX hooks below currently expand to nothing.)
2778	%macro IEMIMPL_MMX_PROLOGUE 0
2779	%endmacro
2780	%macro IEMIMPL_MMX_EPILOGUE 0
2781	%endmacro
2782
2783	;; @todo what do we need to do for SSE? (Both SSE hooks below currently expand to nothing.)
2784	%macro IEMIMPL_SSE_PROLOGUE 0
2785	%endmacro
2786	%macro IEMIMPL_SSE_EPILOGUE 0
2787	%endmacro
2788
2789
2790	;;
2791	; Media instruction working on two full sized registers.
2792	;
2793	; @param 1 The instruction
2794	;
2795	; @param A0 FPU context (fxsave).
2796	; @param A1 Pointer to the first media register size operand (input/output).
2797	; @param A2 Pointer to the second media register size operand (input).
2798	;
2799	%macro IEMIMPL_MEDIA_F2 1
2800	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2801	PROLOGUE_3_ARGS
2802	IEMIMPL_MMX_PROLOGUE
2803
2804	movq mm0, [A1] ; mm0 = first operand (64-bit, also the destination)
2805	movq mm1, [A2] ; mm1 = second operand (64-bit)
2806	%1 mm0, mm1 ; mm0 = mm0 <op> mm1
2807	movq [A1], mm0 ; write the result back over the first operand
2808
2809	IEMIMPL_MMX_EPILOGUE
2810	EPILOGUE_3_ARGS
2811	ENDPROC iemAImpl_ %+ %1 %+ _u64
2812
2813	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2814	PROLOGUE_3_ARGS
2815	IEMIMPL_SSE_PROLOGUE
2816
2817	movdqu xmm0, [A1] ; xmm0 = first operand (128-bit, unaligned load)
2818	movdqu xmm1, [A2] ; xmm1 = second operand (128-bit, unaligned load)
2819	%1 xmm0, xmm1 ; xmm0 = xmm0 <op> xmm1
2820	movdqu [A1], xmm0 ; write the result back over the first operand
2821
2822	IEMIMPL_SSE_EPILOGUE
2823	EPILOGUE_3_ARGS
2824	ENDPROC iemAImpl_ %+ %1 %+ _u128
2825	%endmacro
2826
2827	IEMIMPL_MEDIA_F2 pxor ; instantiates iemAImpl_pxor_u64 and iemAImpl_pxor_u128
2828	IEMIMPL_MEDIA_F2 pcmpeqb ; instantiates iemAImpl_pcmpeqb_u64 and iemAImpl_pcmpeqb_u128
2829	IEMIMPL_MEDIA_F2 pcmpeqw ; instantiates iemAImpl_pcmpeqw_u64 and iemAImpl_pcmpeqw_u128
2830	IEMIMPL_MEDIA_F2 pcmpeqd ; instantiates iemAImpl_pcmpeqd_u64 and iemAImpl_pcmpeqd_u128
2831
2832
2833	;;
2834	; Media instruction working on one full sized and one half sized register (lower half).
2835	;
2836	; @param 1 The instruction
2837	; @param 2 1 if MMX is included, 0 if not.
2838	;
2839	; @param A0 FPU context (fxsave).
2840	; @param A1 Pointer to the first full sized media register operand (input/output).
2841	; @param A2 Pointer to the second half sized media register operand (input).
2842	;
2843	%macro IEMIMPL_MEDIA_F1L1 2
2844	%if %2 != 0
2845	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2846	PROLOGUE_3_ARGS
2847	IEMIMPL_MMX_PROLOGUE
2848
2849	movq mm0, [A1] ; mm0 = full 64-bit first operand (also the destination)
2850	movd mm1, [A2] ; mm1 = only the low 32 bits of the second operand
2851	%1 mm0, mm1 ; mm0 = mm0 <op> mm1
2852	movq [A1], mm0 ; write the result back over the first operand
2853
2854	IEMIMPL_MMX_EPILOGUE
2855	EPILOGUE_3_ARGS
2856	ENDPROC iemAImpl_ %+ %1 %+ _u64
2857	%endif
2858
2859	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2860	PROLOGUE_3_ARGS
2861	IEMIMPL_SSE_PROLOGUE
2862
2863	movdqu xmm0, [A1] ; xmm0 = full 128-bit first operand (also the destination)
2864	movq xmm1, [A2] ; xmm1 = only the low 64 bits of the second operand
2865	%1 xmm0, xmm1 ; xmm0 = xmm0 <op> xmm1
2866	movdqu [A1], xmm0 ; write the result back over the first operand
2867
2868	IEMIMPL_SSE_EPILOGUE
2869	EPILOGUE_3_ARGS
2870	ENDPROC iemAImpl_ %+ %1 %+ _u128
2871	%endmacro
2872
2873	IEMIMPL_MEDIA_F1L1 punpcklbw, 1 ; MMX and SSE variants
2874	IEMIMPL_MEDIA_F1L1 punpcklwd, 1 ; MMX and SSE variants
2875	IEMIMPL_MEDIA_F1L1 punpckldq, 1 ; MMX and SSE variants
2876	IEMIMPL_MEDIA_F1L1 punpcklqdq, 0 ; SSE variant only (second macro argument is 0)
2877
2878
2879	;;
2880	; Media instruction working on one full sized and one half sized register (high half).
2881	;
2882	; @param 1 The instruction
2883	; @param 2 1 if MMX is included, 0 if not.
2884	;
2885	; @param A0 FPU context (fxsave).
2886	; @param A1 Pointer to the first full sized media register operand (input/output).
2887	; @param A2 Pointer to the second full sized media register operand, where we
2888	; will only use the upper half (input).
2889	;
2890	%macro IEMIMPL_MEDIA_F1H1 2
2891	%if %2 != 0
2892	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2893	PROLOGUE_3_ARGS
2894	IEMIMPL_MMX_PROLOGUE
2895
2896	movq mm0, [A1] ; mm0 = full 64-bit first operand (also the destination)
2897	movq mm1, [A2] ; mm1 = full 64-bit second operand, so its upper half is available
2898	%1 mm0, mm1 ; mm0 = mm0 <op> mm1
2899	movq [A1], mm0 ; write the result back over the first operand
2900
2901	IEMIMPL_MMX_EPILOGUE
2902	EPILOGUE_3_ARGS
2903	ENDPROC iemAImpl_ %+ %1 %+ _u64
2904	%endif
2905
2906	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2907	PROLOGUE_3_ARGS
2908	IEMIMPL_SSE_PROLOGUE
2909
2910	movdqu xmm0, [A1] ; xmm0 = full 128-bit first operand (also the destination)
2911	movdqu xmm1, [A2] ; xmm1 = full 128-bit second operand, so its upper half is available
2912	%1 xmm0, xmm1 ; xmm0 = xmm0 <op> xmm1
2913	movdqu [A1], xmm0 ; write the result back over the first operand
2914
2915	IEMIMPL_SSE_EPILOGUE
2916	EPILOGUE_3_ARGS
2917	ENDPROC iemAImpl_ %+ %1 %+ _u128
2918	%endmacro
2919
2920	IEMIMPL_MEDIA_F1H1 punpckhbw, 1 ; punpckh* read the HIGH half of the source, so the full source must be loaded (F1H1, not F1L1)
2921	IEMIMPL_MEDIA_F1H1 punpckhwd, 1 ; MMX and SSE variants
2922	IEMIMPL_MEDIA_F1H1 punpckhdq, 1 ; MMX and SSE variants
2923	IEMIMPL_MEDIA_F1H1 punpckhqdq, 0 ; SSE variant only (second macro argument is 0)
2924
2925
2926	;
2927	; Shufflers with evil 8-bit immediates.
2928	;
2929
2930	BEGINPROC_FASTCALL iemAImpl_pshufw, 16
2931	PROLOGUE_4_ARGS
2932	IEMIMPL_MMX_PROLOGUE
2933
2934	movq mm0, [A1] ; mm0 = current destination value
2935	movq mm1, [A2] ; mm1 = source value
2936	lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5, so T0 = A3 * 5 = byte offset of stub number A3
2937	lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
2938	lea T1, [T1 + T0] ; T1 = address of the stub for immediate A3
2939	call T1 ; run 'pshufw mm0, mm1, <A3>' followed by ret
2940	movq [A1], mm0 ; write the shuffled result to the destination
2941
2942	IEMIMPL_MMX_EPILOGUE
2943	EPILOGUE_4_ARGS
2944	%assign bImm 0
2945	%rep 256
2946	.imm %+ bImm:
2947	pshufw mm0, mm1, bImm ; one stub per possible 8-bit immediate
2948	ret
2949	%assign bImm bImm + 1
2950	%endrep
2951	.immEnd: ; 256*5 == 0x500
2952	dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
2953	dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
2954	ENDPROC iemAImpl_pshufw
2955
2956
2957	%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
2958	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
2959	PROLOGUE_4_ARGS
2960	IEMIMPL_SSE_PROLOGUE
2961
2962	movdqu xmm0, [A1] ; xmm0 = current destination value
2963	movdqu xmm1, [A2] ; xmm1 = source value
2964	lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
2965	lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
2966	lea T1, [T1 + T0*2] ; T1 = address of the stub for immediate A3 (.imm0 + A3 * 6)
2967	call T1 ; run '<insn> xmm0, xmm1, <A3>' followed by ret
2968	movdqu [A1], xmm0 ; write the shuffled result to the destination
2969
2970	IEMIMPL_SSE_EPILOGUE
2971	EPILOGUE_4_ARGS
2972	%assign bImm 0
2973	%rep 256
2974	.imm %+ bImm:
2975	%1 xmm0, xmm1, bImm ; one stub per possible 8-bit immediate
2976	ret
2977	%assign bImm bImm + 1
2978	%endrep
2979	.immEnd: ; 256*6 == 0x600
2980	dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
2981	dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
2982	ENDPROC iemAImpl_ %+ %1
2983	%endmacro
2984
2985	IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw ; instantiates iemAImpl_pshufhw
2986	IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw ; instantiates iemAImpl_pshuflw
2987	IEMIMPL_MEDIA_SSE_PSHUFXX pshufd ; instantiates iemAImpl_pshufd
2988
2989
2990	;
2991	; Move byte mask.
2992	;
2993
2994	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
2995	PROLOGUE_3_ARGS
2996	IEMIMPL_MMX_PROLOGUE
2997
2998	mov T0, [A1] ; preload the current destination value; NOTE(review): pmovmskb below overwrites T0, so this looks redundant - confirm
2999	movq mm1, [A2] ; mm1 = 64-bit source value
3000	pmovmskb T0, mm1 ; T0 = mask built from the top bit of each source byte
3001	mov [A1], T0 ; store the mask to the destination
3002	%ifdef RT_ARCH_X86
3003	mov dword [A1 + 4], 0 ; 32-bit build: also zero the dword at A1 + 4 (presumably the upper half of a 64-bit destination)
3004	%endif
3005	IEMIMPL_MMX_EPILOGUE
3006	EPILOGUE_3_ARGS
3007	ENDPROC iemAImpl_pmovmskb_u64
3008
3009	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
3010	PROLOGUE_3_ARGS
3011	IEMIMPL_SSE_PROLOGUE
3012
3013	mov T0, [A1] ; preload the current destination value; NOTE(review): pmovmskb below overwrites T0, so this looks redundant - confirm
3014	movdqu xmm1, [A2] ; xmm1 = 128-bit source value (unaligned load)
3015	pmovmskb T0, xmm1 ; T0 = mask built from the top bit of each source byte
3016	mov [A1], T0 ; store the mask to the destination
3017	%ifdef RT_ARCH_X86
3018	mov dword [A1 + 4], 0 ; 32-bit build: also zero the dword at A1 + 4 (presumably the upper half of a 64-bit destination)
3019	%endif
3020	IEMIMPL_SSE_EPILOGUE
3021	EPILOGUE_3_ARGS
3022	ENDPROC iemAImpl_pmovmskb_u128
3023

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 66250

Download in other formats: