VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@ 106723

Last change on this file was revision 106723, checked in by vboxsync, 4 weeks ago

VMM/IEM: Unwind info for win.arm64 (attempt at it, anyway). jiraref:1253

1/* $Id: IEMAllN8veEmit-x86.h 106723 2024-10-27 01:07:28Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
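/* Illustrative example (not part of the original source): with cOpBits=32,
   bOpcodeOther=0x03 (ADD Gv,Ev), idxRegReg=1 (ECX) and idxRegRm=8 (R8D), only
   the r/m register needs extending, so the emitter produces 41 03 C8, i.e.
   "add ecx, r8d" (REX.B prefix, opcode, ModR/M with mod=3, reg=001, rm=000). */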
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
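/* Illustrative example (not part of the original source): with bOpcode0=0x0f,
   bOpcodeOther=0xaf (IMUL Gv,Ev), cOpBits=64, idxRegReg=0 (RAX) and
   idxRegRm=1 (RCX), this emits 48 0F AF C1, i.e. "imul rax, rcx". */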
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to be of the /idxRegReg form, i.e. the ModR/M reg field carries an opcode extension.
121 */
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
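/* Illustrative example (not part of the original source): for AND r/m32, imm
   with bOpcode8=0x80, bOpcodeOtherImm8=0x83, bOpcodeOther=0x81, idxRegReg=4
   (the /4 group extension) and idxRegRm=2 (EDX): uImmOp=0x0f takes the
   sign-extended imm8 path and emits 83 E2 0F ("and edx, 0x0f"), while
   uImmOp=0x12345 with cImmBits=32 emits 81 E2 45 23 01 00 ("and edx, 0x12345"). */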
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* Guest Register Load & Store Helpers *
201*********************************************************************************************************************************/
202
203
204/**
205 * Alternative to iemNativeEmitLoadGprWithGstShadowRegEx() and
206 * iemNativeEmitLoadGprWithGstShadowReg() which should be more efficient as it
207 * lets the compiler do the equivalent of the g_aGstShadowInfo lookup.
208 *
209 * @note This does not mark @a idxHstReg as having a shadow copy of @a a_enmGstReg,
210 * that is something the caller needs to do if applicable.
211 */
212template<IEMNATIVEGSTREG const a_enmGstReg>
213DECL_INLINE_THROW(uint32_t) iemNativeEmitLoadGprWithGstRegExT(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t idxHstReg)
214{
215 /* 64-bit registers: */
216 if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Pc)
217 return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.rip));
218 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Rsp)
219 return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.rsp));
220 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_CsBase)
221 return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.cs.u64Base));
222 //else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Cr0)
223 // return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.cr0));
224 //else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Cr4)
225 // return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.cr4));
226 //else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Xcr0)
227 // return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.aXcr[0]));
228
229 /* 32-bit registers: */
230 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_EFlags)
231 return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
232 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_MxCsr)
233 return iemNativeEmitLoadGprFromVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.XState.x87.MXCSR));
234
235 /* 16-bit registers */
236 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_FpuFcw)
237 return iemNativeEmitLoadGprFromVCpuU16Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.XState.x87.FCW));
238 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_FpuFsw)
239 return iemNativeEmitLoadGprFromVCpuU16Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.XState.x87.FSW));
240#if RT_CPLUSPLUS_PREREQ(201700) && !defined(__clang_major__)
241 else
242 {
243 AssertCompile(false);
244 return off;
245 }
246#endif
247}
248
249
250/** See iemNativeEmitLoadGprWithGstRegExT(). */
251template<IEMNATIVEGSTREG const a_enmGstReg>
252DECL_INLINE_THROW(uint32_t) iemNativeEmitLoadGprWithGstRegT(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxHstReg)
253{
254#ifdef RT_ARCH_AMD64
255 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
256#elif defined(RT_ARCH_ARM64)
257 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
258#else
259# error "port me"
260#endif
261 off = iemNativeEmitLoadGprWithGstRegExT<a_enmGstReg>(pCodeBuf, off, idxHstReg);
262 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
263 return off;
264}
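/* Usage sketch (hypothetical call site, for illustration only):
       off = iemNativeEmitLoadGprWithGstRegT<kIemNativeGstReg_Pc>(pReNative, off, idxHstReg);
   The template argument resolves to a fixed VMCPU member offset at compile
   time, so no runtime g_aGstShadowInfo lookup is needed. */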
265
266
267/**
268 * Store companion to iemNativeEmitLoadGprWithGstRegExT().
269 */
270template<IEMNATIVEGSTREG const a_enmGstReg>
271DECL_INLINE_THROW(uint32_t) iemNativeEmitStoreGprToGstRegExT(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t idxHstReg,
272 uint8_t idxTmpReg = IEMNATIVE_REG_FIXED_TMP0)
273{
274 /* 64-bit registers: */
275 if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Pc)
276 return iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.rip), idxTmpReg);
277 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Rsp)
278 return iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.rsp), idxTmpReg);
279 //else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Cr0)
280 // return iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.cr0), idxTmpReg);
281 //else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Cr4)
282 // return iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.cr4), idxTmpReg);
283 //else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_Xcr0)
284 // return iemNativeEmitStoreGprToVCpuU64Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.aXcr[0]), idxTmpReg);
285 /* 32-bit registers: */
286 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_EFlags)
287 return iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags), idxTmpReg);
288 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_MxCsr)
289 return iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.XState.x87.MXCSR), idxTmpReg);
290 /* 16-bit registers */
291 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_FpuFcw)
292 return iemNativeEmitStoreGprToVCpuU16Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.XState.x87.FCW), idxTmpReg);
293 else if RT_CONSTEXPR_IF(a_enmGstReg == kIemNativeGstReg_FpuFsw)
294 return iemNativeEmitStoreGprToVCpuU16Ex(pCodeBuf, off, idxHstReg, RT_UOFFSETOF(VMCPU, cpum.GstCtx.XState.x87.FSW), idxTmpReg);
295#if RT_CPLUSPLUS_PREREQ(201700) && !defined(__clang_major__)
296 else
297 {
298 AssertCompile(false);
299 return off;
300 }
301#endif
302}
303
304
305/** See iemNativeEmitLoadGprWithGstRegExT(). */
306template<IEMNATIVEGSTREG const a_enmGstReg>
307DECL_INLINE_THROW(uint32_t) iemNativeEmitStoreGprToGstRegT(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxHstReg)
308{
309#ifdef RT_ARCH_AMD64
310 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
311#elif defined(RT_ARCH_ARM64)
312 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
313#else
314# error "port me"
315#endif
316 off = iemNativeEmitStoreGprToGstRegExT<a_enmGstReg>(pCodeBuf, off, idxHstReg);
317 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
318 return off;
319}
320
321
322
323/*********************************************************************************************************************************
324* EFLAGS *
325*********************************************************************************************************************************/
326
327#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
328
329/** @def IEMNATIVE_POSTPONING_REG_MASK
330 * Register suitable for keeping the inputs or result for a postponed EFLAGS
331 * calculation.
332 *
333 * We use non-volatile registers here so we don't have to save & restore them
334 * across callouts (i.e. TLB loads).
335 *
336 * @note On x86 we cannot use RDI and RSI because these are used by the
337 * opcode checking code. The usual joy of the x86 instruction set.
338 */
339# ifdef RT_ARCH_AMD64
340# define IEMNATIVE_POSTPONING_REG_MASK \
341 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
342# else
343# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
344# endif
345
346/**
347 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
348 */
349template<uint32_t const a_fEflClobbered>
350DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
351{
352 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
353 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
354 if (fEFlags)
355 {
356 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
357 {
358 fEFlags &= ~a_fEflClobbered;
359 if (!fEFlags)
360 { /* likely */ }
361 else
362 {
363 Log5(("EFLAGS: Clobbering %#x: %#x -> %#x (op=%d bits=%u) - iemNativeClearPostponedEFlags\n", a_fEflClobbered,
364 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
365 pReNative->PostponedEfl.fEFlags = fEFlags;
366 return;
367 }
368 }
369
370 /* Do cleanup. */
371 Log5(("EFLAGS: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x - iemNativeClearPostponedEFlags\n",
372 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
373 pReNative->PostponedEfl.fEFlags = 0;
374 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
375 pReNative->PostponedEfl.cOpBits = 0;
376 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
377 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
378 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
379 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
380 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
381# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
382 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
383 pReNative->PostponedEfl.cEmits = 0;
384# endif
385 }
386}
387
388#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
389
390
391template<bool const a_fDoOp>
392DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
393 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
394{
395#ifdef RT_ARCH_AMD64
396 /* Do TEST idxRegResult, idxRegResult to set flags. */
397 if RT_CONSTEXPR_IF(a_fDoOp)
398 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
399
400 /*
401 * Collect the EFLAGS status bits.
402 * We know that the overflow bit will always be cleared, so LAHF can be used.
403 */
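/* Note (added for clarity): LAHF loads AH with the low FLAGS byte in the
   layout SF:ZF:0:AF:0:PF:1:CF, i.e. every status bit except OF; OF is known
   to be zero for these operations and is cleared in the EFLAGS copy by the
   BTR further down. */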
404 if (idxRegTmp == X86_GREG_xAX)
405 {
406 /* lahf ; AH = EFLAGS */
407 pCodeBuf[off++] = 0x9f;
408 if (idxRegEfl <= X86_GREG_xBX)
409 {
410 /* mov [CDB]L, AH */
411 pCodeBuf[off++] = 0x88;
412 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
413 }
414 else
415 {
416 /* mov AL, AH */
417 pCodeBuf[off++] = 0x88;
418 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
419 /* mov xxL, AL */
420 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
421 pCodeBuf[off++] = 0x88;
422 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
423 }
424 }
425 else if (idxRegEfl != X86_GREG_xAX)
426 {
427# if 1 /* This is 1 or 4 bytes larger, but avoids the stack. */
428 /* xchg rax, tmp */
429 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
430 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
431
432 /* lahf ; AH = EFLAGS */
433 pCodeBuf[off++] = 0x9f;
434 if (idxRegEfl <= X86_GREG_xBX)
435 {
436 /* mov [CDB]L, AH */
437 pCodeBuf[off++] = 0x88;
438 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
439 }
440 else
441 {
442 /* mov AL, AH */
443 pCodeBuf[off++] = 0x88;
444 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
445 /* mov xxL, AL */
446 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
447 pCodeBuf[off++] = 0x88;
448 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
449 }
450
451 /* xchg rax, tmp */
452 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
453 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
454
455# else
456 /* pushf */
457 pCodeBuf[off++] = 0x9c;
458 /* pop tmp */
459 if (idxRegTmp >= 8)
460 pCodeBuf[off++] = X86_OP_REX_B;
461 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
462 /* mov byte(efl), byte(tmp) */
463 if (idxRegEfl >= 4 || idxRegTmp >= 4)
464 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
465 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
466 pCodeBuf[off++] = 0x88;
467 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
468# endif
469 }
470 else
471 {
472 /* xchg al, ah */
473 pCodeBuf[off++] = 0x86;
474 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
475 /* lahf ; AH = EFLAGS */
476 pCodeBuf[off++] = 0x9f;
477 /* xchg al, ah */
478 pCodeBuf[off++] = 0x86;
479 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
480 }
481 /* BTR idxEfl, 11; Clear OF */
482 if (idxRegEfl >= 8)
483 pCodeBuf[off++] = X86_OP_REX_B;
484 pCodeBuf[off++] = 0xf;
485 pCodeBuf[off++] = 0xba;
486 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
487 pCodeBuf[off++] = X86_EFL_OF_BIT;
488
489#elif defined(RT_ARCH_ARM64)
490 /*
491 * Calculate flags.
492 */
493 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
494 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
495 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
496
497 /* N,Z -> SF,ZF */
498 if (cOpBits < 32)
499 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
500 else if RT_CONSTEXPR_IF(a_fDoOp)
501 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
502 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
503 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
504 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
505 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
506
507 /* Calculate 8-bit parity of the result. */
508 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
509 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
510 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
511 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
512 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
513 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
514 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
515 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
516 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
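/* Worked example (illustrative): if the low byte of the result is 0xB2
   (1011'0010b, four bits set), the three shift/EOR steps fold the byte so
   that bit 0 of idxRegTmp becomes the XOR of bits 0..7, here 0; the final
   EOR with 1 inverts it, giving PF=1 for even parity as x86 defines it. */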
517
518#else
519# error "port me"
520#endif
521 return off;
522}
523
524#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
525
526template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
527static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
528 uint32_t bmExtraTlbMissRegs = 0)
529{
530# ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
531 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
532 pReNative->PostponedEfl.cEmits);
533# endif
534
535 /*
536 * In the TB exit code path we cannot do regular register allocation. Nor
537 * can we when we're in the TLB miss code, unless we're skipping the TLB
538 * lookup. Since the latter isn't an important usecase and should get along
539 * fine on just volatile registers, we do not need to do anything special
540 * for it.
541 *
542 * So, we do our own register allocating here. Any register goes in the TB
543 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
544 * In the TLB miss we can use any volatile register and temporary registers
545 * allocated in the TLB state.
546 *
547 * Note! On x86 we prefer using RAX as the first TMP register, so we can
548 * make use of LAHF which is typically faster than PUSHF/POP. This
549 * is why the idxRegTmp allocation is first when there is no EFLAG
550 * shadow, since RAX is represented by bit 0 in the mask.
551 */
552 uint32_t bmAvailableRegs;
553 if RT_CONSTEXPR_IF(!a_fTlbMiss)
554 {
555 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
556 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
557 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
558 else
559 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
560 }
561 else
562 {
563 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
564 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
565 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
566 & IEMNATIVE_HST_GREG_MASK;
567 }
568
569 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
570 need to weed out volatile registers here, as they will no longer be valid. */
571 uint8_t idxRegTmp;
572 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
573 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
574 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
575 {
576 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
577 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
578 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
579 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
580# ifdef VBOX_STRICT
581 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
582# endif
583
584 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
585 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
586 }
587 else
588 {
589 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
590 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
591
592 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
593 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
594 off = iemNativeEmitLoadGprWithGstRegExT<kIemNativeGstReg_EFlags>(pCodeBuf, off, idxRegEfl);
595 }
596 Assert(bmAvailableRegs != 0);
597
598 /*
599 * Do the actual EFLAGS calculation.
600 */
601 switch (pReNative->PostponedEfl.enmOp)
602 {
603 case kIemNativePostponedEflOp_Logical:
604 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
605 off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
606 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
607 break;
608
609 default:
610 AssertFailedBreak();
611 }
612
613 /*
614 * Store EFLAGS.
615 */
616# ifdef VBOX_STRICT
617 /* check that X86_EFL_1 is set. */
618 uint32_t offFixup1;
619 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
620 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
621 iemNativeFixupFixedJump(pReNative, offFixup1, off);
622 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
623 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK, idxRegTmp);
624 uint32_t const offFixup2 = off;
625 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
626 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
627 iemNativeFixupFixedJump(pReNative, offFixup2, off);
628# endif
629 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
630 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
631
632# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
633 pReNative->PostponedEfl.cEmits++;
634# endif
635 return off;
636}
637
638
639
640template<uint32_t const a_bmInputRegs>
641DECL_FORCE_INLINE_THROW(uint32_t)
642iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
643{
644 if (pReNative->PostponedEfl.fEFlags)
645 {
646 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
647 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
648 }
649 return off;
650}
651
652
653template<uint32_t const a_bmInputRegs>
654DECL_FORCE_INLINE_THROW(uint32_t)
655iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
656{
657 if (pReNative->PostponedEfl.fEFlags)
658 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
659 return off;
660}
661
662
663template<uint32_t const a_bmInputRegs>
664DECL_FORCE_INLINE_THROW(uint32_t)
665iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
666 uint32_t bmTmpRegs)
667{
668 if (pReNative->PostponedEfl.fEFlags)
669 {
670 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
671 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
672 pTlbState->getRegsNotToSave() | bmTmpRegs);
673 }
674 return off;
675}
676
677#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
678
679
680/**
681 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
682 *
683 * It takes the liveness information into account.
684 */
685template<bool a_fNeedToSetFlags>
686DECL_INLINE_THROW(uint32_t)
687iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
688 uint8_t cOpBits, uint8_t idxRegResult)
689{
690 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
691 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
692 RT_NOREF(cOpBits, idxRegResult);
693
694#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
695 /*
696 * See if we can skip this wholesale.
697 */
698 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
699 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
700 & IEMLIVENESSBIT_STATUS_EFL_MASK;
701# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
702 uint64_t fEflPostponing;
703# endif
704 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
705 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
706 {
707 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
708 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
709# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
710 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
711# endif
712 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS));
713 return off;
714 }
715# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
716 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
717 | fEflClobbered)
718 == IEMLIVENESSBIT_STATUS_EFL_MASK
719 && idxRegResult != UINT8_MAX)
720 {
721 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
722 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
723 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
724 pReNative->PostponedEfl.cOpBits = cOpBits;
725 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpExPreferNonVolatile(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK);
726 /** @todo it would normally be possible to use idxRegResult, iff it is
727 * already a non-volatile register and we can be sure the caller
728 * doesn't modify it. That'll save a register move and allocation. */
729 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
730 Log5(("EFLAGS: Postponing %#x op=%u bits=%u reg1=%u - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS,
731 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
732 }
733# endif
734 else
735#endif
736 {
737 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
738 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
739#ifdef RT_ARCH_AMD64
740 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
741#elif defined(RT_ARCH_ARM64)
742 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
743#else
744# error "port me"
745#endif
746 off = iemNativeEmitPostponedEFlagsCalcLogical<a_fNeedToSetFlags>(pCodeBuf, off, cOpBits, idxRegResult,
747 idxRegEfl, idxRegTmp);
748 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
749
750 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
751 iemNativeRegFreeTmp(pReNative, idxRegTmp);
752 }
753
754#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
755 if (pReNative->fSkippingEFlags)
756 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForLogical)\n", pReNative->fSkippingEFlags));
757 pReNative->fSkippingEFlags = 0;
758# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
759 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
760# endif
761#endif
762 return off;
763}
764
765
766/**
767 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
768 *
769 * It takes the liveness information into account.
770 */
771DECL_FORCE_INLINE_THROW(uint32_t)
772iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
773#ifndef RT_ARCH_AMD64
774 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
775 , bool fInvertCarry, uint64_t uImmSrc
776#endif
777 )
778{
779 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
780 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
781
782#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
783 /*
784 * See if we can skip this wholesale.
785 */
786 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
787 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
788 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
789 {
790 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
791 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
792 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForArithmetic\n", X86_EFL_STATUS_BITS));
793# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
794 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
795# endif
796 }
797 else
798#endif
799 {
800#ifdef RT_ARCH_AMD64
801 /*
802 * Collect flags and merge them with eflags.
803 */
804 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
805 /* pushf - do this before any reg allocations as they may emit instructions too. */
806 pCodeBuf[off++] = 0x9c;
807
808 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
809 : iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
810 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
811 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
812 /* pop tmp */
813 if (idxTmpReg >= 8)
814 pCodeBuf[off++] = X86_OP_REX_B;
815 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
816 /* Isolate the flags we want. */
817 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
818 /* Clear the status bits in EFLs. */
819 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
820 /* OR in the flags we collected. */
821 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
822 if (idxRegEflIn != idxRegEfl)
823 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
824 iemNativeRegFreeTmp(pReNative, idxTmpReg);
825
826#elif defined(RT_ARCH_ARM64)
827 /*
828 * Calculate flags.
829 */
830 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
831 : iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
832 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
833 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
834 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
835
836 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
837 if (fInvertCarry)
838 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
839 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
840
841 if (cOpBits >= 32)
842 {
843 /* V -> OF */
844 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
845 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
846
847 /* C -> CF */
848 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
849 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
850 }
851
852 /* N,Z -> SF,ZF */
853 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
854 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
855
856 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
857 if (cOpBits < 32)
858 {
859 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
860 AssertCompile(X86_EFL_CF_BIT == 0);
861 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
862
863 /* The overflow flag is more work as we have to compare the sign bits for
864 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
865
866 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
867 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
868
869 It is a bit simpler when the right (source) side is constant:
870 adc: S D R -> OF sbb: S D R -> OF
871 0 0 0 -> 0 \ 0 0 0 -> 0 \
872 0 0 1 -> 1 \ 0 0 1 -> 0 \
873 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
874 0 1 1 -> 0 / 0 1 1 -> 0 /
875 1 0 0 -> 0 \ 1 0 0 -> 0 \
876 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
877 1 1 0 -> 1 / 1 1 0 -> 0 /
878 1 1 1 -> 0 / 1 1 1 -> 0 / */
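/* Worked example (illustrative, 8-bit add): 0x70 + 0x70 = 0xE0. The sign bits
   are Dst=0, Src=0, Result=1, so ~(Dst ^ Src) & (Result ^ Dst) is set at the
   sign position and OF=1, as expected when two positive values sum negative. */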
879 if (idxRegSrc != UINT8_MAX)
880 {
881 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
882 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
883 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
884 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
885 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
886 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
887 }
888 else if (uImmSrc & RT_BIT_64(cOpBits - 1))
889 {
890 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
891 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
892 else
893 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
894 }
895 else
896 {
897 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
898 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
899 else
900 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
901 }
902 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
903 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
904 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
905 }
906
907 /* Calculate 8-bit parity of the result. */
908 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
909 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
910 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
911 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
912 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
913 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
914 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
915 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
916 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
917
918 /* Calculate auxiliary carry/borrow. This is related to 8-bit BCD.
919 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
920 S D R
921 0 0 0 -> 0; \
922 0 0 1 -> 1; \ regular
923 0 1 0 -> 1; / xor R, D
924 0 1 1 -> 0; /
925 1 0 0 -> 1; \
926 1 0 1 -> 0; \ invert one of the two
927 1 1 0 -> 0; / xor not(R), D
928 1 1 1 -> 1; /
929 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
930 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
931 */
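/* Worked example (illustrative, 8-bit add): 0x0F + 0x01 = 0x10, and
   (Result ^ Src ^ Dst) = 0x10 ^ 0x01 ^ 0x0F = 0x1E, which has bit 4 set,
   so AF=1, reflecting the carry out of the low nibble. */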
932
933 if (idxRegSrc != UINT8_MAX)
934 {
935 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
936 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
937 }
938 else if (uImmSrc & X86_EFL_AF)
939 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
940 else
941 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
942 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
943 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
944
945 if (idxRegEflIn != idxRegEfl)
946 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
947 iemNativeRegFreeTmp(pReNative, idxTmpReg);
948
949#else
950# error "port me"
951#endif
952 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
953
954#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
955 if (pReNative->fSkippingEFlags)
956 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForArithmetic)\n", pReNative->fSkippingEFlags));
957 pReNative->fSkippingEFlags = 0;
958# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
959 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
960# endif
961#endif
962 }
963 return off;
964
965}
966
967
968
969/*********************************************************************************************************************************
970* Bitwise Logical Operations *
971*********************************************************************************************************************************/
972
973/**
974 * The AND instruction will clear OF, CF and AF (latter is undefined) and
975 * set the other flags according to the result.
976 */
977template<uint8_t const a_cOpBits>
978DECL_INLINE_THROW(uint32_t)
979iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
980{
981 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
982 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
983#ifdef RT_ARCH_AMD64
984 /* On AMD64 we just use the correctly sized AND instruction and harvest the EFLAGS. */
985 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
986 0x22, 0x23, a_cOpBits, idxRegDst, idxRegSrc);
987 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
988 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
989
990 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
991
992#elif defined(RT_ARCH_ARM64)
993 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
994 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
995 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
996 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
997 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
998
999 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1000#else
1001# error "Port me"
1002#endif
1003 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1004 return off;
1005}
1006
1007
1008/**
1009 * The AND instruction with immediate value as right operand.
1010 */
1011template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1012DECL_INLINE_THROW(uint32_t)
1013iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1014{
1015 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1016#ifdef RT_ARCH_AMD64
1017 /* On AMD64 we just use the correctly sized AND instruction and harvest the EFLAGS. */
1018 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1019 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 4, idxRegDst, uImmOp);
1020 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1021
1022 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1023
1024#elif defined(RT_ARCH_ARM64)
1025 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1026 course the immediate variant when possible to save a register load. */
1027 uint32_t uImmSizeLen, uImmRotations;
1028 if ( a_cOpBits > 32
1029 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1030 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1031 {
1032 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1033 if (a_cOpBits >= 32)
1034 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1035 else
1036 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1037 }
1038 else
1039 {
1040 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1041 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1042 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1043 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1044 else
1045 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1046 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1047 }
1048 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1049
1050 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1051
1052#else
1053# error "Port me"
1054#endif
1055 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1056 return off;
1057}
1058
1059
1060/**
1061 * The TEST instruction will clear OF, CF and AF (latter is undefined) and
1062 * set the other flags according to the result.
1063 */
1064template<uint8_t const a_cOpBits>
1065DECL_INLINE_THROW(uint32_t)
1066iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1067{
1068 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1069 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
1070 : iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1071#ifdef RT_ARCH_AMD64
1072 /* On AMD64 we just use the correctly sized TEST instruction and harvest the EFLAGS. */
1073 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1074 0x84, 0x85, a_cOpBits, idxRegSrc, idxRegDst);
1075 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1076
1077#elif defined(RT_ARCH_ARM64)
1078 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
1079 need to keep the result in order to calculate the flags. */
1080 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1081 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1082 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1083 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1084 else
1085 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1086 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1087
1088#else
1089# error "Port me"
1090#endif
1091 if (idxVarSrc != idxVarDst)
1092 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1093 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1094
1095#ifdef RT_ARCH_AMD64
1096 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
1097#else
1098 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1099 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1100 else
1101 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1102 iemNativeRegFreeTmp(pReNative, idxRegResult);
1103#endif
1104 return off;
1105}
1106
1107
1108/**
1109 * The TEST instruction with immediate value as right operand.
1110 */
1111template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1112DECL_INLINE_THROW(uint32_t)
1113iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1114{
1115 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1116#ifdef RT_ARCH_AMD64
1117 /* On AMD64 we just use the correctly sized TEST instruction and harvest the EFLAGS. */
1118 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1119 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
1120 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1121 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1122
1123 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
1124
1125#elif defined(RT_ARCH_ARM64)
1126 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1127 course the immediate variant when possible to save a register load.
1128 We also need to keep the result in order to calculate the flags. */
1129 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1130 uint32_t uImmSizeLen, uImmRotations;
1131 if ( a_cOpBits > 32
1132 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1133 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1134 {
1135 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1136 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1137 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1138 else
1139 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1140 }
1141 else
1142 {
1143 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1144 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1145 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1146 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1147 else
1148 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1149 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1150 }
1151 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1152 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1153
1154 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1155
1156 iemNativeRegFreeTmp(pReNative, idxRegResult);
1157
1158#else
1159# error "Port me"
1160#endif
1161 return off;
1162}
1163
1164
1165/**
1166 * The OR instruction will clear OF, CF and AF (latter is undefined) and
1167 * set the other flags according to the result.
1168 */
1169template<uint8_t const a_cOpBits>
1170DECL_INLINE_THROW(uint32_t)
1171iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1172{
1173 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1174 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1175#ifdef RT_ARCH_AMD64
1176 /* On AMD64 we just use the correctly sized OR instruction and harvest the EFLAGS. */
1177 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1178 0x0a, 0x0b, a_cOpBits, idxRegDst, idxRegSrc);
1179 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1180 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1181
1182 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1183
1184#elif defined(RT_ARCH_ARM64)
1185 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1186 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1187 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1188 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1189 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1190
1191 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1192
1193#else
1194# error "Port me"
1195#endif
1196 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1197 return off;
1198}
1199
1200
1201/**
1202 * The OR instruction with immediate value as right operand.
1203 */
1204template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1205DECL_INLINE_THROW(uint32_t)
1206iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1207{
1208 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1209#ifdef RT_ARCH_AMD64
1210 /* On AMD64 we just use the correctly sized OR instruction and harvest the EFLAGS. */
1211 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1212 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 1, idxRegDst, uImmOp);
1213 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1214
1215 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1216
1217#elif defined(RT_ARCH_ARM64)
1218 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1219 course the immediate variant when possible to save a register load. */
1220 uint32_t uImmSizeLen, uImmRotations;
1221 if ( a_cOpBits > 32
1222 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1223 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1224 {
1225 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1226 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1227 }
1228 else
1229 {
1230 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1231 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1232 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1233 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1234 }
1235 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1236
1237 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1238
1239#else
1240# error "Port me"
1241#endif
1242 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1243 return off;
1244}
1245
1246
1247/**
1248 * The XOR instruction will clear OF, CF and AF (latter is undefined) and
1249 * set the other flags according to the result.
1250 */
1251template<uint8_t const a_cOpBits>
1252DECL_INLINE_THROW(uint32_t)
1253iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1254{
1255 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1256 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1257#ifdef RT_ARCH_AMD64
1258 /* On AMD64 we just use the correctly sized XOR instruction and harvest the EFLAGS. */
1259 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1260 0x32, 0x33, a_cOpBits, idxRegDst, idxRegSrc);
1261 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1262 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1263
1264 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1265
1266#elif defined(RT_ARCH_ARM64)
1267 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones. */
1268 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1269 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1270 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1271 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1272
1273 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1274
1275#else
1276# error "Port me"
1277#endif
1278 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1279 return off;
1280}
1281
1282
1283/**
1284 * The XOR instruction with immediate value as right operand.
1285 */
1286template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1287DECL_INLINE_THROW(uint32_t)
1288iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1289{
1290 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1291#ifdef RT_ARCH_AMD64
1292 /* On AMD64 we just use the correctly sized XOR instruction and harvest the EFLAGS. */
1293 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1294 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 6, idxRegDst, uImmOp);
1295 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1296
1297 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1298
1299#elif defined(RT_ARCH_ARM64)
1300 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones, and of
1301 course the immediate variant when possible to save a register load. */
1302 uint32_t uImmSizeLen, uImmRotations;
1303 if ( a_cOpBits > 32
1304 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1305 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1306 {
1307 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1308 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1309 }
1310 else
1311 {
1312 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1313 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1314 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1315 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1316 }
1317 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1318
1319 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1320
1321#else
1322# error "Port me"
1323#endif
1324 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1325 return off;
1326}
1327
1328
1329
1330/*********************************************************************************************************************************
1331* ADD, ADC, SUB, SBB, CMP *
1332*********************************************************************************************************************************/
1333
1334/**
1335 * The ADD instruction will set all status flags.
1336 */
1337template<uint8_t const a_cOpBits>
1338DECL_INLINE_THROW(uint32_t)
1339iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1340{
1341 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1342 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1343
1344#ifdef RT_ARCH_AMD64
1345 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1346 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1347 0x02, 0x03, a_cOpBits, idxRegDst, idxRegSrc);
1348 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1349
1350 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1351 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1352
1353 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1354
1355#elif defined(RT_ARCH_ARM64)
1356 /* On ARM64 we'll need the two input operands as well as the result in order
1357 to calculate the right flags, even if we use ADDS and translate NZCV into
1358 OF, CF, ZF and SF. */
1359 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1360 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1361 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1362 {
1363 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1364 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1365 }
1366 else
1367 {
1368 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1369 uint32_t const cShift = 32 - a_cOpBits;
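/* Note: ORR with XZR as the first source is a register move, so this copies idxRegDst
shifted left by cShift; the LSR instructions below shift input and result back down. */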
1370 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1371 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1372 true /*fSetFlags*/, cShift);
1373 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1374 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1375 }
1376 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1377
1378 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1379 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1380
1381 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1382 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1383 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1384
1385#else
1386# error "port me"
1387#endif
1388 return off;
1389}
1390
1391
1392/**
1393 * The ADD instruction with immediate value as right operand.
1394 */
1395template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1396DECL_INLINE_THROW(uint32_t)
1397iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1398{
1399 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1400
1401#ifdef RT_ARCH_AMD64
1402 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1403 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1404 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
1405 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1406
1407 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1408
1409 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1410
1411#elif defined(RT_ARCH_ARM64)
1412 /* On ARM64 we'll need the two input operands as well as the result in order
1413 to calculate the right flags, even if we use ADDS and translate NZCV into
1414 OF, CF, ZF and SF. */
1415 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1416 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1417 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1418 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1419 {
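/* Note: the ARM64 ADD/SUB immediate field is 12 bits, optionally shifted left by 12,
so any other constant has to go via a temporary register. */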
1420 if (uImmOp <= 0xfffU)
1421 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1422 true /*fSetFlags*/);
1423 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1424 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1425 true /*fSetFlags*/, true /*fShift12*/);
1426 else
1427 {
1428 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1429 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1430 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1431 true /*fSetFlags*/);
1432 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1433 }
1434 }
1435 else
1436 {
1437 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1438 uint32_t const cShift = 32 - a_cOpBits;
1439 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1440 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1441 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1442 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1443 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1444 }
1445 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1446
1447 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1448 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1449
1450 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1451 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1452
1453#else
1454# error "port me"
1455#endif
1456 return off;
1457}
1458
1459
1460/**
1461 * The ADC instruction takes CF as input and will set all status flags.
1462 */
1463template<uint8_t const a_cOpBits>
1464DECL_INLINE_THROW(uint32_t)
1465iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1466{
1467 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1468 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1469 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1470
1471#ifdef RT_ARCH_AMD64
1472 /* On AMD64 we use BT to load EFLAGS.CF into the host carry flag and then issue an ADC instruction
1473 with matching size to get the correct flags. */
1474 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1475
1476 /* Use the BT instruction to set CF according to idxRegEfl. */
1477 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1478 pCodeBuf[off++] = X86_EFL_CF_BIT;
1479
1480 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, a_cOpBits, idxRegDst, idxRegSrc);
1481 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1482
1483 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1484 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1485
1486 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1487
1488#elif defined(RT_ARCH_ARM64)
1489 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1490 then ADCS for the calculation. We need all inputs and result for the two
1491 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1492 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1493 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1494
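/* Note: RMIF rotates the register right by the given amount and copies selected low bits
into NZCV; rotating EFLAGS right by 63 (i.e. left by 1) puts CF into bit 1, which the
mask maps onto PSTATE.C only. */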
1495 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1496 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1497 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1498 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1499 else
1500 {
1501 /* Since we're also adding in the carry flag here, shifting operands up
1502 doesn't work. So, we have to calculate carry & overflow manually. */
1503 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1504 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1505 }
1506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1507
1508 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1509 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1510
1511 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1512 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
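/* Note: for 8-bit and 16-bit operands the 32-bit ADC above may carry into bits beyond
the operand width, so mask the result back down to a_cOpBits. */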
1513 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1514 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1515 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1516
1517#else
1518# error "port me"
1519#endif
1520 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1521 return off;
1522}
1523
1524
1525/**
1526 * The ADC instruction with immediate value as right operand.
1527 */
1528template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1529DECL_INLINE_THROW(uint32_t)
1530iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1531{
1532 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1533 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1534
1535#ifdef RT_ARCH_AMD64
1536 /* On AMD64 we use BT to load EFLAGS.CF into the host carry flag and then issue an ADC instruction
1537 with matching size to get the correct flags. */
1538 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1539
1540 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1541 pCodeBuf[off++] = X86_EFL_CF_BIT;
1542
1543 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 2, idxRegDst, uImmOp);
1544 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1545
1546 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1547
1548 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1549
1550#elif defined(RT_ARCH_ARM64)
1551 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1552 and then ADCS for the calculation. We need all inputs and result for
1553 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1554 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1555 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1556 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1557
1558 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1559 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1560 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1561 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1562 else
1563 {
1564 /* Since we're also adding in the carry flag here, shifting operands up
1565 doesn't work. So, we have to calculate carry & overflow manually. */
1566 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1567 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1568 }
1569 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1570
1571 iemNativeRegFreeTmp(pReNative, idxRegImm);
1572
1573 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1574 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1575
1576 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1577 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1578 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1579 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1580
1581#else
1582# error "port me"
1583#endif
1584 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1585 return off;
1586}
1587
1588
1589/**
1590 * The SUB instruction will set all status flags.
1591 */
1592template<uint8_t const a_cOpBits>
1593DECL_INLINE_THROW(uint32_t)
1594iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1595{
1596 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1597 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1598
1599#ifdef RT_ARCH_AMD64
1600 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS values. */
1601 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1602 0x2a, 0x2b, a_cOpBits, idxRegDst, idxRegSrc);
1603 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1604
1605 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1606 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1607
1608 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1609
1610#elif defined(RT_ARCH_ARM64)
1611 /* On ARM64 we'll need the two input operands as well as the result in order
1612 to calculate the right flags, even if we use SUBS and translate NZCV into
1613 OF, CF, ZF and SF. */
1614 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1615 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1616 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1617 {
1618 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1619 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1620 }
1621 else
1622 {
1623 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1624 uint32_t const cShift = 32 - a_cOpBits;
1625 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1626 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1627 true /*fSetFlags*/, cShift);
1628 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1629 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1630 }
1631 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1632
1633 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1634 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1635
1636 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1637 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1638 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1639
1640#else
1641# error "port me"
1642#endif
1643 return off;
1644}
1645
1646
1647/**
1648 * The SUB instruction with immediate value as right operand.
1649 */
1650template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1651DECL_INLINE_THROW(uint32_t)
1652iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1653{
1654 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1655
1656#ifdef RT_ARCH_AMD64
1657 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS values. */
1658 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1659 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 5, idxRegDst, uImmOp);
1660 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1661
1662 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1663
1664 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1665
1666#elif defined(RT_ARCH_ARM64)
1667 /* On ARM64 we'll need the two input operands as well as the result in order
1668 to calculate the right flags, even if we use SUBS and translate NZCV into
1669 OF, CF, ZF and SF. */
1670 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1671 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1672 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1673 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1674 {
1675 if (uImmOp <= 0xfffU)
1676 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1677 true /*fSetFlags*/);
1678 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1679 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1680 true /*fSetFlags*/, true /*fShift12*/);
1681 else
1682 {
1683 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1684 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1685 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1686 true /*fSetFlags*/);
1687 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1688 }
1689 }
1690 else
1691 {
1692 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1693 uint32_t const cShift = 32 - a_cOpBits;
1694 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1695 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1696 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1697 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1698 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1699 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1700 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1701 }
1702 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1703
1704 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1705 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1706
1707 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1708 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1709
1710#else
1711# error "port me"
1712#endif
1713 return off;
1714}
1715
1716
1717/**
1718 * The CMP instruction sets all status flags but modifies no registers.
1719 */
1720template<uint8_t const a_cOpBits>
1721DECL_INLINE_THROW(uint32_t)
1722iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1723{
1724 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1725 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1726
1727#ifdef RT_ARCH_AMD64
1728 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS values. */
1729 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1730 0x3a, 0x3b, a_cOpBits, idxRegDst, idxRegSrc);
1731 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1732
1733 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1734 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1735
1736 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1737
1738#elif defined(RT_ARCH_ARM64)
1739 /* On ARM64 we'll need the actual result as well as both input operands in order
1740 to calculate the right flags, even if we use SUBS and translate NZCV into
1741 OF, CF, ZF and SF. */
1742 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1743 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1744 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1745 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1746 else
1747 {
1748 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1749 uint32_t const cShift = 32 - a_cOpBits;
1750 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1751 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1752 true /*fSetFlags*/, cShift);
1753 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1754 }
1755 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1756
1757 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1758 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1759
1760 iemNativeRegFreeTmp(pReNative, idxRegResult);
1761 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1762 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1763
1764#else
1765# error "port me"
1766#endif
1767 return off;
1768}
1769
1770
1771/**
1772 * The CMP instruction with immediate value as right operand.
1773 */
1774template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1775DECL_INLINE_THROW(uint32_t)
1776iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1777{
1778 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1779
1780#ifdef RT_ARCH_AMD64
1781 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS values. */
1782 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1783 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 7, idxRegDst, uImmOp);
1784 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1785
1786 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1787
1788 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1789
1790#elif defined(RT_ARCH_ARM64)
1791 /* On ARM64 we'll need the actual result as well as both input operands in order
1792 to calculate the right flags, even if we use SUBS and translate NZCV into
1793 OF, CF, ZF and SF. */
1794 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1795 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1796 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1797 {
1798 if (uImmOp <= 0xfffU)
1799 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1800 true /*fSetFlags*/);
1801 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1802 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1803 true /*fSetFlags*/, true /*fShift12*/);
1804 else
1805 {
1806 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1807 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1808 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1809 true /*fSetFlags*/);
1810 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1811 }
1812 }
1813 else
1814 {
1815 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1816 uint32_t const cShift = 32 - a_cOpBits;
1817 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1818 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1819 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1820 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1821 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1822 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1823 }
1824 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1825
1826 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1827 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1828
1829 iemNativeRegFreeTmp(pReNative, idxRegResult);
1830 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1831
1832#else
1833# error "port me"
1834#endif
1835 return off;
1836}
1837
1838
1839/**
1840 * The SBB instruction takes CF as input and will set all status flags.
1841 */
1842template<uint8_t const a_cOpBits>
1843DECL_INLINE_THROW(uint32_t)
1844iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1845{
1846 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1847 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1848 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1849
1850#ifdef RT_ARCH_AMD64
1851 /* On AMD64 we use BT to load EFLAGS.CF into the host carry flag and then issue an SBB instruction
1852 with matching size to get the correct flags. */
1853 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1854
1855 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1856 pCodeBuf[off++] = X86_EFL_CF_BIT;
1857
1858 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, a_cOpBits, idxRegDst, idxRegSrc);
1859 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1860
1861 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1862 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1863
1864 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1865
1866#elif defined(RT_ARCH_ARM64)
1867 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1868 idxRegEfl and then SBCS for the calculation. We need all inputs and
1869 result for the two flags (AF,PF) that can't be directly derived from
1870 PSTATE.NZCV. */
1871 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1872 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1873
1874 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
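/* Note: CFINV inverts PSTATE.C because ARM64 SBCS subtracts NOT(C) (carry set means
'no borrow'), whereas x86 SBB subtracts CF directly. */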
1875 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1876 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1877 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1878 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1879 else
1880 {
1881 /* Since we're also subtracting the carry flag here, shifting operands up
1882 doesn't work. So, we have to calculate carry & overflow manually. */
1883 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1884 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1885 }
1886 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1887
1888 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1889 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1890
1891 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1892 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1893 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1894 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1895 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1896
1897#else
1898# error "port me"
1899#endif
1900 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1901 return off;
1902}
1903
1904
1905/**
1906 * The SBB instruction with immediate value as right operand.
1907 */
1908template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1909DECL_INLINE_THROW(uint32_t)
1910iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1911{
1912 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1913 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1914
1915#ifdef RT_ARCH_AMD64
1916 /* On AMD64 we use BT to load EFLAGS.CF into the host carry flag and then issue an SBB instruction
1917 with matching size to get the correct flags. */
1918 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1919
1920 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1921 pCodeBuf[off++] = X86_EFL_CF_BIT;
1922
1923 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 3, idxRegDst, uImmOp);
1924 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1925
1926 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1927
1928 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1929
1930#elif defined(RT_ARCH_ARM64)
1931 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1932 idxRegEfl and then SBCS for the calculation. We need all inputs and
1933 result for the two flags (AF,PF) that can't be directly derived from
1934 PSTATE.NZCV. */
1935 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1936 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1937 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1938
1939 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1940 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1941 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1942 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1943 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1944 else
1945 {
1946 /* Since we're also subtracting the carry flag here, shifting operands up
1947 doesn't work. So, we have to calculate carry & overflow manually. */
1948 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1949 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1950 }
1951 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1952
1953 iemNativeRegFreeTmp(pReNative, idxRegImm);
1954
1955 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1956 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1957
1958 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1959 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1960 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1961 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1962
1963#else
1964# error "port me"
1965#endif
1966 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1967 return off;
1968}
1969
1970
1971template<uint8_t const a_cOpBits>
1972DECL_INLINE_THROW(uint32_t)
1973iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1974{
1975 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1976 AssertFailed();
1977 return iemNativeEmitBrk(pReNative, off, 0x666);
1978}
1979
1980
1981template<uint8_t const a_cOpBits>
1982DECL_INLINE_THROW(uint32_t)
1983iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1984{
1985 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1986 AssertFailed();
1987 return iemNativeEmitBrk(pReNative, off, 0x666);
1988}
1989
1990
1991template<uint8_t const a_cOpBits>
1992DECL_INLINE_THROW(uint32_t)
1993iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1994{
1995 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1996 AssertFailed();
1997 return iemNativeEmitBrk(pReNative, off, 0x666);
1998}
1999
2000
2001template<uint8_t const a_cOpBits>
2002DECL_INLINE_THROW(uint32_t)
2003iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
2004{
2005 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
2006 AssertFailed();
2007 return iemNativeEmitBrk(pReNative, off, 0x666);
2008}
2009
2010
2011
2012/*********************************************************************************************************************************
2013* Shifting and Rotating. *
2014*********************************************************************************************************************************/
2015
2016
2017typedef enum
2018{
2019 kIemNativeEmitEFlagsForShiftType_Left,
2020 kIemNativeEmitEFlagsForShiftType_Right,
2021 kIemNativeEmitEFlagsForShiftType_SignedRight
2022} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
2023
2024/**
2025 * This is used by SHL, SHR and SAR emulation.
2026 *
2027 * It takes the liveness state into account.
2028 */
2029DECL_INLINE_THROW(uint32_t)
2030iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
2031 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
2032 uint8_t idxRegTmp)
2033{
2034 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
2035
2036 RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
2037#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
2038 /*
2039 * See if we can skip this wholesale.
2040 */
2041 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
2042 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
2043 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
2044 {
2045 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
2046 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
2047# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2048 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2049# endif
2050 }
2051 else
2052#endif
2053 {
2054 /*
2055 * The differences between Intel and AMD flags for SHL are:
2056 * - Intel always clears AF while AMD always sets it.
2057 * - Intel sets OF based on the first shift, while AMD bases it on the last shift.
2058 *
2059 */
2060
2061#ifdef RT_ARCH_AMD64
2062 /*
2063 * We capture the flags and do the additional OF and AF calculations as needed.
2064 */
2065 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
2066 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
2067 * use LAHF here when host rax is free, since OF is cleared. */
2068 /* pushf */
2069 pCodeBuf[off++] = 0x9c;
2070 /* pop tmp */
2071 if (idxRegTmp >= 8)
2072 pCodeBuf[off++] = X86_OP_REX_B;
2073 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
2074 /* Clear the status bits in EFLs. */
2075 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
2076 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
2077 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
2078 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
2079 else
2080 {
2081 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
2082 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
2083 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2084 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
2085 /* OR in the flags we collected. */
2086 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2087
2088 /* Calculate OF */
2089 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2090 {
2091 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2092 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
2093 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
2094 RT_MAX(cOpBits, 16), 4, idxRegResult);
2095 pCodeBuf[off++] = cOpBits - 1;
2096 /* setc idxRegTmp */
2097 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
2098 /* xor idxRegTmp, idxRegEfl */
2099 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
2100 /* and idxRegTmp, 1 */
2101 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
2102 /* shl idxRegTmp, X86_EFL_OF_BIT */
2103 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
2104 }
2105 else
2106 {
2107 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2108 if (cOpBits <= 32)
2109 {
2110 /* mov idxRegTmp, idxRegSrc */
2111 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
2112 /* shl idxRegTmp, 1 */
2113 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
2114 /* xor idxRegTmp, idxRegSrc */
2115 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2116 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
2117 if (cOpBits >= X86_EFL_OF_BIT)
2118 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2119 else
2120 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
2121 }
2122 else
2123 {
2124 /* Same as above but with 64-bit GPRs. */
2125 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2126 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2127 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2128 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2129 }
2130 /* and idxRegTmp, X86_EFL_OF */
2131 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2132 }
2133 }
2134 /* Or in the collected flag(s) */
2135 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2136
2137#elif defined(RT_ARCH_ARM64)
2138 /*
2139 * Calculate flags.
2140 */
2141 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2142
2143 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2144 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2145 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2146
2147 /* N,Z -> SF,ZF */
2148 if (cOpBits < 32)
2149 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2150 else
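/* Note: ANDS with XZR as destination is TST; we only care about the resulting N and Z. */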
2151 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2152 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2153 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
2154 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2155 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
2156
2157 /* Calculate 8-bit parity of the result. */
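/* Note: the three EORs fold the low byte so bit 0 becomes the XOR of bits 0 thru 7; the
final EOR with 1 inverts it, since x86 PF is set for an even number of set bits. */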
2158 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2159 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2160 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2161 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2162 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2163 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2164 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2165 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2166 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
2167
2168 /* Calculate carry - the last bit shifted out of the input value. */
2169 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2170 {
2171 /* CF = (idxRegSrc >> (cOpBits - idxRegCount)) & 1 */
2172 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2173 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2174 if (cOpBits < 32)
2175 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2176 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2177 }
2178 else
2179 {
2180 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2181 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2182 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2183 }
2184 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2185
2186 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2187 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2188 {
2189 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2190 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2191 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2192 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2193 }
2194 else
2195 {
2196 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2197 AssertCompile(X86_EFL_CF_BIT == 0);
2198 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2199 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2200 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2201
2202 /* AMD unconditionally sets AF. */
2203 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2204 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2205 }
2206#else
2207# error "port me"
2208#endif
2209 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2210
2211#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2212 if (pReNative->fSkippingEFlags)
2213 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForShift)\n", pReNative->fSkippingEFlags));
2214 pReNative->fSkippingEFlags = 0;
2215# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2216 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2217# endif
2218#endif
2219 }
2220 return off;
2221}
2222
2223
2224DECL_INLINE_THROW(uint32_t)
2225iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2226 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2227{
2228 /* Note! Since we're doing some branching here, we need to allocate all
2229 registers we need before the jump or we may end up with invalid
2230 register state if the branch is taken. */
2231 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2232 uint8_t const idxRegCount = iemNativeVarRegisterAcquireInited(pReNative, idxVarCount, &off); /* modified on arm64 */
2233 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
2234 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
2235
2236#ifdef RT_ARCH_AMD64
2237 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2238 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
2239
2240 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2241 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2242 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2243 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2244
2245 /* Check if it's a NOP before we do anything. */
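/* Note: x86 masks the shift count to 5 bits (6 for 64-bit operands), and a masked count
of zero leaves both the destination and EFLAGS untouched, hence the early exit. */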
2246 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2247 uint32_t const offFixup = off;
2248 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
2249
2250 if (idxRegDstIn != UINT8_MAX)
2251 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
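/* Note: 0xd2/0xd3 are the shift-by-CL group-2 opcodes; ModR/M.reg=4 selects SHL. */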
2252 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2253
2254#elif defined(RT_ARCH_ARM64)
2255 /* We always need a copy of the input value (except when we can skip the EFLAGS calcs). */
2256 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2257 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2258
2259 /* Check if it's a NOP before we do anything. We MODIFY idxRegCount here! */
2260 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2261 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
2262 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2263 uint32_t const offFixup = off;
2264 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2265
2266 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2267 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2268 if (cOpBits < 32)
2269 {
2270 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2271 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2272 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2273 }
2274
2275#else
2276# error "port me"
2277#endif
2278
2279 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2280 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2281 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2282
2283 /* fixup the jump */
2284 iemNativeFixupFixedJump(pReNative, offFixup, off);
2285
2286#ifdef RT_ARCH_AMD64
2287 if (idxRegDstIn != UINT8_MAX)
2288#endif
2289 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2290 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2291 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2292 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2293 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2294 return off;
2295}
2296
2297
2298DECL_INLINE_THROW(uint32_t)
2299iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2300 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2301{
2302 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2303 AssertFailed();
2304 return iemNativeEmitBrk(pReNative, off, 0x666);
2305}
2306
2307
2308DECL_INLINE_THROW(uint32_t)
2309iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2310 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2311{
2312 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2313 AssertFailed();
2314 return iemNativeEmitBrk(pReNative, off, 0x666);
2315}
2316
2317
2318DECL_INLINE_THROW(uint32_t)
2319iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2320 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2321{
2322 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2323 AssertFailed();
2324 return iemNativeEmitBrk(pReNative, off, 0x666);
2325}
2326
2327
2328DECL_INLINE_THROW(uint32_t)
2329iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2330 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2331{
2332 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2333 AssertFailed();
2334 return iemNativeEmitBrk(pReNative, off, 0x666);
2335}
2336
2337
2338DECL_INLINE_THROW(uint32_t)
2339iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2340 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2341{
2342 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2343 AssertFailed();
2344 return iemNativeEmitBrk(pReNative, off, 0x666);
2345}
2346
2347
2348DECL_INLINE_THROW(uint32_t)
2349iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2350 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2351{
2352 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2353 AssertFailed();
2354 return iemNativeEmitBrk(pReNative, off, 0x666);
2355}
2356
2357
2358
2359/*********************************************************************************************************************************
2360* SIMD emitters. *
2361*********************************************************************************************************************************/
2362
2363/**
2364 * Common emitter for packed logical instructions.
2365 */
2366#ifdef RT_ARCH_AMD64
2367# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2368 DECL_INLINE_THROW(uint32_t) \
2369 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2370 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2371 { \
2372 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2373 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2374 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2375 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2376 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2377 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2378 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2379 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2380 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2381 pCodeBuf[off++] = 0x0f; \
2382 pCodeBuf[off++] = (a_bOpcX86); \
2383 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2384 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2385 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2386 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2387 return off; \
2388 } \
2389 DECL_INLINE_THROW(uint32_t) \
2390 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2391 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2392 { \
2393 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2394 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2395 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2396 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2397 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2398 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2399 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2400 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2401 pCodeBuf[off++] = 0x0f; \
2402 pCodeBuf[off++] = (a_bOpcX86); \
2403 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2404 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2405 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2406 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2407 return off; \
2408 } \
2409 typedef int ignore_semicolon
2410#elif defined(RT_ARCH_ARM64)
2411# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2412 DECL_INLINE_THROW(uint32_t) \
2413 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2414 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2415 { \
2416 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2417 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2418 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2419 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2420 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2421 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2422 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2423 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2424 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2425 return off; \
2426 } \
2427 DECL_INLINE_THROW(uint32_t) \
2428 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2429 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2430 { \
2431 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2432 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2433 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2434 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2435 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2436 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2437 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2438 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2439 return off; \
2440 } \
2441 typedef int ignore_semicolon
2442#else
2443# error "Port me"
2444#endif
2445
2446/* POR, ORPS, ORPD. */
2447IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2448/* PXOR, XORPS, XORPD. */
2449IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2450/* PAND, ANDPS, ANDPD. */
2451IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
2452
2453
2454/**
2455 * Common emitter for the shift right with immediate instructions.
2456 */
2457#ifdef RT_ARCH_AMD64
2458# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2459 DECL_INLINE_THROW(uint32_t) \
2460 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2461 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2462 { \
2463 if (bImm) \
2464 { \
2465 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2466 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2467 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2468 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2469 if (idxSimdRegDst >= 8) \
2470 pCodeBuf[off++] = X86_OP_REX_B; \
2471 pCodeBuf[off++] = 0x0f; \
2472 pCodeBuf[off++] = (a_bOpcX86); \
2473 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2474 pCodeBuf[off++] = bImm; \
2475 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2476 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2477 } \
2478 /* Immediate 0 is a nop. */ \
2479 return off; \
2480 } \
2481 typedef int ignore_semicolon
2482#elif defined(RT_ARCH_ARM64)
2483# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2484 DECL_INLINE_THROW(uint32_t) \
2485 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2486 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2487 { \
2488 if (bImm) \
2489 { \
2490 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2491 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2492 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2493 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2494 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2495 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2496 } \
2497 /* Immediate 0 is a nop. */ \
2498 return off; \
2499 } \
2500 typedef int ignore_semicolon
2501#else
2502# error "Port me"
2503#endif
2504
2505IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2506IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2507IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
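/* Encoding note: on AMD64 these emit 66 [REX.B] 0F 71/72/73 /2 ib; on ARM64 a single unsigned vector
   shift right (USHR) by the RT_MIN-clamped immediate, where shifting by the full element width zeroes
   the lanes just like x86 does for out-of-range shift counts. */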
2508
2509
2510/**
 2511 * Common emitter for the shift-left-by-immediate instructions (PSLLW, PSLLD, PSLLQ).
2512 */
2513#ifdef RT_ARCH_AMD64
2514# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2515 DECL_INLINE_THROW(uint32_t) \
2516 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2517 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2518 { \
2519 if (bImm) \
2520 { \
2521 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2522 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2523 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2524 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2525 if (idxSimdRegDst >= 8) \
2526 pCodeBuf[off++] = X86_OP_REX_B; \
2527 pCodeBuf[off++] = 0x0f; \
2528 pCodeBuf[off++] = (a_bOpcX86); \
2529 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2530 pCodeBuf[off++] = bImm; \
2531 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2532 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2533 } \
2534 /* Immediate 0 is a nop. */ \
2535 return off; \
2536 } \
2537 typedef int ignore_semicolon
2538#elif defined(RT_ARCH_ARM64)
2539# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2540 DECL_INLINE_THROW(uint32_t) \
2541 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2542 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2543 { \
2544 if (bImm) /* bImm == 0 is a nop */ \
2545 { \
2546 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2547 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2548 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2549 if (bImm < (a_cShiftMax)) \
2550 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2551 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2552 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2553 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2554 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2555 } \
2556 return off; \
2557 } \
2558 typedef int ignore_semicolon
2559#else
2560# error "Port me"
2561#endif
2562
2563IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2564IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2565IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
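/* Encoding note: the left shifts use the /6 form of 66 [REX.B] 0F 71/72/73 ib on AMD64; the ARM64 variant
   emits a vector shift-left for counts below the element width and zeroes the register (EOR with itself)
   for larger counts, matching the x86 behaviour of clearing the destination. */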
2566
2567
2568/**
2569 * Common emitter for packed arithmetic instructions.
2570 */
2571#ifdef RT_ARCH_AMD64
2572# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2573 DECL_INLINE_THROW(uint32_t) \
2574 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2575 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2576 { \
2577 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2578 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2579 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2580 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2581 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2582 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2583 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2584 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2585 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2586 pCodeBuf[off++] = 0x0f; \
2587 pCodeBuf[off++] = (a_bOpcX86); \
2588 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2589 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2590 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2591 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2592 return off; \
2593 } \
2594 DECL_INLINE_THROW(uint32_t) \
2595 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2596 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2597 { \
2598 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2599 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2600 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2601 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2602 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2603 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2604 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2605 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2606 pCodeBuf[off++] = 0x0f; \
2607 pCodeBuf[off++] = (a_bOpcX86); \
2608 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2609 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2610 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2611 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2612 return off; \
2613 } \
2614 typedef int ignore_semicolon
2615#elif defined(RT_ARCH_ARM64)
2616# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2617 DECL_INLINE_THROW(uint32_t) \
2618 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2619 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2620 { \
2621 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2622 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2623 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2624 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2625 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2626 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2627 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2628 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2629 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2630 return off; \
2631 } \
2632 DECL_INLINE_THROW(uint32_t) \
2633 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2634 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2635 { \
2636 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2637 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2638 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2639 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2640 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2641 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2642 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2643 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2644 return off; \
2645 } \
2646 typedef int ignore_semicolon
2647#else
2648# error "Port me"
2649#endif
2650
2651/*
2652 * PADDx.
2653 */
2654IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2655IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2656IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2657IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
2658
2659/*
2660 * PSUBx.
2661 */
2662IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2663IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2664IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2665IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2666
2667/*
2668 * PADDUSx.
2669 */
2670IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2671IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
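/* PADDUSB/PADDUSW add with unsigned saturation, which the kArmv8VecInstrArithOp_UnsignSat_Add variant
   provides per element as well. */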
2672
2673/*
2674 * PMULLx.
2675 */
2676IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
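/* PMULLW keeps only the low 16 bits of each product, i.e. a plain element-wise multiply, so the generic
   16-bit multiply variant can be used directly here. */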
2677
2678
2679/**
2680 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2681 */
2682#ifdef RT_ARCH_AMD64
2683# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2684 DECL_INLINE_THROW(uint32_t) \
2685 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2686 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2687 { \
2688 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2689 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2690 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2691 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2692 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2693 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2694 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2695 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2696 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2697 pCodeBuf[off++] = 0x0f; \
2698 pCodeBuf[off++] = (a_bOpcX86); \
2699 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2700 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2701 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2702 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2703 return off; \
2704 } \
2705 DECL_INLINE_THROW(uint32_t) \
2706 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2707 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2708 { \
2709 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2710 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2711 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2712 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2713 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2714 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2715 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2716 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2717 pCodeBuf[off++] = 0x0f; \
2718 pCodeBuf[off++] = (a_bOpcX86); \
2719 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2720 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2721 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2722 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2723 return off; \
2724 } \
2725 typedef int ignore_semicolon
2726#elif defined(RT_ARCH_ARM64)
2727# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2728 DECL_INLINE_THROW(uint32_t) \
2729 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2730 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2731 { \
2732 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2733 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2734 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2735 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2736 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2737 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2738 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2739 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2740 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2741 return off; \
2742 } \
2743 DECL_INLINE_THROW(uint32_t) \
2744 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2745 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2746 { \
2747 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2748 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2749 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2750 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2751 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2752 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2753 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2754 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2755 return off; \
2756 } \
2757 typedef int ignore_semicolon
2758#else
2759# error "Port me"
2760#endif
2761
2762IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2763IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2764IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
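/* These are the 66 0F 74/75/76 /r compares on AMD64; the ARM64 path uses an element-wise compare-for-equal
   (kArmv8VecInstrCmpOp_Eq), which likewise sets each lane to all ones or all zeroes. */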
2765
2766
2767/**
 2768 * Emitter for pmovmskb.
2769 */
2770DECL_INLINE_THROW(uint32_t)
2771iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2772 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2773{
2774#ifdef RT_ARCH_AMD64
2775 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2776 kIemNativeGstRegUse_ForFullWrite);
2777 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2778 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2779 kIemNativeGstSimdRegLdStSz_Low128,
2780 kIemNativeGstRegUse_ReadOnly);
2781 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2782
2783 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2784 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2785 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2786 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2787 pCodeBuf[off++] = 0x0f;
2788 pCodeBuf[off++] = 0xd7;
2789 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2790
2791 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2792 iemNativeRegFreeTmp(pReNative, idxRegDst);
2793
2794#elif defined(RT_ARCH_ARM64)
2795 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2796 kIemNativeGstRegUse_ForFullWrite);
2797 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2798 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2799 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2800 kIemNativeGstSimdRegLdStSz_Low128,
2801 kIemNativeGstRegUse_Calculation);
2802 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2803
2804 /*
2805 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
 2806 * for different approaches, as NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
 2807 *
 2808 * As there is no way around emulating the exact semantics of pmovmskb, we use the same algorithm
 2809 * as the sse2neon implementation, because there we can get away without loading any constants and the
 2810 * base algorithm is only 4 NEON instructions (+ 3 for extracting the result to a general register).
2811 *
2812 * The following illustrates the algorithm:
2813 *
2814 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2815 * Instruction
2816 * |
2817 * V
2818 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2819 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2820 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2821 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2822 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2823 *
2824 * The extraction process
2825 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2826 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2827 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2828 */
2829 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2830 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2831 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2832 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2833 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2834 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2835 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2836
2837 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2838 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2839 iemNativeRegFreeTmp(pReNative, idxRegDst);
2840
2841#else
2842# error "Port me"
2843#endif
2844 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2845 return off;
2846}
2847
2848
2849/**
2850 * Common emitter for the PACKUSWB instructions - guest register / guest register variant.
2851 */
2852DECL_INLINE_THROW(uint32_t)
2853iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2854 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2855{
2856 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2857 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2858 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2859 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2860
2861#ifdef RT_ARCH_AMD64
2862 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2863
2864 /* packuswb xmm, xmm */
2865 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2866 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2867 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2868 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2869 pCodeBuf[off++] = 0x0f;
2870 pCodeBuf[off++] = 0x67;
2871 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2872
2873#elif defined(RT_ARCH_ARM64)
2874 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2875
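    /* SQXTUN narrows the destination's eight signed words into its low eight bytes with unsigned saturation,
       then the upper-half form narrows the source's words into the high eight bytes - same lane layout as
       PACKUSWB. */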
2876 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2877 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2878
2879#else
2880# error "port me"
2881#endif
2882
2883 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2884 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2885
2886 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2887 return off;
2888}
2889
2890
2891/**
2892 * Common emitter for the PACKUSWB instructions - guest register / recompiler variable variant.
2893 */
2894DECL_INLINE_THROW(uint32_t)
2895iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2896 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2897{
2898 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2899 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2900
2901 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2902 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2903 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2904
2905
2906#ifdef RT_ARCH_AMD64
2907 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2908
2909 /* packuswb xmm, xmm */
2910 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2911 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2912 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2913 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2914 pCodeBuf[off++] = 0x0f;
2915 pCodeBuf[off++] = 0x67;
2916 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2917
2918#elif defined(RT_ARCH_ARM64)
2919 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2920
2921 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2922 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2923
2924#else
2925# error "port me"
2926#endif
2927
2928 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2929 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2930
2931 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2932 return off;
2933}
2934
2935
2936/**
2937 * Common emitter for the pmov{s,z}x* instructions.
2938 */
2939#ifdef RT_ARCH_AMD64
2940# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2941 DECL_INLINE_THROW(uint32_t) \
2942 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2943 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2944 { \
2945 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2946 { \
2947 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2948 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2949 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2950 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2951 if (idxSimdReg >= 8) \
2952 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2953 pCodeBuf[off++] = 0x0f; \
2954 pCodeBuf[off++] = 0x38; \
2955 pCodeBuf[off++] = (a_bOpcX86); \
2956 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2957 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2958 } \
2959 else \
2960 { \
2961 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2962 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2963 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2964 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2965 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2966 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2967 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2968 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2969 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2970 pCodeBuf[off++] = 0x0f; \
2971 pCodeBuf[off++] = 0x38; \
2972 pCodeBuf[off++] = (a_bOpcX86); \
2973 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2974 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2975 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2976 } \
2977 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2978 return off; \
2979 } \
2980 DECL_INLINE_THROW(uint32_t) \
2981 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2982 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2983 { \
2984 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2985 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2986 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off); \
2987 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2988 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2989 pCodeBuf[off++] = X86_OP_REX_W \
2990 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2991 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2992 pCodeBuf[off++] = 0x0f; \
2993 pCodeBuf[off++] = 0x3a; \
2994 pCodeBuf[off++] = 0x22; \
2995 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2996 pCodeBuf[off++] = 0; /* QWord */\
2997 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2998 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2999 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
3000 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
3001 pCodeBuf[off++] = 0x0f; \
3002 pCodeBuf[off++] = 0x38; \
3003 pCodeBuf[off++] = (a_bOpcX86); \
3004 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
3005 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
3006 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
3007 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
3008 return off; \
3009 } \
3010 typedef int ignore_semicolon
3011#elif defined(RT_ARCH_ARM64)
3012# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
3013 DECL_INLINE_THROW(uint32_t) \
3014 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
3015 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3016 { \
3017 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
3018 { \
3019 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
3020 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
3021 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
3022 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
3023 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
3024 } \
3025 else \
3026 { \
3027 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
3028 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
3029 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
3030 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
3031 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
3032 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
3033 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
3034 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
3035 } \
3036 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
3037 return off; \
3038 } \
3039 DECL_INLINE_THROW(uint32_t) \
3040 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
3041 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3042 { \
3043 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
3044 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
3045 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off); \
3046 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
3047 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
3048 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
3049 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
3050 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
3051 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
3052 return off; \
3053 } \
3054 typedef int ignore_semicolon
3055#else
3056# error "Port me"
3057#endif
3058
3059IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
3060IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
3061IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
3062
3063IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
3064IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
3065IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
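/* AMD64 encodings: 66 0F 38 30/33/35 for pmovzx* and 66 0F 38 20/23/25 for pmovsx*. The ARM64 side widens
   via a shift-left-long by zero (unsigned or signed as per a_fArmUnsigned); the _rv_u128 variant first moves
   the 64-bit source GPR into IEMNATIVE_SIMD_REG_FIXED_TMP0 (pinsrq on AMD64, INS on ARM64) before widening. */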
3066
3067
3068/**
3069 * Updates the MXCSR exception flags, raising any unmasked exceptions.
3070 */
3071DECL_INLINE_THROW(uint32_t)
3072iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
3073{
3074 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
3075 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
3076 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
3077
3078#ifdef RT_ARCH_AMD64
3079 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
3080
3081 /* stmxcsr */
3082 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
3083 pbCodeBuf[off++] = X86_OP_REX_B;
3084 pbCodeBuf[off++] = 0x0f;
3085 pbCodeBuf[off++] = 0xae;
3086 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
3087 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3088 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3089 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3090 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3091
 3092 /* Load MXCSR, mask everything except the status flags, and OR them into the guest MXCSR. */
3093 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3094
3095 /* Store the flags in the MXCSR xcpt flags register. */
3096 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3097 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
3098
3099 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
3100 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
3101 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3102
3103 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
3104
3105 /* ldmxcsr */
3106 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
3107 pbCodeBuf[off++] = X86_OP_REX_B;
3108 pbCodeBuf[off++] = 0x0f;
3109 pbCodeBuf[off++] = 0xae;
3110 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
3111 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3112 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3113 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3114 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3115
3116#elif defined(RT_ARCH_ARM64)
3117 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
3118 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
3119 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
3120 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */
3121
3122 /*
3123 * The exception flags layout differs between MXCSR and FPSR of course:
3124 *
 3125 * FPSR bit MXCSR bit
 3126 *
 3127 * 0 IOC ------> IE (0)
 3128 * 1 DZC ------> ZE (2)
 3129 * 2 OFC ------> OE (3)
 3130 * 3 UFC ------> UE (4)
 3131 * 4 IXC ------> PE (5)
 3132 * 7 IDC ------> DE (1)
3139 */
3140 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3141 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3142 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3143 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3144#else
3145# error "Port me"
3146#endif
3147
3148 /*
 3149 * If PE is set together with OE/UE and neither of the latter is masked,
 3150 * PE needs to be cleared, because on real hardware
 3151 * an exception would be raised with only OE/UE set,
 3152 * but since we run with all exceptions masked, PE gets set as well.
3153 */
 3154 /** @todo On ARM we can combine the load+and into a single AND instruction. */
 3155 /** @todo r=aeichner Can this be done more optimally? */
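    /* I.e. clear PE when (fXcptFlags & (OE | UE)) & ~((guest MXCSR & (OM | UM)) >> X86_MXCSR_XCPT_MASK_SHIFT)
       is non-zero; the sequence below computes that expression in idxRegTmp2 and conditionally clears the
       PE bit. */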
3156 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3157 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3158 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3159 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3160 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3161 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3162 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3163 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3164 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3165
3166 uint32_t offFixup = off;
3167 off = iemNativeEmitJzToFixed(pReNative, off, off);
3168 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3169 iemNativeFixupFixedJump(pReNative, offFixup, off);
3170 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
3171
3172
3173 /* Set the MXCSR flags now. */
3174 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3175
3176 /*
3177 * Make sure we don't have any outstanding guest register writes as we may
 3178 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3179 */
3180 off = iemNativeRegFlushPendingWrites(pReNative, off);
3181
3182#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3183 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3184#else
3185 RT_NOREF(idxInstr);
3186#endif
3187
3188 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3189 /* mov tmp, varmxcsr */
3190 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3191 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3192 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3193 /* tmp = ~tmp */
3194 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3195 /* tmp &= mxcsr */
3196 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3197 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3198 X86_MXCSR_XCPT_FLAGS);
3199
3200 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3201 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3202
3203 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3204 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3205
3206 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3207 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3208 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3209 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3210 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3211 return off;
3212}
3213
3214
3215/**
3216 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3217 */
3218DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3219 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3220#ifdef RT_ARCH_AMD64
3221 uint8_t const bPrefixX86, uint8_t const bOpcX86
3222#elif defined(RT_ARCH_ARM64)
3223 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3224#endif
3225 )
3226{
3227 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3228 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3229 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3230 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3231
3232#ifdef RT_ARCH_AMD64
3233 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3234 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3235 if (bPrefixX86 != 0)
3236 pCodeBuf[off++] = bPrefixX86;
3237 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3238 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3239 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3240 pCodeBuf[off++] = 0x0f;
3241 pCodeBuf[off++] = bOpcX86;
3242 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3243#elif defined(RT_ARCH_ARM64)
3244 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3245 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3246#else
3247# error "Port me"
3248#endif
3249 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3250 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3251 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3252 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3253}
3254
3255
3256/**
3257 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3258 */
3259DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3260 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3261#ifdef RT_ARCH_AMD64
3262 uint8_t const bPrefixX86, uint8_t const bOpcX86
3263#elif defined(RT_ARCH_ARM64)
3264 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3265#endif
3266 )
3267{
3268 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3269 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3270 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3271
3272#ifdef RT_ARCH_AMD64
3273 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3274 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3275 if (bPrefixX86 != 0)
3276 pCodeBuf[off++] = bPrefixX86;
3277 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3278 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3279 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3280 pCodeBuf[off++] = 0x0f;
3281 pCodeBuf[off++] = bOpcX86;
3282 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3283#elif defined(RT_ARCH_ARM64)
3284 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3285 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3286#else
3287# error "Port me"
3288#endif
3289 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3290 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3291 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3292 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3293}
3294
3295
3296/**
3297 * Common emitter for packed floating point instructions with 3 operands.
3298 */
3299#ifdef RT_ARCH_AMD64
3300# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3301 DECL_FORCE_INLINE_THROW(uint32_t) \
3302 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3303 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3304 { \
3305 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3306 a_bPrefixX86, a_bOpcX86); \
3307 } \
3308 DECL_FORCE_INLINE_THROW(uint32_t) \
3309 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3310 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3311 { \
3312 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3313 a_bPrefixX86, a_bOpcX86); \
3314 } \
3315 typedef int ignore_semicolon
3316#elif defined(RT_ARCH_ARM64)
3317# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3318 DECL_FORCE_INLINE_THROW(uint32_t) \
3319 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3320 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3321 { \
3322 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3323 a_enmArmOp, a_ArmElemSz); \
3324 } \
3325 DECL_FORCE_INLINE_THROW(uint32_t) \
3326 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3327 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3328 { \
3329 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3330 a_enmArmOp, a_ArmElemSz); \
3331 } \
3332 typedef int ignore_semicolon
3333#else
3334# error "Port me"
3335#endif
3336
3337
3338IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3339IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3340IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3341IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
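/* The a_bPrefixX86 argument selects between the packed-single (no prefix) and packed-double (66h) forms of
   the same 0F opcode. The result is computed into IEMNATIVE_SIMD_REG_FIXED_TMP0 and only committed to the
   guest register by iemNativeEmitMxcsrUpdate() when no unmasked exception is raised. */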
3342
3343
3344#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */