VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@ 106453

Last change on this file since 106453 was 106453, checked in by vboxsync, 5 weeks ago

VMM/IEM: Eliminated the IEMNATIVE_WITH_SIMD_REG_ALLOCATOR define. Fixed bug in iemNativeEmitMemFetchStoreDataCommon where a SIMD register was masked in calls to iemNativeVarSaveVolatileRegsPreHlpCall and friends. Fixed theoretical loop-forever bugs in iemNativeSimdRegAllocFindFree & iemNativeRegAllocFindFree. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 159.2 KB
Line 
1/* $Id: IEMAllN8veEmit-x86.h 106453 2024-10-17 13:54:35Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
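/* Note: bOpcode8 is the 8-bit operand-size opcode and bOpcodeOther the 16/32/64-bit
   one, e.g. iemNativeEmit_and_r_r_efl below passes 0x22 (AND r8,r/m8) and 0x23
   (AND r,r/m). The operand-size prefix and REX.W emitted here select between the
   16-, 32- and 64-bit forms of bOpcodeOther. */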
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
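/* The bare REX prefix makes register indices 4-7 encode SPL/BPL/SIL/DIL rather
   than the legacy AH/CH/DH/BH high-byte registers. */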
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to be /idxRegReg forms, i.e. idxRegReg selects the opcode extension in the ModR/M reg field.
121 */
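/* For example, AND r/m,imm is encoded as 0x80 /4 ib, 0x83 /4 ib (sign-extended
   imm8) or 0x81 /4 iz, so iemNativeEmit_and_r_i_efl below passes bOpcode8=0x80,
   bOpcodeOtherImm8=0x83, bOpcodeOther=0x81 and idxRegReg=4. Passing 0xcc as
   bOpcodeOtherImm8 (as the TEST emitter does) disables the short imm8 form,
   since TEST has no sign-extended imm8 encoding. */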
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile registers here so we don't have to save & restore them
210 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
222/**
223 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
224 */
225template<uint32_t const a_fEflClobbered>
226DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
227{
228 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
229 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
230 if (fEFlags)
231 {
232 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
233 {
234 fEFlags &= ~a_fEflClobbered;
235 if (!fEFlags)
236 { /* likely */ }
237 else
238 {
239 Log5(("EFLAGS: Clobbering %#x: %#x -> %#x (op=%d bits=%u) - iemNativeClearPostponedEFlags\n", a_fEflClobbered,
240 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
241 pReNative->PostponedEfl.fEFlags = fEFlags;
242 return;
243 }
244 }
245
246 /* Do cleanup. */
247 Log5(("EFLAGS: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x - iemNativeClearPostponedEFlags\n",
248 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
249 pReNative->PostponedEfl.fEFlags = 0;
250 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
251 pReNative->PostponedEfl.cOpBits = 0;
252 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
253 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
254 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
255 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
256 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
257# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
258 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
259 pReNative->PostponedEfl.cEmits = 0;
260# endif
261 }
262}
263
264#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
265
266
267template<bool const a_fDoOp>
268DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
269 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
270{
271#ifdef RT_ARCH_AMD64
272 /* Do TEST idxRegResult, idxRegResult to set flags. */
273 if RT_CONSTEXPR_IF(a_fDoOp)
274 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
275
276 /*
277 * Collect the EFLAGS status bits.
278 * We know that the overflow bit will always be cleared, so LAHF can be used.
279 */
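/* LAHF copies SF, ZF, AF, PF and CF (the low byte of EFLAGS) into AH but not OF,
   which is why any stale OF in idxRegEfl is cleared separately with BTR at the
   end; the logical operations handled here always produce OF=0. */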
280 if (idxRegTmp == X86_GREG_xAX)
281 {
282 /* lahf ; AH = EFLAGS */
283 pCodeBuf[off++] = 0x9f;
284 if (idxRegEfl <= X86_GREG_xBX)
285 {
286 /* mov [CDB]L, AH */
287 pCodeBuf[off++] = 0x88;
288 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
289 }
290 else
291 {
292 /* mov AL, AH */
293 pCodeBuf[off++] = 0x88;
294 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
295 /* mov xxL, AL */
296 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
297 pCodeBuf[off++] = 0x88;
298 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
299 }
300 }
301 else if (idxRegEfl != X86_GREG_xAX)
302 {
303# if 1 /* This is 1 or 4 bytes larger, but avoids the stack. */
304 /* xchg rax, tmp */
305 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
306 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
307
308 /* lahf ; AH = EFLAGS */
309 pCodeBuf[off++] = 0x9f;
310 if (idxRegEfl <= X86_GREG_xBX)
311 {
312 /* mov [CDB]L, AH */
313 pCodeBuf[off++] = 0x88;
314 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
315 }
316 else
317 {
318 /* mov AL, AH */
319 pCodeBuf[off++] = 0x88;
320 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
321 /* mov xxL, AL */
322 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
323 pCodeBuf[off++] = 0x88;
324 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
325 }
326
327 /* xchg rax, tmp */
328 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
329 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
330
331# else
332 /* pushf */
333 pCodeBuf[off++] = 0x9c;
334 /* pop tmp */
335 if (idxRegTmp >= 8)
336 pCodeBuf[off++] = X86_OP_REX_B;
337 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
338 /* mov byte(efl), byte(tmp) */
339 if (idxRegEfl >= 4 || idxRegTmp >= 4)
340 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
341 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
342 pCodeBuf[off++] = 0x88;
343 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
344# endif
345 }
346 else
347 {
348 /* xchg al, ah */
349 pCodeBuf[off++] = 0x86;
350 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
351 /* lahf ; AH = EFLAGS */
352 pCodeBuf[off++] = 0x9f;
353 /* xchg al, ah */
354 pCodeBuf[off++] = 0x86;
355 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
356 }
357 /* BTR idxEfl, 11; Clear OF */
358 if (idxRegEfl >= 8)
359 pCodeBuf[off++] = X86_OP_REX_B;
360 pCodeBuf[off++] = 0xf;
361 pCodeBuf[off++] = 0xba;
362 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
363 pCodeBuf[off++] = X86_EFL_OF_BIT;
364
365#elif defined(RT_ARCH_ARM64)
366 /*
367 * Calculate flags.
368 */
369 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
370 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
371 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
372
373 /* N,Z -> SF,ZF */
374 if (cOpBits < 32)
375 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
376 else if RT_CONSTEXPR_IF(a_fDoOp)
377 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
378 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
379 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
380 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
381 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
382
383 /* Calculate 8-bit parity of the result. */
384 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
385 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
386 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
387 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
388 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
389 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
390 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
391 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
392 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
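/* The three EOR+LSR steps fold the parity of the result's low byte into bit 0,
   roughly tmp = res ^ (res >> 4); tmp ^= tmp >> 2; tmp ^= tmp >> 1; the EOR with
   immediate mask 1 then inverts bit 0 so it is set for an even number of one-bits
   (the x86 PF convention), and BFI copies it into EFLAGS.PF. */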
393
394#else
395# error "port me"
396#endif
397 return off;
398}
399
400#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
401
402template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
403static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
404 uint32_t bmExtraTlbMissRegs = 0)
405{
406# ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
407 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
408 pReNative->PostponedEfl.cEmits);
409# endif
410
411 /*
412 * In the TB exit code path we cannot do regular register allocation. Nor
413 * can we when we're in the TLB miss code, unless we're skipping the TLB
414 * lookup. Since the latter isn't an important use case and should get along
415 * fine on just volatile registers, we do not need to do anything special
416 * for it.
417 *
418 * So, we do our own register allocating here. Any register goes in the TB
419 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
420 * In the TLB miss we can use any volatile register and temporary registers
421 * allocated in the TLB state.
422 *
423 * Note! On x86 we prefer using RAX as the first TMP register, so we can
424 * make use of LAHF which is typically faster than PUSHF/POP. This
425 * is why the idxRegTmp allocation is first when there is no EFLAG
426 * shadow, since RAX is represented by bit 0 in the mask.
427 */
428 uint32_t bmAvailableRegs;
429 if RT_CONSTEXPR_IF(!a_fTlbMiss)
430 {
431 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
432 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
433 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
434 else
435 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
436 }
437 else
438 {
439 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
440 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
441 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
442 & IEMNATIVE_HST_GREG_MASK;
443 }
444
445 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
446 need to weed out volatile registers here, as they will no longer be valid. */
447 uint8_t idxRegTmp;
448 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
449 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
450 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
451 {
452 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
453 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
454 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
455 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
456# ifdef VBOX_STRICT
457 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
458# endif
459
460 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
461 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
462 }
463 else
464 {
465 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
466 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
467
468 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
469 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
470 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
471 }
472 Assert(bmAvailableRegs != 0);
473
474 /*
475 * Do the actual EFLAGS calculation.
476 */
477 switch (pReNative->PostponedEfl.enmOp)
478 {
479 case kIemNativePostponedEflOp_Logical:
480 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
481 off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
482 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
483 break;
484
485 default:
486 AssertFailedBreak();
487 }
488
489 /*
490 * Store EFLAGS.
491 */
492# ifdef VBOX_STRICT
493 /* check that X86_EFL_1 is set. */
494 uint32_t offFixup1;
495 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
496 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
497 iemNativeFixupFixedJump(pReNative, offFixup1, off);
498 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
499 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK, idxRegTmp);
500 uint32_t const offFixup2 = off;
501 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
502 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
503 iemNativeFixupFixedJump(pReNative, offFixup2, off);
504# endif
505 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
507
508# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
509 pReNative->PostponedEfl.cEmits++;
510# endif
511 return off;
512}
513
514
515
516template<uint32_t const a_bmInputRegs>
517DECL_FORCE_INLINE_THROW(uint32_t)
518iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
519{
520 if (pReNative->PostponedEfl.fEFlags)
521 {
522 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
523 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
524 }
525 return off;
526}
527
528
529template<uint32_t const a_bmInputRegs>
530DECL_FORCE_INLINE_THROW(uint32_t)
531iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
532{
533 if (pReNative->PostponedEfl.fEFlags)
534 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
535 return off;
536}
537
538
539template<uint32_t const a_bmInputRegs>
540DECL_FORCE_INLINE_THROW(uint32_t)
541iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
542 uint32_t bmTmpRegs)
543{
544 if (pReNative->PostponedEfl.fEFlags)
545 {
546 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
547 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
548 pTlbState->getRegsNotToSave() | bmTmpRegs);
549 }
550 return off;
551}
552
553#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
554
555
556/**
557 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
558 *
559 * It takes liveness stuff into account.
560 */
561template<bool a_fNeedToSetFlags>
562DECL_INLINE_THROW(uint32_t)
563iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
564 uint8_t cOpBits, uint8_t idxRegResult)
565{
566 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
567 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
568 RT_NOREF(cOpBits, idxRegResult);
569
570#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
571 /*
572 * See if we can skip this wholesale.
573 */
574 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
575 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
576 & IEMLIVENESSBIT_STATUS_EFL_MASK;
577# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
578 uint64_t fEflPostponing;
579# endif
580 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
581 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
582 {
583 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
584 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
585# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
586 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
587# endif
588 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS));
589 return off;
590 }
591# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
592 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
593 | fEflClobbered)
594 == IEMLIVENESSBIT_STATUS_EFL_MASK
595 && idxRegResult != UINT8_MAX)
596 {
597 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
598 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
599 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
600 pReNative->PostponedEfl.cOpBits = cOpBits;
601 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpExPreferNonVolatile(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK);
602 /** @todo it would normally be possible to use idxRegResult, iff it is
603 * already a non-volatile register and we can be sure the caller
604 * doesn't modify it. That'll save a register move and allocation. */
605 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
606 Log5(("EFLAGS: Postponing %#x op=%u bits=%u reg1=%u - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS,
607 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
608 }
609# endif
610 else
611#endif
612 {
613 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
614 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
615#ifdef RT_ARCH_AMD64
616 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
617#elif defined(RT_ARCH_ARM64)
618 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
619#else
620# error "port me"
621#endif
622 off = iemNativeEmitPostponedEFlagsCalcLogical<a_fNeedToSetFlags>(pCodeBuf, off, cOpBits, idxRegResult,
623 idxRegEfl, idxRegTmp);
624 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
625
626 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
627 iemNativeRegFreeTmp(pReNative, idxRegTmp);
628 }
629
630#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
631 if (pReNative->fSkippingEFlags)
632 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForLogical)\n", pReNative->fSkippingEFlags));
633 pReNative->fSkippingEFlags = 0;
634# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
635 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
636# endif
637#endif
638 return off;
639}
640
641
642/**
643 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
644 *
645 * It takes liveness stuff into account.
646 */
647DECL_FORCE_INLINE_THROW(uint32_t)
648iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
649#ifndef RT_ARCH_AMD64
650 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
651 , bool fInvertCarry, uint64_t uImmSrc
652#endif
653 )
654{
655 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
656 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
657
658#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
659 /*
660 * See if we can skip this wholesale.
661 */
662 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
663 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
664 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
665 {
666 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
667 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
668 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForArithmetic\n", X86_EFL_STATUS_BITS));
669# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
670 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
671# endif
672 }
673 else
674#endif
675 {
676#ifdef RT_ARCH_AMD64
677 /*
678 * Collect flags and merge them with eflags.
679 */
680 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
681 /* pushf - do this before any reg allocations as they may emit instructions too. */
682 pCodeBuf[off++] = 0x9c;
683
684 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
685 : iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
686 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
687 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
688 /* pop tmp */
689 if (idxTmpReg >= 8)
690 pCodeBuf[off++] = X86_OP_REX_B;
691 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
692 /* Isolate the flags we want. */
693 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
694 /* Clear the status bits in EFLs. */
695 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
696 /* OR in the flags we collected. */
697 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
698 if (idxRegEflIn != idxRegEfl)
699 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
700 iemNativeRegFreeTmp(pReNative, idxTmpReg);
701
702#elif defined(RT_ARCH_ARM64)
703 /*
704 * Calculate flags.
705 */
706 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
707 : iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
708 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
709 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
710 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
711
712 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
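/* AArch64 uses the inverted borrow convention for subtraction: PSTATE.C is set when
   no borrow occurred, whereas x86 sets CF when a borrow did occur, hence the CFINV
   for the SUB/SBB style operations (fInvertCarry). */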
713 if (fInvertCarry)
714 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
715 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
716
717 if (cOpBits >= 32)
718 {
719 /* V -> OF */
720 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
721 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
722
723 /* C -> CF */
724 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
725 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
726 }
727
728 /* N,Z -> SF,ZF */
729 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
730 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
731
732 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
733 if (cOpBits < 32)
734 {
735 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
736 AssertCompile(X86_EFL_CF_BIT == 0);
737 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
738
739 /* The overflow flag is more work as we have to compare the sign bits for
740 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
741
742 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
743 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
744
745 It is a bit simpler when the right (source) side is constant:
746 adc: S D R -> OF sbb: S D R -> OF
747 0 0 0 -> 0 \ 0 0 0 -> 0 \
748 0 0 1 -> 1 \ 0 0 1 -> 0 \
749 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
750 0 1 1 -> 0 / 0 1 1 -> 0 /
751 1 0 0 -> 0 \ 1 0 0 -> 0 \
752 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
753 1 1 0 -> 1 / 1 1 0 -> 0 /
754 1 1 1 -> 0 / 1 1 1 -> 0 / */
755 if (idxRegSrc != UINT8_MAX)
756 {
757 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
758 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
759 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
760 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
761 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
762 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
763 }
764 else if (uImmSrc & RT_BIT_32(cOpBits - 1))
765 {
766 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
767 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
768 else
769 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
770 }
771 else
772 {
773 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
774 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
775 else
776 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
777 }
778 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
779 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
780 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
781 }
782
783 /* Calculate 8-bit parity of the result. */
784 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
785 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
786 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
787 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
788 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
789 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
790 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
791 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
792 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
793
794 /* Calculate auxiliary carry/borrow. This is related to 8-bit BCD.
795 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
796 S D R
797 0 0 0 -> 0; \
798 0 0 1 -> 1; \ regular
799 0 1 0 -> 1; / xor R, D
800 0 1 1 -> 0; /
801 1 0 0 -> 1; \
802 1 0 1 -> 0; \ invert one of the two
803 1 1 0 -> 0; / xor not(R), D
804 1 1 1 -> 1; /
805 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
806 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
807 */
808
809 if (idxRegSrc != UINT8_MAX)
810 {
811 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
812 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
813 }
814 else if (uImmSrc & X86_EFL_AF)
815 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
816 else
817 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
818 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
819 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
820
821 if (idxRegEflIn != idxRegEfl)
822 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
823 iemNativeRegFreeTmp(pReNative, idxTmpReg);
824
825#else
826# error "port me"
827#endif
828 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
829
830#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
831 if (pReNative->fSkippingEFlags)
832 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForArithmetic)\n", pReNative->fSkippingEFlags));
833 pReNative->fSkippingEFlags = 0;
834# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
835 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
836# endif
837#endif
838 }
839 return off;
840
841}
842
843
844
845/*********************************************************************************************************************************
846* Bitwise Logical Operations *
847*********************************************************************************************************************************/
848
849/**
850 * The AND instruction will clear OF, CF and AF (latter is undefined) and
851 * set the other flags according to the result.
852 */
853template<uint8_t const a_cOpBits>
854DECL_INLINE_THROW(uint32_t)
855iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
856{
857 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
858 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
859#ifdef RT_ARCH_AMD64
860 /* On AMD64 we just use the correctly sized AND instruction and harvest the EFLAGS. */
861 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
862 0x22, 0x23, a_cOpBits, idxRegDst, idxRegSrc);
863 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
864 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
865
866 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
867
868#elif defined(RT_ARCH_ARM64)
869 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
870 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
871 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
872 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
873 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
874
875 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
876#else
877# error "Port me"
878#endif
879 iemNativeVarRegisterRelease(pReNative, idxVarDst);
880 return off;
881}
882
883
884/**
885 * The AND instruction with immediate value as right operand.
886 */
887template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
888DECL_INLINE_THROW(uint32_t)
889iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
890{
891 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
892#ifdef RT_ARCH_AMD64
893 /* On AMD64 we just use the correctly sized AND instruction and harvest the EFLAGS. */
894 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
895 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 4, idxRegDst, uImmOp);
896 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
897
898 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
899
900#elif defined(RT_ARCH_ARM64)
901 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
902 course the immediate variant when possible to save a register load. */
903 uint32_t uImmSizeLen, uImmRotations;
904 if ( a_cOpBits > 32
905 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
906 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
907 {
908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
909 if (a_cOpBits >= 32)
910 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
911 else
912 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
913 }
914 else
915 {
916 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
917 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
918 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
919 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
920 else
921 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
922 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
923 }
924 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
925
926 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
927
928#else
929# error "Port me"
930#endif
931 iemNativeVarRegisterRelease(pReNative, idxVarDst);
932 return off;
933}
934
935
936/**
937 * The TEST instruction will clear OF, CF and AF (latter is undefined) and
938 * set the other flags according to the result.
939 */
940template<uint8_t const a_cOpBits>
941DECL_INLINE_THROW(uint32_t)
942iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
943{
944 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
945 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
946 : iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
947#ifdef RT_ARCH_AMD64
948 /* On AMD64 we just use the correctly sized TEST instruction and harvest the EFLAGS. */
949 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
950 0x84, 0x85, a_cOpBits, idxRegSrc, idxRegDst);
951 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
952
953#elif defined(RT_ARCH_ARM64)
954 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
955 need to keep the result in order to calculate the flags. */
956 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
957 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
958 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
959 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
960 else
961 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
962 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
963
964#else
965# error "Port me"
966#endif
967 if (idxVarSrc != idxVarDst)
968 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
969 iemNativeVarRegisterRelease(pReNative, idxVarDst);
970
971#ifdef RT_ARCH_AMD64
972 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
973#else
974 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
975 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
976 else
977 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
978 iemNativeRegFreeTmp(pReNative, idxRegResult);
979#endif
980 return off;
981}
982
983
984/**
985 * The TEST instruction with immediate value as right operand.
986 */
987template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
988DECL_INLINE_THROW(uint32_t)
989iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
990{
991 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
992#ifdef RT_ARCH_AMD64
993 /* On AMD64 we just use the correctly sized TEST instruction and harvest the EFLAGS. */
994 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
995 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
996 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
997 iemNativeVarRegisterRelease(pReNative, idxVarDst);
998
999 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
1000
1001#elif defined(RT_ARCH_ARM64)
1002 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1003 course the immediate variant when possible to save a register load.
1004 We also need to keep the result in order to calculate the flags. */
1005 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1006 uint32_t uImmSizeLen, uImmRotations;
1007 if ( a_cOpBits > 32
1008 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1009 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1010 {
1011 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1012 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1013 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1014 else
1015 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1016 }
1017 else
1018 {
1019 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1020 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1021 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1022 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1023 else
1024 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1025 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1026 }
1027 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1028 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1029
1030 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1031
1032 iemNativeRegFreeTmp(pReNative, idxRegResult);
1033
1034#else
1035# error "Port me"
1036#endif
1037 return off;
1038}
1039
1040
1041/**
1042 * The OR instruction will clear OF, CF and AF (latter is undefined) and
1043 * set the other flags according to the result.
1044 */
1045template<uint8_t const a_cOpBits>
1046DECL_INLINE_THROW(uint32_t)
1047iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1048{
1049 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1050 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1051#ifdef RT_ARCH_AMD64
1052 /* On AMD64 we just use the correctly sized OR instruction and harvest the EFLAGS. */
1053 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1054 0x0a, 0x0b, a_cOpBits, idxRegDst, idxRegSrc);
1055 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1056 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1057
1058 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1059
1060#elif defined(RT_ARCH_ARM64)
1061 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1062 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1063 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1064 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1065 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1066
1067 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1068
1069#else
1070# error "Port me"
1071#endif
1072 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1073 return off;
1074}
1075
1076
1077/**
1078 * The OR instruction with immediate value as right operand.
1079 */
1080template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1081DECL_INLINE_THROW(uint32_t)
1082iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1083{
1084 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1085#ifdef RT_ARCH_AMD64
1086 /* On AMD64 we just use the correctly sized OR instruction and harvest the EFLAGS. */
1087 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1088 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 1, idxRegDst, uImmOp);
1089 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1090
1091 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1092
1093#elif defined(RT_ARCH_ARM64)
1094 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1095 course the immediate variant when possible to save a register load. */
1096 uint32_t uImmSizeLen, uImmRotations;
1097 if ( a_cOpBits > 32
1098 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1099 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1100 {
1101 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1102 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1103 }
1104 else
1105 {
1106 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1107 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1108 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1109 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1110 }
1111 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1112
1113 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1114
1115#else
1116# error "Port me"
1117#endif
1118 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1119 return off;
1120}
1121
1122
1123/**
1124 * The XOR instruction will clear OF, CF and AF (latter is undefined) and
1125 * set the other flags according to the result.
1126 */
1127template<uint8_t const a_cOpBits>
1128DECL_INLINE_THROW(uint32_t)
1129iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1130{
1131 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1132 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1133#ifdef RT_ARCH_AMD64
1134 /* On AMD64 we just use the correctly sized XOR instruction and harvest the EFLAGS. */
1135 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1136 0x32, 0x33, a_cOpBits, idxRegDst, idxRegSrc);
1137 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1138 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1139
1140 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1141
1142#elif defined(RT_ARCH_ARM64)
1143 /* On ARM64 we use 32-bit EOR (XOR) for the 8-bit and 16-bit ones. */
1144 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1145 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1146 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1147 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1148
1149 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1150
1151#else
1152# error "Port me"
1153#endif
1154 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1155 return off;
1156}
1157
1158
1159/**
1160 * The XOR instruction with immediate value as right operand.
1161 */
1162template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1163DECL_INLINE_THROW(uint32_t)
1164iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1165{
1166 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1167#ifdef RT_ARCH_AMD64
1168 /* On AMD64 we just use the correctly sized XOR instruction and harvest the EFLAGS. */
1169 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1170 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 6, idxRegDst, uImmOp);
1171 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1172
1173 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1174
1175#elif defined(RT_ARCH_ARM64)
1176 /* On ARM64 we use 32-bit EOR (XOR) for the 8-bit and 16-bit ones, and of
1177 course the immediate variant when possible to save a register load. */
1178 uint32_t uImmSizeLen, uImmRotations;
1179 if ( a_cOpBits > 32
1180 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1181 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1182 {
1183 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1184 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1185 }
1186 else
1187 {
1188 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1189 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1190 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1191 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1192 }
1193 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1194
1195 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1196
1197#else
1198# error "Port me"
1199#endif
1200 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1201 return off;
1202}
1203
1204
1205
1206/*********************************************************************************************************************************
1207* ADD, ADC, SUB, SBB, CMP *
1208*********************************************************************************************************************************/
1209
1210/**
1211 * The ADD instruction will set all status flags.
1212 */
1213template<uint8_t const a_cOpBits>
1214DECL_INLINE_THROW(uint32_t)
1215iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1216{
1217 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1218 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1219
1220#ifdef RT_ARCH_AMD64
1221 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS.SF value. */
1222 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1223 0x02, 0x03, a_cOpBits, idxRegDst, idxRegSrc);
1224 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1225
1226 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1227 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1228
1229 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1230
1231#elif defined(RT_ARCH_ARM64)
1232 /* On ARM64 we'll need the two input operands as well as the result in order
1233 to calculate the right flags, even if we use ADDS and translate NZCV into
1234 OF, CF, ZF and SF. */
1235 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1236 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1237 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1238 {
1239 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1240 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1241 }
1242 else
1243 {
1244 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
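/* Placing the 8/16-bit operands in the top of a 32-bit register means the carry-out
   of the narrow addition becomes the 32-bit carry and the operand sign bits line up
   with bit 31, so ADDS yields NZCV values matching the narrow operation; the results
   are shifted back down afterwards. */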
1245 uint32_t const cShift = 32 - a_cOpBits;
1246 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1247 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1248 true /*fSetFlags*/, cShift);
1249 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1250 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1251 }
1252 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1253
1254 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1255 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1256
1257 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1258 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1259 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1260
1261#else
1262# error "port me"
1263#endif
1264 return off;
1265}
1266
1267
1268/**
1269 * The ADD instruction with immediate value as right operand.
1270 */
1271template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1272DECL_INLINE_THROW(uint32_t)
1273iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1274{
1275 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1276
1277#ifdef RT_ARCH_AMD64
1278 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS.SF value. */
1279 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1280 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
1281 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1282
1283 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1284
1285 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1286
1287#elif defined(RT_ARCH_ARM64)
1288 /* On ARM64 we'll need the two input operands as well as the result in order
1289 to calculate the right flags, even if we use ADDS and translate NZCV into
1290 OF, CF, ZF and SF. */
1291 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1292 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1293 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1294 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1295 {
1296 if (uImmOp <= 0xfffU)
1297 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1298 true /*fSetFlags*/);
1299 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1300 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1301 true /*fSetFlags*/, true /*fShift12*/);
1302 else
1303 {
1304 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1305 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1306 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1307 true /*fSetFlags*/);
1308 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1309 }
1310 }
1311 else
1312 {
1313 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1314 uint32_t const cShift = 32 - a_cOpBits;
1315 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1316 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1317 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1318 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1319 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1320 }
1321 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1322
1323 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1324 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1325
1326 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1327 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1328
1329#else
1330# error "port me"
1331#endif
1332 return off;
1333}
1334
1335
1336/**
1337 * The ADC instruction takes CF as input and will set all status flags.
1338 */
1339template<uint8_t const a_cOpBits>
1340DECL_INLINE_THROW(uint32_t)
1341iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1342{
1343 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1344 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1345 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1346
1347#ifdef RT_ARCH_AMD64
1348 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1349 with matching size to get the correct flags. */
1350 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1351
1352 /* Use the BT instruction to set CF according to idxRegEfl. */
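    /* (BT r/m32, imm8 is encoded as 0F BA /4; the immediate byte below is the bit
        to test, X86_EFL_CF_BIT, so the host CF ends up holding the guest CF.) */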
1353 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1354 pCodeBuf[off++] = X86_EFL_CF_BIT;
1355
1356 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, a_cOpBits, idxRegDst, idxRegSrc);
1357 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1358
1359 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1360 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1361
1362 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1363
1364#elif defined(RT_ARCH_ARM64)
1365 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1366 then ADCS for the calculation. We need all inputs and result for the two
1367 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1368 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1369 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1370
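    /* RMIF rotates idxRegEfl right by 63 (i.e. left by one) and, with only mask
       bit 1 set, copies the bit landing at position 1, i.e. EFLAGS.CF, into PSTATE.C. */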
1371 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1372 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1373 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1374 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1375 else
1376 {
1377 /* Since we're also adding in the carry flag here, shifting operands up
1378 doesn't work. So, we have to calculate carry & overflow manually. */
1379 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1380 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1381 }
1382 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1383
1384 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1385 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1386
1387 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1388 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1389 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1390 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1391 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1392
1393#else
1394# error "port me"
1395#endif
1396 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1397 return off;
1398}
1399
1400
1401/**
1402 * The ADC instruction with immediate value as right operand.
1403 */
1404template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1405DECL_INLINE_THROW(uint32_t)
1406iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1407{
1408 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1409 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1410
1411#ifdef RT_ARCH_AMD64
1412 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1413 with matching size to get the correct flags. */
1414 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1415
1416 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1417 pCodeBuf[off++] = X86_EFL_CF_BIT;
1418
1419 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 2, idxRegDst, uImmOp);
1420 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1421
1422 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1423
1424 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1425
1426#elif defined(RT_ARCH_ARM64)
1427 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1428 and then ADCS for the calculation. We need all inputs and result for
1429 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1430 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1431 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1432 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1433
1434 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1435 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1436 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1437 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1438 else
1439 {
1440 /* Since we're also adding in the carry flag here, shifting operands up
1441 doesn't work. So, we have to calculate carry & overflow manually. */
1442 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1443 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1444 }
1445 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1446
1447 iemNativeRegFreeTmp(pReNative, idxRegImm);
1448
1449 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1450 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1451
1452 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1453 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1454 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1455 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1456
1457#else
1458# error "port me"
1459#endif
1460 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1461 return off;
1462}
1463
1464
1465/**
1466 * The SUB instruction will set all status flags.
1467 */
1468template<uint8_t const a_cOpBits>
1469DECL_INLINE_THROW(uint32_t)
1470iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1471{
1472 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1473 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1474
1475#ifdef RT_ARCH_AMD64
1476 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1477 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1478 0x2a, 0x2b, a_cOpBits, idxRegDst, idxRegSrc);
1479 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1480
1481 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1482 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1483
1484 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1485
1486#elif defined(RT_ARCH_ARM64)
1487 /* On ARM64 we'll need the two input operands as well as the result in order
1488 to calculate the right flags, even if we use SUBS and translate NZCV into
1489 OF, CF, ZF and SF. */
1490 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1491 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1492 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1493 {
1494 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1495 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1496 }
1497 else
1498 {
1499 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1500 uint32_t const cShift = 32 - a_cOpBits;
1501 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1502 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1503 true /*fSetFlags*/, cShift);
1504 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1505 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1506 }
1507 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1508
1509 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1510 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1511
1512 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1513 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1514 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1515
1516#else
1517# error "port me"
1518#endif
1519 return off;
1520}
1521
1522
1523/**
1524 * The SUB instruction with immediate value as right operand.
1525 */
1526template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1527DECL_INLINE_THROW(uint32_t)
1528iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1529{
1530 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1531
1532#ifdef RT_ARCH_AMD64
1533 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1534 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1535 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 5, idxRegDst, uImmOp);
1536 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1537
1538 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1539
1540 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1541
1542#elif defined(RT_ARCH_ARM64)
1543 /* On ARM64 we'll need the two input operands as well as the result in order
1544 to calculate the right flags, even if we use SUBS and translate NZCV into
1545 OF, CF, ZF and SF. */
1546 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1547 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1548 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1549 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1550 {
1551 if (uImmOp <= 0xfffU)
1552 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1553 true /*fSetFlags*/);
1554 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1555 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1556 true /*fSetFlags*/, true /*fShift12*/);
1557 else
1558 {
1559 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1560 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1561 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1562 true /*fSetFlags*/);
1563 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1564 }
1565 }
1566 else
1567 {
1568 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1569 uint32_t const cShift = 32 - a_cOpBits;
1570 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1571 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1572 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1573 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1574 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1575 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1576 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1577 }
1578 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1579
1580 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1581 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1582
1583 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1584 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1585
1586#else
1587# error "port me"
1588#endif
1589 return off;
1590}
1591
1592
1593/**
1594 * The CMP instruction will set all status flags, but modifies no registers.
1595 */
1596template<uint8_t const a_cOpBits>
1597DECL_INLINE_THROW(uint32_t)
1598iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1599{
1600 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1601 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1602
1603#ifdef RT_ARCH_AMD64
1604 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1605 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1606 0x3a, 0x3b, a_cOpBits, idxRegDst, idxRegSrc);
1607 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1608
1609 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1610 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1611
1612 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1613
1614#elif defined(RT_ARCH_ARM64)
1615 /* On ARM64 we'll need the actual result as well as both input operands in order
1616 to calculate the right flags, even if we use SUBS and translate NZCV into
1617 OF, CF, ZF and SF. */
1618 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1619 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1620 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1621 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1622 else
1623 {
1624 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1625 uint32_t const cShift = 32 - a_cOpBits;
1626 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1627 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1628 true /*fSetFlags*/, cShift);
1629 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1630 }
1631 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1632
1633 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1634 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1635
1636 iemNativeRegFreeTmp(pReNative, idxRegResult);
1637 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1638 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1639
1640#else
1641# error "port me"
1642#endif
1643 return off;
1644}
1645
1646
1647/**
1648 * The CMP instruction with immediate value as right operand.
1649 */
1650template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1651DECL_INLINE_THROW(uint32_t)
1652iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1653{
1654 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1655
1656#ifdef RT_ARCH_AMD64
1657 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1658 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1659 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 7, idxRegDst, uImmOp);
1660 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1661
1662 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1663
1664 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1665
1666#elif defined(RT_ARCH_ARM64)
1667 /* On ARM64 we'll need the actual result as well as both input operands in order
1668 to calculate the right flags, even if we use SUBS and translate NZCV into
1669 OF, CF, ZF and SF. */
1670 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1671 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1672 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1673 {
1674 if (uImmOp <= 0xfffU)
1675 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1676 true /*fSetFlags*/);
1677 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1678 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1679 true /*fSetFlags*/, true /*fShift12*/);
1680 else
1681 {
1682 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1683 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1684 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1685 true /*fSetFlags*/);
1686 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1687 }
1688 }
1689 else
1690 {
1691 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1692 uint32_t const cShift = 32 - a_cOpBits;
1693 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1694 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1695 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1696 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1697 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1698 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1699 }
1700 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1701
1702 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1703 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1704
1705 iemNativeRegFreeTmp(pReNative, idxRegResult);
1706 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1707
1708#else
1709# error "port me"
1710#endif
1711 return off;
1712}
1713
1714
1715/**
1716 * The SBB instruction takes CF as input and will set all status flags.
1717 */
1718template<uint8_t const a_cOpBits>
1719DECL_INLINE_THROW(uint32_t)
1720iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1721{
1722 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1723 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off);
1724 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1725
1726#ifdef RT_ARCH_AMD64
1727 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1728 with matching size to get the correct flags. */
1729 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1730
1731 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1732 pCodeBuf[off++] = X86_EFL_CF_BIT;
1733
1734 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, a_cOpBits, idxRegDst, idxRegSrc);
1735 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1736
1737 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1738 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1739
1740 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1741
1742#elif defined(RT_ARCH_ARM64)
1743 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1744 idxRegEfl and then SBCS for the calculation. We need all inputs and
1745 result for the two flags (AF,PF) that can't be directly derived from
1746 PSTATE.NZCV. */
1747 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1748 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1749
1750 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
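    /* CFINV inverts PSTATE.C: A64 subtraction treats C=1 as 'no borrow', whereas
       x86 uses CF=1 for borrow, so the carry just loaded from EFLAGS must be flipped. */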
1751 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1752 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1753 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1754 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1755 else
1756 {
1757 /* Since we're also subtracting the carry (borrow) flag here, shifting operands up
1758 doesn't work. So, we have to calculate carry & overflow manually. */
1759 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1760 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1761 }
1762 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1763
1764 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1765 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1766
1767 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1768 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1769 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1770 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1771 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1772
1773#else
1774# error "port me"
1775#endif
1776 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1777 return off;
1778}
1779
1780
1781/**
1782 * The SBB instruction with immediate value as right operand.
1783 */
1784template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1785DECL_INLINE_THROW(uint32_t)
1786iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1787{
1788 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
1789 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
1790
1791#ifdef RT_ARCH_AMD64
1792 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1793 with matching size to get the correct flags. */
1794 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1795
1796 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1797 pCodeBuf[off++] = X86_EFL_CF_BIT;
1798
1799 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 3, idxRegDst, uImmOp);
1800 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1801
1802 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1803
1804 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1805
1806#elif defined(RT_ARCH_ARM64)
1807 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1808 idxRegEfl and then SBCS for the calculation. We need all inputs and
1809 result for the two flags (AF,PF) that can't be directly derived from
1810 PSTATE.NZCV. */
1811 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1812 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1813 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1814
1815 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1816 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1817 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1818 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1819 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1820 else
1821 {
1822 /* Since we're also subtracting the carry (borrow) flag here, shifting operands up
1823 doesn't work. So, we have to calculate carry & overflow manually. */
1824 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1825 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1826 }
1827 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1828
1829 iemNativeRegFreeTmp(pReNative, idxRegImm);
1830
1831 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1832 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1833
1834 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1835 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1836 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1837 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1838
1839#else
1840# error "port me"
1841#endif
1842 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1843 return off;
1844}
1845
1846
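/* The following two-register emitters (imul, popcnt, tzcnt, lzcnt) are stubs for
   now: they assert and emit a debug break (0x666) instead of native code. */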
1847template<uint8_t const a_cOpBits>
1848DECL_INLINE_THROW(uint32_t)
1849iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1850{
1851 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1852 AssertFailed();
1853 return iemNativeEmitBrk(pReNative, off, 0x666);
1854}
1855
1856
1857template<uint8_t const a_cOpBits>
1858DECL_INLINE_THROW(uint32_t)
1859iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1860{
1861 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1862 AssertFailed();
1863 return iemNativeEmitBrk(pReNative, off, 0x666);
1864}
1865
1866
1867template<uint8_t const a_cOpBits>
1868DECL_INLINE_THROW(uint32_t)
1869iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1870{
1871 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1872 AssertFailed();
1873 return iemNativeEmitBrk(pReNative, off, 0x666);
1874}
1875
1876
1877template<uint8_t const a_cOpBits>
1878DECL_INLINE_THROW(uint32_t)
1879iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1880{
1881 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1882 AssertFailed();
1883 return iemNativeEmitBrk(pReNative, off, 0x666);
1884}
1885
1886
1887
1888/*********************************************************************************************************************************
1889* Shifting and Rotating. *
1890*********************************************************************************************************************************/
1891
1892
1893typedef enum
1894{
1895 kIemNativeEmitEFlagsForShiftType_Left,
1896 kIemNativeEmitEFlagsForShiftType_Right,
1897 kIemNativeEmitEFlagsForShiftType_SignedRight
1898} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1899
1900/**
1901 * This is used by SHL, SHR and SAR emulation.
1902 *
1903 * It takes EFLAGS liveness into account (though the wholesale skipping path below is currently compiled out).
1904 */
1905DECL_INLINE_THROW(uint32_t)
1906iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1907 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1908 uint8_t idxRegTmp)
1909{
1910 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1911
1912 RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1913#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1914 /*
1915 * See if we can skip this wholesale.
1916 */
1917 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1918 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1919 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1920 {
1921 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1922 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1923# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1924 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1925# endif
1926 }
1927 else
1928#endif
1929 {
1930 /*
1931 * The differences between Intel and AMD flags for SHL are:
1932 * - Intel always clears AF while AMD always sets it.
1933 * - Intel sets OF for the first shift, while AMD for the last shift.
1934 *
1935 */
1936
1937#ifdef RT_ARCH_AMD64
1938 /*
1939 * We capture the flags and do the additional OF and AF calculations as needed.
1940 */
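    /* The caller is expected to have just emitted the host shift instruction, so
       the PUSHF below captures the flags that shift produced. */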
1941 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1942 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1943 * use LAHF here when host rax is free since OF is cleared. */
1944 /* pushf */
1945 pCodeBuf[off++] = 0x9c;
1946 /* pop tmp */
1947 if (idxRegTmp >= 8)
1948 pCodeBuf[off++] = X86_OP_REX_B;
1949 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
1950 /* Clear the status bits in EFLs. */
1951 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1952 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1953 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1954 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1955 else
1956 {
1957 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1958 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1959 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1960 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1961 /* OR in the flags we collected. */
1962 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1963
1964 /* Calculate OF */
1965 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1966 {
1967 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1968 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1969 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1970 RT_MAX(cOpBits, 16), 4, idxRegResult);
1971 pCodeBuf[off++] = cOpBits - 1;
1972 /* setc idxRegTmp */
1973 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1974 /* xor idxRegTmp, idxRegEfl */
1975 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1976 /* and idxRegTmp, 1 */
1977 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1978 /* shl idxRegTmp, X86_EFL_OF_BIT */
1979 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
1980 }
1981 else
1982 {
1983 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1984 if (cOpBits <= 32)
1985 {
1986 /* mov idxRegTmp, idxRegSrc */
1987 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
1988 /* shl idxRegTmp, 1 */
1989 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
1990 /* xor idxRegTmp, idxRegSrc */
1991 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
1992 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
1993 if (cOpBits >= X86_EFL_OF_BIT)
1994 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
1995 else
1996 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
1997 }
1998 else
1999 {
2000 /* Same as above, but with 64-bit GPRs. */
2001 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2002 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2003 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2004 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2005 }
2006 /* and idxRegTmp, X86_EFL_OF */
2007 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2008 }
2009 }
2010 /* Or in the collected flag(s) */
2011 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2012
2013#elif defined(RT_ARCH_ARM64)
2014 /*
2015 * Calculate flags.
2016 */
2017 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2018
2019 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2020 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2021 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2022
2023 /* N,Z -> SF,ZF */
2024 if (cOpBits < 32)
2025 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2026 else
2027 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2028 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2029 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
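        /* After the LSR by 30, bit 0 holds Z and bit 1 holds N; these match the
           adjacent ZF/SF positions in EFLAGS, so a single 2-bit BFI copies both. */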
2030 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2031 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
2032
2033 /* Calculate 8-bit parity of the result. */
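        /* Fold the low byte onto itself (XOR with itself shifted right by 4, 2 and 1)
           so bit 0 becomes the XOR of all eight result bits; since x86 PF means even
           parity, that bit is inverted before being inserted at X86_EFL_PF_BIT. */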
2034 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2035 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2036 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2037 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2038 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2039 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2040 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2041 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2042 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
2043
2044 /* Calculate carry - the last bit shifted out of the input value. */
2045 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2046 {
2047 /* CF = (idxRegSrc >> (cOpBits - idxRegCount))) & 1 */
2048 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2049 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2050 if (cOpBits < 32)
2051 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2052 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2053 }
2054 else
2055 {
2056 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2057 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2058 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2059 }
2060 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2061
2062 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2063 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2064 {
2065 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2066 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2067 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2068 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2069 }
2070 else
2071 {
2072 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2073 AssertCompile(X86_EFL_CF_BIT == 0);
2074 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2075 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2076 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2077
2078 /* AMD unconditionally sets AF. */
2079 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2080 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2081 }
2082#else
2083# error "port me"
2084#endif
2085 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2086
2087#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2088 if (pReNative->fSkippingEFlags)
2089 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForShift)\n", pReNative->fSkippingEFlags));
2090 pReNative->fSkippingEFlags = 0;
2091# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2092 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2093# endif
2094#endif
2095 }
2096 return off;
2097}
2098
2099
2100DECL_INLINE_THROW(uint32_t)
2101iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2102 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2103{
2104 /* Note! Since we're doing some branching here, we need to allocate all
2105 registers we need before the jump or we may end up with invalid
2106 register state if the branch is taken. */
2107 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2108 uint8_t const idxRegCount = iemNativeVarRegisterAcquireInited(pReNative, idxVarCount, &off); /* modified on arm64 */
2109 uint8_t const idxRegDst = iemNativeVarRegisterAcquireInited(pReNative, idxVarDst, &off);
2110 uint8_t const idxRegEfl = iemNativeVarRegisterAcquireInited(pReNative, idxVarEfl, &off);
2111
2112#ifdef RT_ARCH_AMD64
2113 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2114 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
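    /* (The D2h/D3h shift-group instructions only take the count in CL, hence the
        hard requirement that the count variable lives in ECX.) */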
2115
2116 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2117 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2118 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2119 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2120
2121 /* Check if it's NOP before we do anything. */
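    /* (x86 masks the shift count to 5 bits, or 6 bits for 64-bit operands; a
        resulting count of zero modifies neither the destination nor EFLAGS.) */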
2122 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2123 uint32_t const offFixup = off;
2124 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
2125
2126 if (idxRegDstIn != UINT8_MAX)
2127 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
2128 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2129
2130#elif defined(RT_ARCH_ARM64)
2131 /* We always need a copy of the input value here (unless the EFLAGS calculation can be skipped). */
2132 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2133 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2134
2135 /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
2136 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2137 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
2138 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2139 uint32_t const offFixup = off;
2140 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2141
2142 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2143 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2144 if (cOpBits < 32)
2145 {
2146 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2147 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2148 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2149 }
2150
2151#else
2152# error "port me"
2153#endif
2154
2155 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2156 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2157 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2158
2159 /* fixup the jump */
2160 iemNativeFixupFixedJump(pReNative, offFixup, off);
2161
2162#ifdef RT_ARCH_AMD64
2163 if (idxRegDstIn != UINT8_MAX)
2164#endif
2165 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2166 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2167 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2168 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2169 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2170 return off;
2171}
2172
2173
2174DECL_INLINE_THROW(uint32_t)
2175iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2176 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2177{
2178 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2179 AssertFailed();
2180 return iemNativeEmitBrk(pReNative, off, 0x666);
2181}
2182
2183
2184DECL_INLINE_THROW(uint32_t)
2185iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2186 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2187{
2188 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2189 AssertFailed();
2190 return iemNativeEmitBrk(pReNative, off, 0x666);
2191}
2192
2193
2194DECL_INLINE_THROW(uint32_t)
2195iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2196 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2197{
2198 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2199 AssertFailed();
2200 return iemNativeEmitBrk(pReNative, off, 0x666);
2201}
2202
2203
2204DECL_INLINE_THROW(uint32_t)
2205iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2206 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2207{
2208 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2209 AssertFailed();
2210 return iemNativeEmitBrk(pReNative, off, 0x666);
2211}
2212
2213
2214DECL_INLINE_THROW(uint32_t)
2215iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2216 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2217{
2218 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2219 AssertFailed();
2220 return iemNativeEmitBrk(pReNative, off, 0x666);
2221}
2222
2223
2224DECL_INLINE_THROW(uint32_t)
2225iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2226 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2227{
2228 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2229 AssertFailed();
2230 return iemNativeEmitBrk(pReNative, off, 0x666);
2231}
2232
2233
2234
2235/*********************************************************************************************************************************
2236* SIMD emitters. *
2237*********************************************************************************************************************************/
2238
2239/**
2240 * Common emitter for packed logical instructions (POR, PXOR, PAND and friends).
2241 */
2242#ifdef RT_ARCH_AMD64
2243# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2244 DECL_INLINE_THROW(uint32_t) \
2245 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2246 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2247 { \
2248 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2249 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2250 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2251 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2252 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2253 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2254 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2255 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2256 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2257 pCodeBuf[off++] = 0x0f; \
2258 pCodeBuf[off++] = (a_bOpcX86); \
2259 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2260 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2261 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2262 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2263 return off; \
2264 } \
2265 DECL_INLINE_THROW(uint32_t) \
2266 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2267 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2268 { \
2269 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2270 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2271 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2272 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2273 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2274 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2275 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2276 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2277 pCodeBuf[off++] = 0x0f; \
2278 pCodeBuf[off++] = (a_bOpcX86); \
2279 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2280 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2281 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2282 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2283 return off; \
2284 } \
2285 typedef int ignore_semicolon
2286#elif defined(RT_ARCH_ARM64)
2287# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2288 DECL_INLINE_THROW(uint32_t) \
2289 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2290 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2291 { \
2292 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2293 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2294 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2295 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2296 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2297 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2298 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2299 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2300 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2301 return off; \
2302 } \
2303 DECL_INLINE_THROW(uint32_t) \
2304 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2305 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2306 { \
2307 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2308 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2309 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2310 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2311 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2312 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2313 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2314 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2315 return off; \
2316 } \
2317 typedef int ignore_semicolon
2318#else
2319# error "Port me"
2320#endif
2321
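/* Each use of IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128 below defines both a
   register-register (_rr_u128) and a register-variable (_rv_u128) emitter. */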
2322/* POR, ORPS, ORPD. */
2323IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2324/* PXOR, XORPS, XORPD. */
2325IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2326/* PAND, ANDPS, ANDPD. */
2327IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
2328
2329
2330/**
2331 * Common emitter for the shift right with immediate instructions.
2332 */
2333#ifdef RT_ARCH_AMD64
2334# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2335 DECL_INLINE_THROW(uint32_t) \
2336 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2337 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2338 { \
2339 if (bImm) \
2340 { \
2341 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2342 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2343 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2344 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2345 if (idxSimdRegDst >= 8) \
2346 pCodeBuf[off++] = X86_OP_REX_B; \
2347 pCodeBuf[off++] = 0x0f; \
2348 pCodeBuf[off++] = (a_bOpcX86); \
2349 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2350 pCodeBuf[off++] = bImm; \
2351 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2352 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2353 } \
2354 /* Immediate 0 is a nop. */ \
2355 return off; \
2356 } \
2357 typedef int ignore_semicolon
2358#elif defined(RT_ARCH_ARM64)
2359# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2360 DECL_INLINE_THROW(uint32_t) \
2361 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2362 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2363 { \
2364 if (bImm) \
2365 { \
2366 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2367 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2368 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2369 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2370 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2371 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2372 } \
2373 /* Immediate 0 is a nop. */ \
2374 return off; \
2375 } \
2376 typedef int ignore_semicolon
2377#else
2378# error "Port me"
2379#endif
2380
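/* PSRLW/PSRLD/PSRLQ: 66 0F 71/72/73 with ModR/M reg field 2 and an imm8 count. */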
2381IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2382IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2383IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
2384
2385
2386/**
2387 * Common emitter for the shift left with immediate instructions.
2388 */
2389#ifdef RT_ARCH_AMD64
2390# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2391 DECL_INLINE_THROW(uint32_t) \
2392 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2393 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2394 { \
2395 if (bImm) \
2396 { \
2397 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2398 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2399 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2400 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2401 if (idxSimdRegDst >= 8) \
2402 pCodeBuf[off++] = X86_OP_REX_B; \
2403 pCodeBuf[off++] = 0x0f; \
2404 pCodeBuf[off++] = (a_bOpcX86); \
2405 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2406 pCodeBuf[off++] = bImm; \
2407 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2408 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2409 } \
2410 /* Immediate 0 is a nop. */ \
2411 return off; \
2412 } \
2413 typedef int ignore_semicolon
2414#elif defined(RT_ARCH_ARM64)
2415# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2416 DECL_INLINE_THROW(uint32_t) \
2417 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2418 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2419 { \
2420 if (bImm) /* bImm == 0 is a nop */ \
2421 { \
2422 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2423 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2424 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2425 if (bImm < (a_cShiftMax)) \
2426 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2427 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2428 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2429 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2430 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2431 } \
2432 return off; \
2433 } \
2434 typedef int ignore_semicolon
2435#else
2436# error "Port me"
2437#endif
2438
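/* PSLLW/PSLLD/PSLLQ: same opcode bytes as the right shifts, but ModR/M reg field 6. */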
2439IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2440IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2441IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2442
2443
2444/**
2445 * Common emitter for packed arithmetic instructions.
2446 */
2447#ifdef RT_ARCH_AMD64
2448# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2449 DECL_INLINE_THROW(uint32_t) \
2450 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2451 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2452 { \
2453 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2454 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2455 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2456 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2457 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2458 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2459 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2460 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2461 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2462 pCodeBuf[off++] = 0x0f; \
2463 pCodeBuf[off++] = (a_bOpcX86); \
2464 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2465 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2466 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2467 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2468 return off; \
2469 } \
2470 DECL_INLINE_THROW(uint32_t) \
2471 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2472 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2473 { \
2474 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2475 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2476 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2477 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2478 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2479 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2480 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2481 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2482 pCodeBuf[off++] = 0x0f; \
2483 pCodeBuf[off++] = (a_bOpcX86); \
2484 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2485 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2486 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2487 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2488 return off; \
2489 } \
2490 typedef int ignore_semicolon
2491#elif defined(RT_ARCH_ARM64)
2492# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2493 DECL_INLINE_THROW(uint32_t) \
2494 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2495 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2496 { \
2497 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2498 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2499 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2500 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2501 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2502 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2503 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2504 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2505 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2506 return off; \
2507 } \
2508 DECL_INLINE_THROW(uint32_t) \
2509 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2510 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2511 { \
2512 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2513 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2514 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2515 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2516 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2517 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2518 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2519 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2520 return off; \
2521 } \
2522 typedef int ignore_semicolon
2523#else
2524# error "Port me"
2525#endif
2526
2527/*
2528 * PADDx.
2529 */
2530IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2531IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2532IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2533IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
2534
2535/*
2536 * PSUBx.
2537 */
2538IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2539IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2540IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2541IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2542
2543/*
2544 * PADDUSx.
2545 */
2546IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2547IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
2548
2549/*
2550 * PMULLx.
2551 */
2552IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
2553
2554
2555/**
2556 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2557 */
2558#ifdef RT_ARCH_AMD64
2559# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2560 DECL_INLINE_THROW(uint32_t) \
2561 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2562 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2563 { \
2564 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2565 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2566 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2567 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2568 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2569 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2570 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2571 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2572 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2573 pCodeBuf[off++] = 0x0f; \
2574 pCodeBuf[off++] = (a_bOpcX86); \
2575 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2576 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2577 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2578 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2579 return off; \
2580 } \
2581 DECL_INLINE_THROW(uint32_t) \
2582 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2583 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2584 { \
2585 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2586 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2587 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2588 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2589 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2590 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2591 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2592 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2593 pCodeBuf[off++] = 0x0f; \
2594 pCodeBuf[off++] = (a_bOpcX86); \
2595 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2596 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2597 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2598 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2599 return off; \
2600 } \
2601 typedef int ignore_semicolon
2602#elif defined(RT_ARCH_ARM64)
2603# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2604 DECL_INLINE_THROW(uint32_t) \
2605 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2606 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2607 { \
2608 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2609 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2610 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2611 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2612 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2613 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2614 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2615 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2616 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2617 return off; \
2618 } \
2619 DECL_INLINE_THROW(uint32_t) \
2620 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2621 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2622 { \
2623 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2624 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2625 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2626 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2627 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2628 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2629 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2630 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2631 return off; \
2632 } \
2633 typedef int ignore_semicolon
2634#else
2635# error "Port me"
2636#endif
2637
2638IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2639IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2640IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
2641
2642
2643/**
2644 * Emitter for pmovmskb
2645 */
2646DECL_INLINE_THROW(uint32_t)
2647iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2648 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2649{
2650#ifdef RT_ARCH_AMD64
2651 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2652 kIemNativeGstRegUse_ForFullWrite);
2653 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2654 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2655 kIemNativeGstSimdRegLdStSz_Low128,
2656 kIemNativeGstRegUse_ReadOnly);
2657 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2658
2659 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2660 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2661 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2662 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2663 pCodeBuf[off++] = 0x0f;
2664 pCodeBuf[off++] = 0xd7;
2665 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2666
2667 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2668 iemNativeRegFreeTmp(pReNative, idxRegDst);
2669
2670#elif defined(RT_ARCH_ARM64)
2671 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2672 kIemNativeGstRegUse_ForFullWrite);
2673 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2674 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2675 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2676 kIemNativeGstSimdRegLdStSz_Low128,
2677 kIemNativeGstRegUse_Calculation);
2678 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2679
2680 /*
2681 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
 * for different approaches, as NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
 *
 * As there is no way around emulating the exact semantics of pmovmskb we will use the same algorithm
 * as the sse2neon implementation, because there we can get away without loading any constants and the
 * base algorithm is only 4 NEON instructions (+ 3 for extracting the result to a general register);
 * a plain C sketch of the intended result follows after this emitter.
2687 *
2688 * The following illustrates the algorithm:
2689 *
2690 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2691 * Instruction
2692 * |
2693 * V
2694 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2695 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2696 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2697 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2698 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2699 *
2700 * The extraction process
2701 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2702 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2703 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2704 */
2705 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2706 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2707 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2708 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2709 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2710 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2711 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2712
2713 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2714 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2715 iemNativeRegFreeTmp(pReNative, idxRegDst);
2716
2717#else
2718# error "Port me"
2719#endif
2720 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2721 return off;
2722}
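
/*
 * Illustrative sketch (not part of the recompiler, hence the #if 0 guard): a plain C
 * reference of the pmovmskb semantics that the emitter above reproduces. The function
 * name is a hypothetical addition for illustration only; it assumes a 16 byte source
 * vector in memory order (element 0 first), and the upper bits of the destination GPR
 * are implicitly zero.
 */
#if 0
static uint16_t iemPmovmskbRefSketch(uint8_t const abSrc[16])
{
    uint16_t fMask = 0;
    for (unsigned iByte = 0; iByte < 16; iByte++)
        fMask |= (uint16_t)(abSrc[iByte] >> 7) << iByte; /* Collect the most significant bit of every byte. */
    return fMask;
}
#endif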
2723
2724
2725/**
 * Common emitter for the PACKUSWB instruction - guest register / guest register variant.
2727 */
2728DECL_INLINE_THROW(uint32_t)
2729iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2730 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2731{
2732 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2733 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2734 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2735 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2736
2737#ifdef RT_ARCH_AMD64
2738 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2739
2740 /* packuswb xmm, xmm */
2741 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2742 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2743 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2744 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2745 pCodeBuf[off++] = 0x0f;
2746 pCodeBuf[off++] = 0x67;
2747 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2748
2749#elif defined(RT_ARCH_ARM64)
2750 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2751
2752 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2753 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2754
2755#else
2756# error "port me"
2757#endif
2758
2759 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2760 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2761
2762 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2763 return off;
2764}
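
/*
 * Illustrative sketch (not part of the recompiler, hence the #if 0 guard): a plain C
 * reference of the PACKUSWB semantics implemented above; on ARM64 the two SQXTUN forms
 * fill the lower and upper result halves respectively. The function names are
 * hypothetical additions for illustration only.
 */
#if 0
/* Saturate a signed 16-bit value to an unsigned byte (0..255). */
static uint8_t iemSatSwordToUbyteSketch(int16_t iVal)
{
    return iVal < 0 ? 0 : iVal > 255 ? 255 : (uint8_t)iVal;
}

/* packuswb: the low 8 result bytes come from the destination words, the high 8 from the source words. */
static void iemPackuswbRefSketch(int16_t const aiDst[8], int16_t const aiSrc[8], uint8_t abResult[16])
{
    for (unsigned i = 0; i < 8; i++)
    {
        abResult[i]     = iemSatSwordToUbyteSketch(aiDst[i]);
        abResult[i + 8] = iemSatSwordToUbyteSketch(aiSrc[i]);
    }
}
#endif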
2765
2766
2767/**
 * Common emitter for the PACKUSWB instruction - guest register / recompiler variable variant.
2769 */
2770DECL_INLINE_THROW(uint32_t)
2771iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2772 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2773{
2774 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2775 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2776
2777 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2778 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2779 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2780
2781
2782#ifdef RT_ARCH_AMD64
2783 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2784
2785 /* packuswb xmm, xmm */
2786 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2787 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2788 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2789 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2790 pCodeBuf[off++] = 0x0f;
2791 pCodeBuf[off++] = 0x67;
2792 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2793
2794#elif defined(RT_ARCH_ARM64)
2795 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2796
2797 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2798 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2799
2800#else
2801# error "port me"
2802#endif
2803
2804 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2805 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2806
2807 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2808 return off;
2809}
2810
2811
2812/**
2813 * Common emitter for the pmov{s,z}x* instructions.
2814 */
2815#ifdef RT_ARCH_AMD64
2816# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2817 DECL_INLINE_THROW(uint32_t) \
2818 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2819 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2820 { \
2821 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2822 { \
2823 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2824 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2825 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2826 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2827 if (idxSimdReg >= 8) \
2828 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2829 pCodeBuf[off++] = 0x0f; \
2830 pCodeBuf[off++] = 0x38; \
2831 pCodeBuf[off++] = (a_bOpcX86); \
2832 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2833 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2834 } \
2835 else \
2836 { \
2837 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2838 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2839 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2840 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2841 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2842 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2843 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2844 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2845 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2846 pCodeBuf[off++] = 0x0f; \
2847 pCodeBuf[off++] = 0x38; \
2848 pCodeBuf[off++] = (a_bOpcX86); \
2849 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2850 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2851 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2852 } \
2853 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2854 return off; \
2855 } \
2856 DECL_INLINE_THROW(uint32_t) \
2857 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2858 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2859 { \
2860 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2861 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2862 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off); \
2863 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2864 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2865 pCodeBuf[off++] = X86_OP_REX_W \
2866 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2867 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2868 pCodeBuf[off++] = 0x0f; \
2869 pCodeBuf[off++] = 0x3a; \
2870 pCodeBuf[off++] = 0x22; \
2871 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2872 pCodeBuf[off++] = 0; /* QWord */\
2873 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2874 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2875 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
2876 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2877 pCodeBuf[off++] = 0x0f; \
2878 pCodeBuf[off++] = 0x38; \
2879 pCodeBuf[off++] = (a_bOpcX86); \
2880 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
2881 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2882 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2883 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2884 return off; \
2885 } \
2886 typedef int ignore_semicolon
2887#elif defined(RT_ARCH_ARM64)
2888# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2889 DECL_INLINE_THROW(uint32_t) \
2890 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2891 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2892 { \
2893 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2894 { \
2895 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2896 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2897 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2898 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2899 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2900 } \
2901 else \
2902 { \
2903 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2904 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2905 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2906 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2907 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2908 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2909 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2910 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2911 } \
2912 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2913 return off; \
2914 } \
2915 DECL_INLINE_THROW(uint32_t) \
2916 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2917 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2918 { \
2919 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2920 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2921 uint8_t const idxRegSrc = iemNativeVarRegisterAcquireInited(pReNative, idxVarSrc, &off); \
2922 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
2923 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
2924 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2925 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2926 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2927 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2928 return off; \
2929 } \
2930 typedef int ignore_semicolon
2931#else
2932# error "Port me"
2933#endif
2934
2935IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
2936IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
2937IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
2938
2939IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
2940IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
2941IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
2942
2943
2944/**
2945 * Updates the MXCSR exception flags, raising any unmasked exceptions.
2946 */
2947DECL_INLINE_THROW(uint32_t)
2948iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
2949{
2950 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
2951 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
2952 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2953
2954#ifdef RT_ARCH_AMD64
2955 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2956
2957 /* stmxcsr */
2958 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2959 pbCodeBuf[off++] = X86_OP_REX_B;
2960 pbCodeBuf[off++] = 0x0f;
2961 pbCodeBuf[off++] = 0xae;
2962 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2963 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2964 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2965 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2966 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2967
2968 /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
2969 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2970
2971 /* Store the flags in the MXCSR xcpt flags register. */
2972 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
2973 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
2974
2975 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
2976 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
2977 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2978
2979 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2980
2981 /* ldmxcsr */
2982 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2983 pbCodeBuf[off++] = X86_OP_REX_B;
2984 pbCodeBuf[off++] = 0x0f;
2985 pbCodeBuf[off++] = 0xae;
2986 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2987 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2988 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2989 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2990 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2991
2992#elif defined(RT_ARCH_ARM64)
2993 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2994 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
2995 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
2996 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */
2997
2998 /*
2999 * The exception flags layout differs between MXCSR and FPSR of course:
3000 *
 *      FPSR bit 0  IOC  ->  MXCSR bit 0  IE
 *      FPSR bit 1  DZC  ->  MXCSR bit 2  ZE
 *      FPSR bit 2  OFC  ->  MXCSR bit 3  OE
 *      FPSR bit 3  UFC  ->  MXCSR bit 4  UE
 *      FPSR bit 4  IXC  ->  MXCSR bit 5  PE
 *      FPSR bit 7  IDC  ->  MXCSR bit 1  DE
3015 */
3016 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3017 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3018 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3019 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3020#else
3021# error "Port me"
3022#endif
3023
3024 /*
 * If PE is set together with OE/UE and neither of those is masked,
 * PE needs to be cleared: on real hardware an unmasked OE/UE exception
 * is raised with only OE/UE set, but because we run with all host
 * exceptions masked PE gets set as well.
 * (A plain C sketch of the flag conversion and this fix-up follows after this function.)
3029 */
 /** @todo On ARM we can combine the load+and into a single AND instruction. */
 /** @todo r=aeichner Can this be done more optimally? */
3032 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3033 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3034 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3035 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3036 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3037 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3038 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3039 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3040 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3041
3042 uint32_t offFixup = off;
3043 off = iemNativeEmitJzToFixed(pReNative, off, off);
3044 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3045 iemNativeFixupFixedJump(pReNative, offFixup, off);
3046 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
3047
3048
3049 /* Set the MXCSR flags now. */
3050 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3051
3052 /*
3053 * Make sure we don't have any outstanding guest register writes as we may
 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3055 */
3056 off = iemNativeRegFlushPendingWrites(pReNative, off);
3057
3058#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3059 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3060#else
3061 RT_NOREF(idxInstr);
3062#endif
3063
3064 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3065 /* mov tmp, varmxcsr */
3066 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3067 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3068 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3069 /* tmp = ~tmp */
3070 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3071 /* tmp &= mxcsr */
3072 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3073 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3074 X86_MXCSR_XCPT_FLAGS);
3075
3076 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3077 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3078
3079 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3080 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3081
3082 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3083 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3084 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3085 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3086 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3087 return off;
3088}
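
/*
 * Illustrative sketch (not part of the recompiler, hence the #if 0 guard): the
 * FPSR -> MXCSR exception flag conversion and the PE fix-up performed by the emitter
 * above, written as plain C. The function name is a hypothetical addition; the bit
 * layout follows the mapping table in the comment above (FPSR: IOC=0, DZC=1, OFC=2,
 * UFC=3, IXC=4, IDC=7; MXCSR: IE=0, DE=1, ZE=2, OE=3, UE=4, PE=5). It only models the
 * flag merging, not the exception-pending check or the SIMD register write-back.
 */
#if 0
static uint32_t iemFpsrXcptFlagsToMxcsrSketch(uint32_t fFpsr, uint32_t fMxcsr)
{
    uint32_t fXcpt  = fFpsr & 1;                    /* IOC -> IE (bit 0 stays in place). */
    fXcpt          |= ((fFpsr >> 1) & 0xf) << 2;    /* DZC..IXC -> ZE..PE (bits 2..5). */
    fXcpt          |= ((fFpsr >> 7) & 1) << 1;      /* IDC -> DE (bit 1). */

    /* PE fix-up: with all host exceptions masked an overflow/underflow also sets PE,
       whereas real hardware raising an unmasked #XF for OE/UE leaves PE clear. */
    uint32_t const fUnmasked = fXcpt & ~((fMxcsr >> X86_MXCSR_XCPT_MASK_SHIFT) & X86_MXCSR_XCPT_FLAGS);
    if (fUnmasked & (X86_MXCSR_OE | X86_MXCSR_UE))
        fXcpt &= ~X86_MXCSR_PE;

    return fMxcsr | fXcpt;                          /* Merge the flags into the guest MXCSR. */
}
#endif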
3089
3090
3091/**
3092 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3093 */
3094DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3095 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3096#ifdef RT_ARCH_AMD64
3097 uint8_t const bPrefixX86, uint8_t const bOpcX86
3098#elif defined(RT_ARCH_ARM64)
3099 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3100#endif
3101 )
3102{
3103 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3104 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3105 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3106 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3107
3108#ifdef RT_ARCH_AMD64
3109 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3110 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3111 if (bPrefixX86 != 0)
3112 pCodeBuf[off++] = bPrefixX86;
3113 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3114 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3115 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3116 pCodeBuf[off++] = 0x0f;
3117 pCodeBuf[off++] = bOpcX86;
3118 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3119#elif defined(RT_ARCH_ARM64)
3120 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3121 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3122#else
3123# error "Port me"
3124#endif
3125 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3126 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3127 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3128 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3129}
3130
3131
3132/**
3133 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3134 */
3135DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3136 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3137#ifdef RT_ARCH_AMD64
3138 uint8_t const bPrefixX86, uint8_t const bOpcX86
3139#elif defined(RT_ARCH_ARM64)
3140 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3141#endif
3142 )
3143{
3144 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3145 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3146 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3147
3148#ifdef RT_ARCH_AMD64
3149 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3150 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3151 if (bPrefixX86 != 0)
3152 pCodeBuf[off++] = bPrefixX86;
3153 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3154 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3155 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3156 pCodeBuf[off++] = 0x0f;
3157 pCodeBuf[off++] = bOpcX86;
3158 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3159#elif defined(RT_ARCH_ARM64)
3160 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3161 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3162#else
3163# error "Port me"
3164#endif
3165 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3166 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3167 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3168 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3169}
3170
3171
3172/**
3173 * Common emitter for packed floating point instructions with 3 operands.
3174 */
3175#ifdef RT_ARCH_AMD64
3176# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3177 DECL_FORCE_INLINE_THROW(uint32_t) \
3178 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3179 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3180 { \
3181 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3182 a_bPrefixX86, a_bOpcX86); \
3183 } \
3184 DECL_FORCE_INLINE_THROW(uint32_t) \
3185 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3186 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3187 { \
3188 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3189 a_bPrefixX86, a_bOpcX86); \
3190 } \
3191 typedef int ignore_semicolon
3192#elif defined(RT_ARCH_ARM64)
3193# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3194 DECL_FORCE_INLINE_THROW(uint32_t) \
3195 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3196 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3197 { \
3198 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3199 a_enmArmOp, a_ArmElemSz); \
3200 } \
3201 DECL_FORCE_INLINE_THROW(uint32_t) \
3202 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3203 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3204 { \
3205 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3206 a_enmArmOp, a_ArmElemSz); \
3207 } \
3208 typedef int ignore_semicolon
3209#else
3210# error "Port me"
3211#endif
3212
3213
3214IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3215IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3216IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3217IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
3218
3219
3220#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */