VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 100595

Last change on this file since 100595 was 100595, checked in by vboxsync, 17 months ago

VMM/IEM: Implement vpsubsb/vpsubsw instruction emulations, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 680.1 KB
Line 
1/* $Id: IEMAllAImplC.cpp 100595 2023-07-17 10:55:34Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless already set by the includer, it is defined automatically when
 * building for ARM32/ARM64 or when Doxygen is running.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif

/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Derives the sign flag (SF) from a result of the given bit width.
 *
 * SF duplicates the most significant bit of the result, so the result is
 * shifted right until that bit sits at the SF position and is then masked.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - (X86_EFL_SF_BIT + 1))) & X86_EFL_SF )
73
/**
 * Derives the zero flag (ZF) from a result.
 *
 * ZF is set exactly when the result is zero.
 *
 * @returns 1 shifted to the X86_EFL_ZF_BIT position when the result is zero,
 *          otherwise 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The input carries the overflow indication in its most significant bit (bit
 * 7, 15, 31 or 63).  These macros are typically selected by concatenating the
 * name with a bit count.  The 8-bit variant has to shift left to reach the OF
 * position, whereas the wider ones shift right.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8))    & X86_EFL_OF )
#define X86_EFL_GET_OF_16(a_uValue) ( (uint32_t)((a_uValue) >> (16 - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
#define X86_EFL_GET_OF_32(a_uValue) ( (uint32_t)((a_uValue) >> (32 - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
#define X86_EFL_GET_OF_64(a_uValue) ( (uint32_t)((a_uValue) >> (64 - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro: it rewrites *a_pfEFlags in place and yields no
 * value.  Most arguments are expanded more than once, so only pass expressions
 * that are free of side effects.
 *
 * Overflow handling: overflow during ADDition happens when both inputs have
 * the same sign bit value and the result has a different sign bit value.
 * Since subtraction can be rewritten as addition (2 - 1 == 2 + -1), it follows
 * that for SUBtraction the sign bit must differ between the two inputs and the
 * result's sign bit must differ from the first input.  Callers doing
 * subtraction therefore pass the source with its sign bit flipped as
 * @a a_uSrcOf.  Note! Must xor with the sign bit to convert, not do
 * (0 - a_uSrc).
 *
 * See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result.  Must be a literal 8, 16,
 *                          32 or 64 since it is token-pasted into names.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflArithNew = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflArithNew |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflArithNew |= g_afParity[(a_uResult) & 0xff]; \
        fEflArithNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflArithNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflArithNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflArithNew |= X86_EFL_GET_OF_ ## a_cBitsWidth(  ((a_uResult) ^ (a_uDst)) \
                                                        & (  (uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf)) \
                                                           & RT_BIT_64(a_cBitsWidth - 1)) ); \
        *(a_pfEFlags) = fEflArithNew; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement macro: all bits in X86_EFL_STATUS_BITS are cleared in
 * *a_pfEFlags, PF/ZF/SF are recomputed from the result and @a a_fExtra is
 * OR'ed in.  It yields no value.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflLogicNew = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflLogicNew |= g_afParity[(a_uResult) & 0xff]; \
        fEflLogicNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflLogicNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflLogicNew |= (a_fExtra); \
        *(a_pfEFlags) = fEflLogicNew; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0 */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value.
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1 */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
/**
 * Locked binary operand operation, generic over the operand width.
 *
 * Expands inside a function that has puDst, uSrc and pfEFlags in scope.  The
 * non-locked worker runs on local copies of the destination and of EFLAGS, and
 * the result is committed with a compare-and-exchange.  The sequence is
 * retried until the exchange succeeds; only then are the computed flags
 * stored to *pfEFlags.  (The exchange helper is handed &uExpected, presumably
 * to refresh it with the current memory value on failure - see IPRT.)
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uNew    = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves
 *       to emulating these recent ones.
 */
/**
 * Stores a BSF/BSR result, Intel flavour.
 *
 * @param   a_puDst     Destination pointer; written only when a bit was found.
 * @param   a_pfEFlags  EFLAGS pointer.  OF, SF, ZF, AF, PF and CF are cleared;
 *                      then PF is set from g_afParity of the bit index when a
 *                      bit was found, otherwise ZF and PF are set.
 * @param   a_iBit      1-based index of the bit found, 0 if none.
 *
 * @note    The flags parameter is now actually used by the expansion; before
 *          it was misnamed and the body picked up pfEFlags from the caller's
 *          scope instead.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result, AMD flavour.
 *
 * @param   a_puDst     Destination pointer; written only when a bit was found.
 * @param   a_pfEFlags  EFLAGS pointer.  Only ZF is changed: cleared when a bit
 *                      was found, set otherwise.
 * @param   a_iBit      1-based index of the bit found, 0 if none.
 *
 * @note    The flags parameter is now actually used by the expansion; before
 *          it was misnamed and the body picked up pfEFlags from the caller's
 *          scope instead.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * Stores an LZCNT/TZCNT result, Intel flavour.
 *
 * The count is always written to the destination.  OF, SF, ZF, AF, PF and CF
 * are cleared first; PF then comes from g_afParity of a non-zero count, while
 * a zero count sets ZF and PF.  CF is set when the source operand is zero.
 *
 * @param   a_puDst     Destination pointer.
 * @param   a_uSrc      Source operand (only tested for zero).
 * @param   a_pfEFlags  EFLAGS pointer.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) /* parenthesized so expression arguments negate as a whole */ \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result, AMD flavour.
 *
 * The count is always written to the destination.  Only ZF and CF are
 * changed: ZF is set for a zero count, CF when the source operand is zero.
 *
 * @param   a_puDst     Destination pointer.
 * @param   a_uSrc      Source operand (only tested for zero).
 * @param   a_pfEFlags  EFLAGS pointer.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) /* parenthesized so expression arguments negate as a whole */ \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
/*
 * TZCNT - count trailing zero bits.
 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
2079# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2080IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2081 uint32_t *pEFlags))
2082{
2083# ifdef VBOX_STRICT
2084 RTUINT128U const uOld = *pu128RaxRdx;
2085# endif
2086# if defined(RT_ARCH_AMD64)
2087 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2088 &pu128RaxRdx->u))
2089# else
2090 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2091# endif
2092 {
2093 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2094 *pEFlags |= X86_EFL_ZF;
2095 }
2096 else
2097 *pEFlags &= ~X86_EFL_ZF;
2098}
2099# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
2126IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2127{
2128 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2129}
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2141}
2142
2143
2144# if ARCH_BITS == 32
2145IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2146{
2147 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2148}
2149# else
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2151{
2152 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2153}
2154# endif
2155
2156
2157IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2158{
2159 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2164 uint32_t *pEFlags))
2165{
2166 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2167}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all of their input from AX too. This means we have to
 *   abstract some of the input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/*
 * Operand load/store and arithmetic step helpers for the MUL/IMUL/DIV/IDIV
 * emitters.
 *
 * The load/store helpers reference the emitted function's parameters by name
 * (puA and puD, or puAX for the 8-bit variants).  Every helper expands to a
 * single, fully parenthesized expression without a trailing semicolon, so it
 * can be used as an ordinary statement, including in unbraced if/else bodies.
 */

/* Loads the double-width dividend from xDX:xAX (8-bit variant: from AX). */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/* Stores quotient and remainder in xAX and xDX (8-bit variant: AL and AH). */
# define DIV_STORE(a_Quotient, a_uRemainder)    (*puA = (a_Quotient), *puD = (a_uRemainder))
# define DIV_STORE_U8(a_Quotient, a_uRemainder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/* Fetches the implicit first factor from xAX (8-bit variant: AL). */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Stores the double-width product in xDX:xAX (8-bit variant: AX). */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division of a double-width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    ((a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
     (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro: it writes the updated flags back through
 * @a a_pfEFlags and yields no value.  @a a_uResult and @a a_uDst are evaluated
 * more than once.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Start from the incoming flags with every status bit but CF dropped. */ \
        uint32_t fEflNew = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* INC: sign bit clear before and set after.  DEC: sign bit set before and clear after. */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_OfMethod) == 0 \
                                                  ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                  : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)

/*
 * INC
 */
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
 *
 * This is a statement macro (do/while(0)); it yields no value, the new flags
 * are written back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
        /* CF is set for any non-zero original operand. */ \
        fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF input has its top bit set only when both the original and the result \
           have their top bit set. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
/**
 * Emits iemAImpl_rol_u<width><suffix>, a rotate-left worker.
 *
 * The generated function masks the count (width - 1 for 32/64-bit operands,
 * 31 for narrower ones), returns without touching anything when the masked
 * count is zero, and otherwise stores the rotated value and rewrites only CF
 * and OF in *pfEFlags.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: OF per the first sub-shift (Intel 10980XE);
 *                          zero: OF per the last sub-shift (AMD 3990X).
 * @param   a_fnHlp         Rotate helper called as a_fnHlp(value, count).
 */
#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* 8/16-bit: the zero test above used the 5-bit mask; only now is the \
           count reduced to the operand width for the actual rotate. */ \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst    = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS.  The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is bit 0 of the result (CF sits at bit 0, see the AssertCompile). */ \
        uint32_t const fCarry = (uResult & X86_EFL_CF); \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
        else                /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
#endif
EMIT_ROL(64, uint64_t, _intel,     1, ASMRotateLeftU64)
EMIT_ROL(64, uint64_t, _amd,       0, ASMRotateLeftU64)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
#endif
EMIT_ROL(32, uint32_t, _intel,     1, ASMRotateLeftU32)
EMIT_ROL(32, uint32_t, _amd,       0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
/**
 * Emits iemAImpl_ror_u<width><suffix>, a rotate-right worker.
 *
 * The generated function masks the count (width - 1 for 32/64-bit operands,
 * 31 for narrower ones), returns without touching anything when the masked
 * count is zero, and otherwise stores the rotated value and rewrites only CF
 * and OF in *pfEFlags.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: OF per the first sub-shift (Intel 10980XE);
 *                          zero: OF per the last sub-shift (AMD 3990X).
 * @param   a_fnHlp         Rotate helper called as a_fnHlp(value, count).
 */
#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* 8/16-bit: the zero test above used the 5-bit mask; only now is the \
           count reduced to the operand width for the actual rotate. */ \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst    = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS: */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is the most significant bit of the result. */ \
        uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
        else                /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
#endif
EMIT_ROR(64, uint64_t, _intel,     1, ASMRotateRightU64)
EMIT_ROR(64, uint64_t, _amd,       0, ASMRotateRightU64)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
#endif
EMIT_ROR(32, uint32_t, _intel,     1, ASMRotateRightU32)
EMIT_ROR(32, uint32_t, _amd,       0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
/**
 * Emits iemAImpl_rcr_u<width><suffix>, a rotate-right-through-carry worker.
 *
 * The count is first masked (width - 1 for 32/64-bit operands, 31 for
 * narrower ones).  For 8/16-bit operands it is additionally reduced modulo
 * width + 1: before the zero test in the Intel flavour, after it in the AMD
 * flavour, so the AMD flavour can run with an effective count of zero.
 *
 * Only CF and OF are rewritten in *pfEFlags.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero selects the Intel 10980XE OF behaviour,
 *                          zero the AMD 3990X one.
 */
#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; \
        a_uType const uDst    = *puDst; \
        a_uType       uResult = uDst >> cShift; \
        if (cShift > 1) \
            uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl     = *pfEFlags; \
        uint32_t fInCarry = fEfl & X86_EFL_CF; \
        /* The incoming carry lands just above the bits shifted down. */ \
        uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS.  The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* With a zero effective count (AMD 8/16-bit only) the carry is passed through. */ \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
        fEfl |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
        else                /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
            fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_RCR(64, uint64_t, _intel,     1)
EMIT_RCR(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_RCR(32, uint32_t, _intel,     1)
EMIT_RCR(32, uint32_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_RCR(16, uint16_t, _intel,     1)
EMIT_RCR(16, uint16_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(8,  uint8_t,  RT_NOTHING, 1)
#endif
EMIT_RCR(8,  uint8_t,  _intel,     1)
EMIT_RCR(8,  uint8_t,  _amd,       0)
3092
3093
3094/*
3095 * SHL
3096 */
/**
 * Emits iemAImpl_shl_u<width><suffix>, a shift-left worker.
 *
 * The count is masked (width - 1 for 32/64-bit operands, 31 for narrower
 * ones); a zero masked count leaves operand and flags untouched.  Otherwise
 * the shifted value is stored and all status bits are recalculated.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: OF per the first shift, AF left clear
 *                          (Intel 10980XE); zero: OF per the last shift, AF
 *                          always set (AMD 3990X).
 */
#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst  = *puDst; \
        a_uType       uResult = uDst << cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* NOTE(review): for 8/16-bit operands cShift can exceed a_cBitsWidth here \
           (the mask above is 31), which makes (a_cBitsWidth - cShift) negative - \
           confirm what CF is meant to be for such counts. */ \
        uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
        else \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHL(64, uint64_t, _intel,     1)
EMIT_SHL(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHL(32, uint32_t, _intel,     1)
EMIT_SHL(32, uint32_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_SHL(16, uint16_t, _intel,     1)
EMIT_SHL(16, uint16_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(8,  uint8_t,  RT_NOTHING, 1)
#endif
EMIT_SHL(8,  uint8_t,  _intel,     1)
EMIT_SHL(8,  uint8_t,  _amd,       0)
3148
3149
3150/*
3151 * SHR
3152 */
/**
 * Emits iemAImpl_shr_u<width><suffix>, a logical shift-right worker.
 *
 * The count is masked (width - 1 for 32/64-bit operands, 31 for narrower
 * ones); a zero masked count leaves operand and flags untouched.  Otherwise
 * the shifted value is stored and all status bits are recalculated.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: OF from the original top bit for every
 *                          count, AF left clear (Intel 10980XE); zero: OF only
 *                          for a count of one, AF always set (AMD 3990X).
 */
#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst  = *puDst; \
        a_uType       uResult = uDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF = last bit shifted out at the bottom. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
            fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHR(64, uint64_t, _intel,     1)
EMIT_SHR(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHR(32, uint32_t, _intel,     1)
EMIT_SHR(32, uint32_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_SHR(16, uint16_t, _intel,     1)
EMIT_SHR(16, uint16_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(8,  uint8_t,  RT_NOTHING, 1)
#endif
EMIT_SHR(8,  uint8_t,  _intel,     1)
EMIT_SHR(8,  uint8_t,  _amd,       0)
3201
3202
3203/*
3204 * SAR
3205 */
/**
 * Emits iemAImpl_sar_u<width><suffix>, an arithmetic shift-right worker.
 *
 * The count is masked (width - 1 for 32/64-bit operands, 31 for narrower
 * ones); a zero masked count leaves operand and flags untouched.  Otherwise
 * the destination is shifted as a signed value and all status bits are
 * recalculated, with OF always ending up clear.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_iType         Signed operand type of that width (used for the shift).
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: AF left clear (Intel 10980XE); zero: AF
 *                          always set (AMD 3990X).
 */
#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_iType const iDst  = (a_iType)*puDst; \
        a_uType       uResult = iDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. \
           Note! The OF flag is always zero because the result never differs from the input. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF = last bit shifted out at the bottom. */ \
        fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3270 */
/**
 * Emits iemAImpl_shld_u<width><suffix>, a double-precision shift-left worker
 * (instantiated here for 32 and 64 bits).
 *
 * The count is masked with width - 1; a zero masked count leaves operand and
 * flags untouched.  Otherwise the destination is shifted left, the vacated
 * low bits are filled from the top of uSrc, and all status bits are
 * recalculated.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: OF per the first shift, AF clear (Intel);
 *                          zero: OF per the last shift, AF set (AMD 3990X).
 */
#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
                                                                           uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst    = *puDst; \
        a_uType       uResult = uDst << cShift; \
        uResult |= uSrc >> (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        else \
        {   /* AMD 3990X: Set according to last shift. AF always set. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(64, uint64_t, _intel,     1)
EMIT_SHLD(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(32, uint32_t, _intel,     1)
EMIT_SHLD(32, uint32_t, _amd,       0)
3313
/**
 * Emits iemAImpl_shld_u16<suffix>, the 16-bit double-precision shift-left
 * worker.
 *
 * Unlike the wider variants the count is masked with 31, so it can exceed the
 * operand width.  To cover that, a 48-bit operand is assembled in uTmp as
 * uDst:uSrc:uDst (Intel flavour) or uDst:uSrc:uSrc (AMD flavour), shifted
 * left, and bits 32..47 are taken as the result.
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero selects the Intel operand layout and flag
 *                          behaviour, zero the AMD 3990X one.
 */
#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst    = *puDst; \
        uint64_t const uTmp    = a_fIntelFlags \
                               ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
                               : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
        uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
        } \
        else \
        { \
            /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
            if (cShift < 16) \
            { \
                fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
            } \
            else \
            { \
                /* Counts above 16 leave CF clear; exactly 16 takes bit 0 of uDst. */ \
                if (cShift == 16) \
                    fEfl |= uDst & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
            } \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel behaviour and is only emitted from C
   under the guard below; the _intel and _amd variants always are. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD_16(RT_NOTHING, 1)
#endif
EMIT_SHLD_16(_intel,     1)
EMIT_SHLD_16(_amd,       0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3382 */
/**
 * Emits iemAImpl_shrd_u<width><suffix>, a double-precision shift-right worker
 * (instantiated here for 32 and 64 bits).
 *
 * The count is masked with width - 1; a zero masked count leaves operand and
 * flags untouched.  Otherwise the destination is shifted right, the vacated
 * high bits are filled from the bottom of uSrc, and all status bits are
 * recalculated.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero: OF per the first shift, AF clear (Intel);
 *                          zero: OF per the last shift, AF set (AMD 3990X).
 */
#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst    = *puDst; \
        a_uType       uResult = uDst >> cShift; \
        uResult |= uSrc << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        /* CF = last bit shifted out at the bottom of uDst. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
        else \
        {   /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flag behaviour and is only emitted
   from C under the guard below; the _intel and _amd variants always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(64, uint64_t, _intel,     1)
EMIT_SHRD(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(32, uint32_t, _intel,     1)
EMIT_SHRD(32, uint32_t, _amd,       0)
3426
/**
 * Emits iemAImpl_shrd_u16<suffix>, the 16-bit double-precision shift-right
 * worker.
 *
 * Unlike the wider variants the count is masked with 31, so it can exceed the
 * operand width.  To cover that, a 48-bit operand is assembled in uTmp with
 * uDst in the low word, uSrc above it, and uDst (Intel flavour) or uSrc (AMD
 * flavour) on top; it is shifted right and the low 16 bits are the result.
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero selects the Intel operand layout and flag
 *                          behaviour, zero the AMD 3990X one.
 */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst    = *puDst; \
        uint64_t const uTmp    = a_fIntelFlags \
                               ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
                               : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
        uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
            fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
        } \
        else \
        { \
            /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
            fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
            /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel behaviour and is only emitted from C
   under the guard below; the _intel and _amd variants always are. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD_16(RT_NOTHING, 1)
#endif
EMIT_SHRD_16(_intel,     1)
EMIT_SHRD_16(_amd,       0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
3642IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3643{
3644 ASMReadFence();
3645}
3646
3647
3648IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3649{
3650 ASMWriteFence();
3651}
3652
3653
3654IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3655{
3656 ASMMemoryFence();
3657}
3658
3659
3660# ifndef RT_ARCH_ARM64
3661IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3662{
3663 ASMMemoryFence();
3664}
3665# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-zero or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Zero */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags realted to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-zero or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Zero */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags realted to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
4933AssertCompileSize(RTFLOAT128U, 16);
4934AssertCompileSize(RTFLOAT80U, 10);
4935AssertCompileSize(RTFLOAT64U, 8);
4936AssertCompileSize(RTFLOAT32U, 4);
4937
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro always declares a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized.  Only if @a a_pr80Val points to a pseudo-denormal is
 * the value copied there, given an exponent of 1, and @a a_pr80Val updated to
 * point to the copy.
 *
 * @param   a_pr80Val           Pointer variable (must be an assignable lvalue)
 *                              referencing the value to check.
 * @param   a_r80ValNormalized  Name of the local variable to declare.
 *
 * @note    This must be applied before calling SoftFloat with a value that
 *          could be a pseudo-denormal, as SoftFloat doesn't handle
 *          pseudo-denormals correctly.
 * @note    The expansion starts with a declaration, so it can only be used
 *          where a declaration is allowed, and it needs a trailing semicolon.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *(a_pr80Val); \
        a_r80ValNormalized.s.uExponent = 1; \
        (a_pr80Val) = &a_r80ValNormalized; \
    } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Initializer for the SoftFloat state structure, derived from an FPU control
 * word.
 *
 * The five members are, in order: a fixed softfloat_tininess_afterRounding,
 * the rounding mode picked from FCW.RC, zero, the FCW exception mask bits, and
 * a width of 32, 64 or 80 picked from FCW.PC (24-bit, 53-bit or anything else
 * respectively).
 *
 * NOTE(review): presumably these correspond to the tininess detection mode,
 * rounding mode, exception flags, exception mask and rounding precision
 * members of softfloat_state_t - confirm against the SoftFloat headers.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO ? (uint8_t)softfloat_round_minMag \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP   ? (uint8_t)softfloat_round_max \
        :                                                   (uint8_t)softfloat_round_near_even, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 : (uint8_t)80 \
    }
5083
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The exception flags of the SoftFloat state are merged into @a a_fFsw, the
 * softfloat_flag_c1 flag is shifted up two bits on the way, and ES + B are
 * added when any of the raised exceptions is not masked by @a a_fFcw.
 *
 * @note    @a a_pSoftState is evaluated more than once.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | ( ((a_pSoftState)->exceptionFlags & ~(a_fFcw) & X86_FSW_XCPT_MASK) \
         ? (X86_FSW_ES | X86_FSW_B) : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Repackages an IPRT 128-bit floating point value as a SoftFloat one, copying
 * the two 64-bit words unmodified.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t const r128Ret = { { pr128->au64[0], pr128->au64[1] } };
    return r128Ret;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The emitted function converts the 64-bit operand to 80-bit and forwards to
 * @a a_fnR80ByR80, afterwards forcing the TOP field of the resulting FSW to 7.
 *
 * A subnormal 64-bit operand (X86_FSW_DE from the conversion) is handled here:
 *   - it is ignored when the first operand is a 387-invalid value or a NaN, or
 *     when @a a_DenormalException evaluates to true;
 *   - otherwise, with \#D unmasked, the first operand is returned unchanged
 *     together with DE, ES and B and the r80-by-r80 worker is not called;
 *   - otherwise DE is merged into the FSW produced by the worker.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        bool const fIgnoreDenormal =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                     || RTFLOAT80U_IS_NAN(pr80Val1) \
                                     || (a_DenormalException); \
        if (fIgnoreDenormal) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #D: leave the first operand as the result and report the exception. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5594
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The emitted function converts the 32-bit operand to 80-bit and forwards to
 * @a a_fnR80ByR80, afterwards forcing the TOP field of the resulting FSW to 7.
 *
 * A subnormal 32-bit operand (X86_FSW_DE from the conversion) is handled here:
 *   - it is ignored when the first operand is a 387-invalid value or a NaN, or
 *     when @a a_DenormalException evaluates to true;
 *   - otherwise, with \#D unmasked, the first operand is returned unchanged
 *     together with DE, ES and B and the r80-by-r80 worker is not called;
 *   - otherwise DE is merged into the FSW produced by the worker.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        bool const fIgnoreDenormal =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                     || RTFLOAT80U_IS_NAN(pr80Val1) \
                                     || (a_DenormalException); \
        if (fIgnoreDenormal) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #D: leave the first operand as the result and report the exception. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5617
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The emitted function converts the 32-bit integer operand to 80-bit floating
 * point, forwards to @a a_fnR80ByR80 and then forces the TOP field of the
 * resulting FSW to 7.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5626
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The emitted function converts the 16-bit integer operand to 80-bit floating
 * point, forwards to @a a_fnR80ByR80 and then forces the TOP field of the
 * resulting FSW to 7.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5635
5636
5637
/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
/* Instantiate the FSUBR variants taking a 64-bit real, 32-bit real, 32-bit integer
   and 16-bit integer second operand.  Each one names iemAImpl_fsubr_r80_by_r80 as
   the 80-bit worker to build on.  The EMIT_R80_BY_* macros are defined outside
   this section; presumably they convert the second operand to 80-bit before
   calling the worker - confirm at the macro definitions. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
/*********************************************************************************************************************************
*   x87 FPU Trigonometric Operations                                                                                             *
*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
/**
 * Intel-named FPATAN entry point; forwards all arguments unchanged to
 * iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6175
/**
 * AMD-named FPATAN entry point; forwards all arguments unchanged to
 * iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
6184static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6185{
6186 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6187 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6188 extFloat80_t v;
6189 (void)fFcw;
6190
6191 v = extF80_tan(x, &SoftState);
6192
6193 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6194 return fFsw;
6195}
6196
6197IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6198{
6199 uint16_t const fFcw = pFpuState->FCW;
6200 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6201
6202 if (RTFLOAT80U_IS_ZERO(pr80Val))
6203 {
6204 pFpuResTwo->r80Result1 = *pr80Val;
6205 pFpuResTwo->r80Result2 = g_ar80One[0];
6206 }
6207 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6208 {
6209 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6210 {
6211 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6212 pFpuResTwo->r80Result1 = *pr80Val;
6213 }
6214 else
6215 {
6216 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6217 {
6218 pFpuResTwo->r80Result1 = *pr80Val;
6219 }
6220 else
6221 {
6222 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6223 }
6224
6225 pFpuResTwo->r80Result2 = g_ar80One[0];
6226
6227 fFsw |= X86_FSW_PE;
6228 if (!(fFcw & X86_FCW_PM))
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 }
6232 else
6233 {
6234 fFsw |= X86_FSW_IE;
6235 if (!(fFcw & X86_FCW_IM))
6236 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6237 }
6238
6239 pFpuResTwo->FSW = fFsw;
6240}
6241#endif /* IEM_WITHOUT_ASSEMBLY */
6242
/**
 * AMD-named FPTAN entry point; forwards all arguments unchanged to
 * iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6247
/**
 * Intel-named FPTAN entry point; forwards all arguments unchanged to
 * iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6252
6253#ifdef IEM_WITHOUT_ASSEMBLY
6254
6255static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6256{
6257 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6258 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6259 extFloat80_t v;
6260 (void)fFcw;
6261
6262 v = extF80_sin(x, &SoftState);
6263
6264 iemFpuSoftF80ToIprt(pr80Result, v);
6265
6266 return fFsw;
6267}
6268
6269IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6270{
6271 uint16_t const fFcw = pFpuState->FCW;
6272 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6273
6274 if (RTFLOAT80U_IS_ZERO(pr80Val))
6275 {
6276 pFpuRes->r80Result = *pr80Val;
6277 }
6278 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6279 {
6280 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6281 {
6282 fFsw |= X86_FSW_C2;
6283 pFpuRes->r80Result = *pr80Val;
6284 }
6285 else
6286 {
6287 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6288 {
6289 pFpuRes->r80Result = *pr80Val;
6290 }
6291 else
6292 {
6293 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6294 }
6295 fFsw |= X86_FSW_PE;
6296 if (!(fFcw & X86_FCW_PM))
6297 fFsw |= X86_FSW_ES | X86_FSW_B;
6298 }
6299 }
6300 else if (RTFLOAT80U_IS_INF(pr80Val))
6301 {
6302 fFsw |= X86_FSW_IE;
6303 if (!(fFcw & X86_FCW_IM))
6304 {
6305 fFsw |= X86_FSW_ES | X86_FSW_B;
6306 pFpuRes->r80Result = *pr80Val;
6307 }
6308 else
6309 {
6310 pFpuRes->r80Result = g_r80Indefinite;
6311 }
6312 }
6313 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6314 {
6315 fFsw |= X86_FSW_DE;
6316
6317 if (fFcw & X86_FCW_DM)
6318 {
6319 if (fFcw & X86_FCW_UM)
6320 {
6321 pFpuRes->r80Result = *pr80Val;
6322 }
6323 else
6324 {
6325 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6326 uint64_t uMantissa = pr80Val->s.uMantissa;
6327 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6328
6329 uExponent = 64 - uExponent;
6330 uMantissa <<= uExponent;
6331 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6332
6333 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6334 pFpuRes->r80Result.s.uMantissa = uMantissa;
6335 pFpuRes->r80Result.s.uExponent = uExponent;
6336 }
6337
6338 fFsw |= X86_FSW_UE | X86_FSW_PE;
6339
6340 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6341 {
6342 /* All the exceptions are masked. */
6343 }
6344 else
6345 {
6346 fFsw |= X86_FSW_ES | X86_FSW_B;
6347 }
6348 }
6349 else
6350 {
6351 pFpuRes->r80Result = *pr80Val;
6352
6353 fFsw |= X86_FSW_ES | X86_FSW_B;
6354 }
6355 }
6356 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6357 {
6358 pFpuRes->r80Result = *pr80Val;
6359 fFsw |= X86_FSW_DE;
6360
6361 if (fFcw & X86_FCW_DM)
6362 {
6363 if (fFcw & X86_FCW_PM)
6364 {
6365 fFsw |= X86_FSW_PE;
6366 }
6367 else
6368 {
6369 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6370 }
6371
6372 pFpuRes->r80Result.sj64.uExponent = 1;
6373 }
6374 else
6375 {
6376 fFsw |= X86_FSW_ES | X86_FSW_B;
6377 }
6378 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6379 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6380 {
6381 pFpuRes->r80Result = *pr80Val;
6382 } else {
6383 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6384 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6385 && (fFcw & X86_FCW_IM))
6386 pFpuRes->r80Result = g_r80Indefinite;
6387 else
6388 {
6389 pFpuRes->r80Result = *pr80Val;
6390 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6391 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6392 }
6393
6394 fFsw |= X86_FSW_IE;
6395 if (!(fFcw & X86_FCW_IM))
6396 fFsw |= X86_FSW_ES | X86_FSW_B;
6397 }
6398
6399 pFpuRes->FSW = fFsw;
6400}
6401#endif /* IEM_WITHOUT_ASSEMBLY */
6402
/**
 * AMD-named FSIN entry point; forwards all arguments unchanged to
 * iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6407
/**
 * Intel-named FSIN entry point; forwards all arguments unchanged to
 * iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6412
6413#ifdef IEM_WITHOUT_ASSEMBLY
6414
6415static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6416{
6417 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6418 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6419 extFloat80_t v;
6420 (void)fFcw;
6421
6422 v = extF80_cos(x, &SoftState);
6423
6424 iemFpuSoftF80ToIprt(pr80Result, v);
6425
6426 return fFsw;
6427}
6428
6429IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6430{
6431 uint16_t const fFcw = pFpuState->FCW;
6432 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6433
6434 if (RTFLOAT80U_IS_ZERO(pr80Val))
6435 {
6436 pFpuRes->r80Result = g_ar80One[0];
6437 }
6438 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6439 {
6440 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6441 {
6442 fFsw |= X86_FSW_C2;
6443 pFpuRes->r80Result = *pr80Val;
6444 }
6445 else
6446 {
6447 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6448 {
6449 pFpuRes->r80Result = g_ar80One[0];
6450
6451 }
6452 else
6453 {
6454 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6455 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6456 }
6457 fFsw |= X86_FSW_PE;
6458 if (!(fFcw & X86_FCW_PM))
6459 fFsw |= X86_FSW_ES | X86_FSW_B;
6460 }
6461 }
6462 else if (RTFLOAT80U_IS_INF(pr80Val))
6463 {
6464 fFsw |= X86_FSW_IE;
6465 if (!(fFcw & X86_FCW_IM))
6466 {
6467 fFsw |= X86_FSW_ES | X86_FSW_B;
6468 pFpuRes->r80Result = *pr80Val;
6469 }
6470 else
6471 {
6472 pFpuRes->r80Result = g_r80Indefinite;
6473 }
6474 }
6475 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6476 {
6477 fFsw |= X86_FSW_DE;
6478
6479 if (fFcw & X86_FCW_DM)
6480 {
6481 pFpuRes->r80Result = g_ar80One[0];
6482
6483 if (fFcw & X86_FCW_PM)
6484 {
6485 fFsw |= X86_FSW_PE;
6486 }
6487 else
6488 {
6489 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6490 }
6491 }
6492 else
6493 {
6494 pFpuRes->r80Result = *pr80Val;
6495 fFsw |= X86_FSW_ES | X86_FSW_B;
6496 }
6497 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6498 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6499 {
6500 pFpuRes->r80Result = *pr80Val;
6501 } else {
6502 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6503 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6504 && (fFcw & X86_FCW_IM))
6505 pFpuRes->r80Result = g_r80Indefinite;
6506 else
6507 {
6508 pFpuRes->r80Result = *pr80Val;
6509 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6510 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6511 }
6512
6513 fFsw |= X86_FSW_IE;
6514 if (!(fFcw & X86_FCW_IM))
6515 fFsw |= X86_FSW_ES | X86_FSW_B;
6516 }
6517
6518 pFpuRes->FSW = fFsw;
6519}
6520#endif /* IEM_WITHOUT_ASSEMBLY */
6521
/**
 * AMD-named FCOS entry point; forwards all arguments unchanged to
 * iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6526
/**
 * Intel-named FCOS entry point; forwards all arguments unchanged to
 * iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6531
6532#ifdef IEM_WITHOUT_ASSEMBLY
6533
6534static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6535{
6536 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6537 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6538 extFloat80_t r80Sin, r80Cos;
6539 (void)fFcw;
6540
6541 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6542
6543 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6544 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6545
6546 return fFsw;
6547}
6548
6549IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6550{
6551 uint16_t const fFcw = pFpuState->FCW;
6552 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6553
6554 if (RTFLOAT80U_IS_ZERO(pr80Val))
6555 {
6556 pFpuResTwo->r80Result1 = *pr80Val;
6557 pFpuResTwo->r80Result2 = g_ar80One[0];
6558 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6559 }
6560 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6561 {
6562 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6563 {
6564 fFsw |= X86_FSW_C2;
6565
6566 if (fFcw & X86_FCW_IM)
6567 {
6568 pFpuResTwo->r80Result1 = g_r80Indefinite;
6569 }
6570 else
6571 {
6572 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6573 }
6574
6575 pFpuResTwo->r80Result2 = *pr80Val;
6576 }
6577 else
6578 {
6579 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6580
6581 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6582 {
6583 pFpuResTwo->r80Result1 = *pr80Val;
6584 pFpuResTwo->r80Result2 = g_ar80One[0];
6585 }
6586 else
6587 {
6588 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6589 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6590 }
6591 fFsw |= X86_FSW_PE;
6592 if (!(fFcw & X86_FCW_PM))
6593 fFsw |= X86_FSW_ES | X86_FSW_B;
6594 }
6595 }
6596 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6597 {
6598 fFsw |= X86_FSW_DE;
6599
6600 if (fFcw & X86_FCW_DM)
6601 {
6602 pFpuResTwo->r80Result1 = *pr80Val;
6603 pFpuResTwo->r80Result2 = g_ar80One[0];
6604 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6605
6606 if (fFcw & X86_FCW_PM)
6607 {
6608 fFsw |= X86_FSW_PE;
6609 }
6610 else
6611 {
6612 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6613 }
6614
6615 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6616 }
6617 else
6618 {
6619 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6620 pFpuResTwo->r80Result2 = *pr80Val;
6621 fFsw |= X86_FSW_ES | X86_FSW_B;
6622 }
6623 }
6624 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6625 {
6626 fFsw |= X86_FSW_DE;
6627
6628 if (fFcw & X86_FCW_DM)
6629 {
6630 pFpuResTwo->r80Result2 = g_ar80One[0];
6631
6632 if (fFcw & X86_FCW_UM)
6633 {
6634 pFpuResTwo->r80Result1 = *pr80Val;
6635 }
6636 else
6637 {
6638 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6639 uint64_t uMantissa = pr80Val->s.uMantissa;
6640 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6641
6642 uExponent = 64 - uExponent;
6643 uMantissa <<= uExponent;
6644 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6645
6646 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6647 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6648 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6649 }
6650
6651 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6652 fFsw |= X86_FSW_UE | X86_FSW_PE;
6653
6654 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6655 {
6656 /* All the exceptions are masked. */
6657 }
6658 else
6659 {
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662 }
6663 else
6664 {
6665 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6666 pFpuResTwo->r80Result2 = *pr80Val;
6667 fFsw |= X86_FSW_ES | X86_FSW_B;
6668 }
6669 }
6670 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6671 {
6672 pFpuResTwo->r80Result1 = *pr80Val;
6673 pFpuResTwo->r80Result2 = *pr80Val;
6674 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6675 }
6676 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6677 {
6678 if (fFcw & X86_FCW_IM)
6679 {
6680 pFpuResTwo->r80Result1 = g_r80Indefinite;
6681 pFpuResTwo->r80Result2 = g_r80Indefinite;
6682 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6683 }
6684 else
6685 {
6686 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6687 pFpuResTwo->r80Result2 = *pr80Val;
6688 }
6689
6690 fFsw |= X86_FSW_IE;
6691 if (!(fFcw & X86_FCW_IM))
6692 fFsw |= X86_FSW_ES | X86_FSW_B;
6693 }
6694 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6695 {
6696 pFpuResTwo->r80Result1 = *pr80Val;
6697 pFpuResTwo->r80Result2 = *pr80Val;
6698
6699 if (fFcw & X86_FCW_IM)
6700 {
6701 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6702 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else
6706 {
6707 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6708 pFpuResTwo->r80Result2 = *pr80Val;
6709 }
6710
6711 fFsw |= X86_FSW_IE;
6712 if (!(fFcw & X86_FCW_IM))
6713 fFsw |= X86_FSW_ES | X86_FSW_B;
6714 }
6715 else if (RTFLOAT80U_IS_INF(pr80Val))
6716 {
6717 if (fFcw & X86_FCW_IM)
6718 {
6719 pFpuResTwo->r80Result1 = g_r80Indefinite;
6720 pFpuResTwo->r80Result2 = g_r80Indefinite;
6721 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6722 }
6723 else
6724 {
6725 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6726 pFpuResTwo->r80Result2 = *pr80Val;
6727 }
6728
6729 fFsw |= X86_FSW_IE;
6730 if (!(fFcw & X86_FCW_IM))
6731 fFsw |= X86_FSW_ES | X86_FSW_B;
6732 }
6733
6734 pFpuResTwo->FSW = fFsw;
6735}
6736#endif /* IEM_WITHOUT_ASSEMBLY */
6737
/**
 * AMD-named FSINCOS entry point; forwards all arguments unchanged to
 * iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6742
/**
 * Intel-named FSINCOS entry point; forwards all arguments unchanged to
 * iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6747
6748#ifdef IEM_WITHOUT_ASSEMBLY
6749
6750
6751/*********************************************************************************************************************************
6752* x87 FPU Compare and Testing Operations *
6753*********************************************************************************************************************************/
6754
6755IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6756{
6757 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6758
6759 if (RTFLOAT80U_IS_ZERO(pr80Val))
6760 fFsw |= X86_FSW_C3;
6761 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6762 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6763 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6764 {
6765 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6766 if (!(pFpuState->FCW & X86_FCW_DM))
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 else
6770 {
6771 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6772 if (!(pFpuState->FCW & X86_FCW_IM))
6773 fFsw |= X86_FSW_ES | X86_FSW_B;
6774 }
6775
6776 *pu16Fsw = fFsw;
6777}
6778
6779
6780IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6781{
6782 RT_NOREF(pFpuState);
6783 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6784
6785 /* C1 = sign bit (always, even if empty Intel says). */
6786 if (pr80Val->s.fSign)
6787 fFsw |= X86_FSW_C1;
6788
6789 /* Classify the value in C0, C2, C3. */
6790 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6791 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6792 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6793 fFsw |= X86_FSW_C2;
6794 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6795 fFsw |= X86_FSW_C3;
6796 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6797 fFsw |= X86_FSW_C0;
6798 else if (RTFLOAT80U_IS_INF(pr80Val))
6799 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6800 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6801 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6802 /* whatever else: 0 */
6803
6804 *pu16Fsw = fFsw;
6805}
6806
6807
6808/**
6809 * Worker for fcom, fucom, and friends.
6810 */
6811static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6812 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6813{
6814 /*
6815 * Unpack the values.
6816 */
6817 bool const fSign1 = pr80Val1->s.fSign;
6818 int32_t iExponent1 = pr80Val1->s.uExponent;
6819 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6820
6821 bool const fSign2 = pr80Val2->s.fSign;
6822 int32_t iExponent2 = pr80Val2->s.uExponent;
6823 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6824
6825 /*
6826 * Check for invalid inputs.
6827 */
6828 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6829 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6830 {
6831 if (!(fFcw & X86_FCW_IM))
6832 fFsw |= X86_FSW_ES | X86_FSW_B;
6833 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6834 }
6835
6836 /*
6837 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6838 */
6839 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6840 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6841 {
6842 if ( fIeOnAllNaNs
6843 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6844 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6845 {
6846 fFsw |= X86_FSW_IE;
6847 if (!(fFcw & X86_FCW_IM))
6848 fFsw |= X86_FSW_ES | X86_FSW_B;
6849 }
6850 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6851 }
6852
6853 /*
6854 * Normalize the values.
6855 */
6856 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6857 {
6858 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6859 iExponent1 = 1;
6860 else
6861 {
6862 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6863 uMantissa1 <<= iExponent1;
6864 iExponent1 = 1 - iExponent1;
6865 }
6866 fFsw |= X86_FSW_DE;
6867 if (!(fFcw & X86_FCW_DM))
6868 fFsw |= X86_FSW_ES | X86_FSW_B;
6869 }
6870
6871 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6872 {
6873 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6874 iExponent2 = 1;
6875 else
6876 {
6877 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6878 uMantissa2 <<= iExponent2;
6879 iExponent2 = 1 - iExponent2;
6880 }
6881 fFsw |= X86_FSW_DE;
6882 if (!(fFcw & X86_FCW_DM))
6883 fFsw |= X86_FSW_ES | X86_FSW_B;
6884 }
6885
6886 /*
6887 * Test if equal (val1 == val2):
6888 */
6889 if ( uMantissa1 == uMantissa2
6890 && iExponent1 == iExponent2
6891 && ( fSign1 == fSign2
6892 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6893 fFsw |= X86_FSW_C3;
6894 /*
6895 * Test if less than (val1 < val2):
6896 */
6897 else if (fSign1 && !fSign2)
6898 fFsw |= X86_FSW_C0;
6899 else if (fSign1 == fSign2)
6900 {
6901 /* Zeros are problematic, however at the most one can be zero here. */
6902 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6903 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6904 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6905 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6906
6907 if ( fSign1
6908 ^ ( iExponent1 < iExponent2
6909 || ( iExponent1 == iExponent2
6910 && uMantissa1 < uMantissa2 ) ) )
6911 fFsw |= X86_FSW_C0;
6912 }
6913 /* else: No flags set if greater. */
6914
6915 return fFsw;
6916}
6917
6918
/**
 * FCOM worker: compares two 80-bit values via iemAImpl_fcom_r80_by_r80_worker,
 * starting from a status word with TOP = 6 and raising \#IE on any NaN operand.
 *
 * @param   pFpuState   FPU state; only FCW is read.
 * @param   pfFsw       Where to return the resulting FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
}
6924
6925
6926
6927
/**
 * FUCOM worker: same as iemAImpl_fcom_r80_by_r80, except that fIeOnAllNaNs is
 * false, so the shared worker only raises \#IE for signalling NaNs (and
 * invalid encodings).
 *
 * @param   pFpuState   FPU state; only FCW is read.
 * @param   pfFsw       Where to return the resulting FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
}
6933
6934
6935IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6936 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6937{
6938 RTFLOAT80U r80Val2;
6939 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6940 Assert(!fFsw || fFsw == X86_FSW_DE);
6941 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6942 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6943 {
6944 if (!(pFpuState->FCW & X86_FCW_DM))
6945 fFsw |= X86_FSW_ES | X86_FSW_B;
6946 *pfFsw |= fFsw;
6947 }
6948}
6949
6950
6951IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6952 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6953{
6954 RTFLOAT80U r80Val2;
6955 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6956 Assert(!fFsw || fFsw == X86_FSW_DE);
6957 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6958 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6959 {
6960 if (!(pFpuState->FCW & X86_FCW_DM))
6961 fFsw |= X86_FSW_ES | X86_FSW_B;
6962 *pfFsw |= fFsw;
6963 }
6964}
6965
6966
6967IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6968 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6969{
6970 RTFLOAT80U r80Val2;
6971 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6972 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6973}
6974
6975
6976IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6977 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6978{
6979 RTFLOAT80U r80Val2;
6980 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6981 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6982}
6983
6984
6985/**
6986 * Worker for fcomi & fucomi.
6987 */
6988static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6989 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6990{
6991 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6992 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6993 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6994 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6995
6996 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6997 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6998 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6999}
7000
7001
/**
 * FCOMI worker: compares via iemAImpl_fcomi_r80_by_r80_worker with \#IE raised
 * on any NaN operand.
 *
 * @returns The EFLAGS value produced by the shared worker.
 * @param   pFpuState   FPU state; FCW and FSW are read from it.
 * @param   pfFsw       Where to return the resulting FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
}
7007
7008
/**
 * FUCOMI worker: same as iemAImpl_fcomi_r80_by_r80, except that fIeOnAllNaNs
 * is false when calling the shared worker.
 *
 * @returns The EFLAGS value produced by the shared worker.
 * @param   pFpuState   FPU state; FCW and FSW are read from it.
 * @param   pfFsw       Where to return the resulting FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
}
7014
7015
7016/*********************************************************************************************************************************
7017* x87 FPU Other Operations *
7018*********************************************************************************************************************************/
7019
7020/**
7021 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7022 */
7023static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7024{
7025 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7026 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7027 true /*exact / generate #PE */, &SoftState));
7028 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7029}
7030
7031
7032IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7033{
7034 uint16_t const fFcw = pFpuState->FCW;
7035 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7036
7037 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7038 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7039 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7040 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7041 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7042 || RTFLOAT80U_IS_INF(pr80Val))
7043 pFpuRes->r80Result = *pr80Val;
7044 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7045 {
7046 fFsw |= X86_FSW_DE;
7047 if (fFcw & X86_FCW_DM)
7048 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7049 else
7050 {
7051 pFpuRes->r80Result = *pr80Val;
7052 fFsw |= X86_FSW_ES | X86_FSW_B;
7053 }
7054 }
7055 else
7056 {
7057 if (fFcw & X86_FCW_IM)
7058 {
7059 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7060 pFpuRes->r80Result = g_r80Indefinite;
7061 else
7062 {
7063 pFpuRes->r80Result = *pr80Val;
7064 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7065 }
7066 }
7067 else
7068 {
7069 pFpuRes->r80Result = *pr80Val;
7070 fFsw |= X86_FSW_ES | X86_FSW_B;
7071 }
7072 fFsw |= X86_FSW_IE;
7073 }
7074 pFpuRes->FSW = fFsw;
7075}
7076
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7079 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7080{
7081 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7082 it does everything we need it to do. */
7083 uint16_t const fFcw = pFpuState->FCW;
7084 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7085 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7086 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7087 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7088}
7089
7090
7091/**
7092 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7093 */
7094static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7095{
7096 Assert(!pr80Val->s.fSign);
7097 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7098 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7099 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7100}
7101
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7104{
7105 uint16_t const fFcw = pFpuState->FCW;
7106 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7107
7108 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7109 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7110 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7111 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7112 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7113 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7114 pFpuRes->r80Result = *pr80Val;
7115 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7116 {
7117 fFsw |= X86_FSW_DE;
7118 if (fFcw & X86_FCW_DM)
7119 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7120 else
7121 {
7122 pFpuRes->r80Result = *pr80Val;
7123 fFsw |= X86_FSW_ES | X86_FSW_B;
7124 }
7125 }
7126 else
7127 {
7128 if (fFcw & X86_FCW_IM)
7129 {
7130 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7131 pFpuRes->r80Result = g_r80Indefinite;
7132 else
7133 {
7134 pFpuRes->r80Result = *pr80Val;
7135 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7136 }
7137 }
7138 else
7139 {
7140 pFpuRes->r80Result = *pr80Val;
7141 fFsw |= X86_FSW_ES | X86_FSW_B;
7142 }
7143 fFsw |= X86_FSW_IE;
7144 }
7145 pFpuRes->FSW = fFsw;
7146}
7147
7148
7149/**
 * @code{.unparsed}
 *      f(x) = 2^x - 1 = e^(x * ln2) - 1
 * @endcode
7155 *
7156 * We can approximate e^x by a Taylor/Maclaurin series (see
7157 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
 * @code{.unparsed}
 *      SUM(n=0..inf) x^n / n! = x^0/0! + x^1/1! + x^2/2! + x^3/3! + x^4/4! + ...
 *
 *                             = 1 + x + x^2/2! + x^3/3! + x^4/4! + ...
 * @endcode
7169 *
7170 * Given z = x * ln2, we get:
 * @code{.unparsed}
 *      e^z - 1 = z + z^2/2! + z^3/3! + z^4/4! + ... + z^n/n!
 * @endcode
7177 *
7178 * Wanting to use Horner's method, we move one z outside and get:
 * @code{.unparsed}
 *      = z * (1 + z/2! + z^2/3! + z^3/4! + ... + z^(n-1)/n!)
 * @endcode
7185 *
7186 * The constants we need for using Horner's methods are 1 and 1 / n!.
7187 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because
 * we don't have the necessary precision to represent 1.0 + z/3 + ...
 * and can approximate it to be 1.0. For a visual demonstration of this
 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7193 *
7194 *
7195 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7196 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7197 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7198 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7199 * blocks). (The one bit difference is probably an implicit one missing from
7200 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7201 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7202 * exponent.
7203 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE,
 * there is always a portion of rounding differences. Not going to spend too
 * much time on getting this 100% the same, at least not now.
7208 *
 * P.S. If someone is really curious about the 8087 and its constants:
7210 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7211 *
7212 *
7213 * @param pr80Val The exponent value (x), less than 1.0, greater than
7214 * -1.0 and not zero. This can be a normal, denormal
7215 * or pseudo-denormal value.
7216 * @param pr80Result Where to return the result.
7217 * @param fFcw FPU control word.
7218 * @param fFsw FPU status word.
7219 */
7220static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7221{
7222 /* As mentioned above, we can skip the expensive polynomial calculation
7223 as it will be close enough to 1.0 that it makes no difference.
7224
7225 The cutoff point for intel 10980XE is exponents >= -69. Intel
7226 also seems to be using a 67-bit or 68-bit constant value, and we get
7227 a smattering of rounding differences if we go for higher precision. */
7228 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7229 {
7230 RTUINT256U u256;
7231 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7232 u256.QWords.qw0 |= 1; /* force #PE */
7233 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7234 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7235 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7236 : 1 - RTFLOAT80U_EXP_BIAS,
7237 fFcw, fFsw);
7238 }
7239 else
7240 {
7241#ifdef IEM_WITH_FLOAT128_FOR_FPU
7242 /* This approach is not good enough for small values, we end up with zero. */
7243 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7244 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7245 _Float128 rd128Result = powf128(2.0L, rd128Val);
7246 rd128Result -= 1.0L;
7247 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7248 iemFpuF128RestoreRounding(fOldRounding);
7249
7250# else
7251 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7252 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7253
7254 /* As mentioned above, enforce 68-bit internal mantissa width to better
7255 match the Intel 10980XE results. */
7256 unsigned const cPrecision = 68;
7257
7258 /* first calculate z = x * ln2 */
7259 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7260 cPrecision);
7261
7262 /* Then do the polynomial evaluation. */
7263 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7264 cPrecision, &SoftState);
7265 r = f128_mul(z, r, &SoftState);
7266
7267 /* Output the result. */
7268 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7269# endif
7270 }
7271 return fFsw;
7272}
7273
7274
7275IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7276{
7277 uint16_t const fFcw = pFpuState->FCW;
7278 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7279
7280 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7281 {
7282 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7283 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7284 else
7285 {
7286 /* Special case:
7287 2^+1.0 - 1.0 = 1.0
7288 2^-1.0 - 1.0 = -0.5 */
7289 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7290 && pr80Val->s.uMantissa == RT_BIT_64(63))
7291 {
7292 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7293 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7294 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7295 }
7296 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7297 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7298 else
7299 pFpuRes->r80Result = *pr80Val;
7300 fFsw |= X86_FSW_PE;
7301 if (!(fFcw & X86_FCW_PM))
7302 fFsw |= X86_FSW_ES | X86_FSW_B;
7303 }
7304 }
7305 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7306 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7307 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7308 pFpuRes->r80Result = *pr80Val;
7309 else if (RTFLOAT80U_IS_INF(pr80Val))
7310 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7311 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7312 {
7313 fFsw |= X86_FSW_DE;
7314 if (fFcw & X86_FCW_DM)
7315 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7316 else
7317 {
7318 pFpuRes->r80Result = *pr80Val;
7319 fFsw |= X86_FSW_ES | X86_FSW_B;
7320 }
7321 }
7322 else
7323 {
7324 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7325 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7326 && (fFcw & X86_FCW_IM))
7327 pFpuRes->r80Result = g_r80Indefinite;
7328 else
7329 {
7330 pFpuRes->r80Result = *pr80Val;
7331 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7332 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7333 }
7334 fFsw |= X86_FSW_IE;
7335 if (!(fFcw & X86_FCW_IM))
7336 fFsw |= X86_FSW_ES | X86_FSW_B;
7337 }
7338 pFpuRes->FSW = fFsw;
7339}
7340
7341#endif /* IEM_WITHOUT_ASSEMBLY */
7342
/**
 * The `_amd` suffixed F2XM1 entry point.
 *
 * Forwards all three arguments unchanged to iemAImpl_f2xm1_r80.  It is
 * defined outside the IEM_WITHOUT_ASSEMBLY conditional blocks around it.
 *
 * @param   pFpuState   Passed through to iemAImpl_f2xm1_r80.
 * @param   pFpuRes     Passed through to iemAImpl_f2xm1_r80.
 * @param   pr80Val     Passed through to iemAImpl_f2xm1_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7347
/**
 * The `_intel` suffixed F2XM1 entry point.
 *
 * Forwards all three arguments unchanged to iemAImpl_f2xm1_r80.  It is
 * defined outside the IEM_WITHOUT_ASSEMBLY conditional blocks around it.
 *
 * @param   pFpuState   Passed through to iemAImpl_f2xm1_r80.
 * @param   pFpuRes     Passed through to iemAImpl_f2xm1_r80.
 * @param   pr80Val     Passed through to iemAImpl_f2xm1_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7352
7353#ifdef IEM_WITHOUT_ASSEMBLY
7354
7355IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7356{
7357 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7358 pFpuRes->r80Result = *pr80Val;
7359 pFpuRes->r80Result.s.fSign = 0;
7360}
7361
7362
7363IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7364{
7365 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7366 pFpuRes->r80Result = *pr80Val;
7367 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7368}
7369
7370
7371IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7372{
7373 uint16_t const fFcw = pFpuState->FCW;
7374 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7375
7376 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7377 {
7378 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7379 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7380
7381 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7382 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7383 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7384 }
7385 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7386 {
7387 fFsw |= X86_FSW_ZE;
7388 if (fFcw & X86_FCW_ZM)
7389 {
7390 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7391 pFpuResTwo->r80Result2 = *pr80Val;
7392 }
7393 else
7394 {
7395 pFpuResTwo->r80Result2 = *pr80Val;
7396 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7397 }
7398 }
7399 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7400 {
7401 fFsw |= X86_FSW_DE;
7402 if (fFcw & X86_FCW_DM)
7403 {
7404 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7405 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7406 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7407 int32_t iExponent = -16382;
7408 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7409 {
7410 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7411 iExponent--;
7412 }
7413
7414 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7415 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7416 }
7417 else
7418 {
7419 pFpuResTwo->r80Result2 = *pr80Val;
7420 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7421 }
7422 }
7423 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7424 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7425 {
7426 pFpuResTwo->r80Result1 = *pr80Val;
7427 pFpuResTwo->r80Result2 = *pr80Val;
7428 }
7429 else if (RTFLOAT80U_IS_INF(pr80Val))
7430 {
7431 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7432 pFpuResTwo->r80Result2 = *pr80Val;
7433 }
7434 else
7435 {
7436 if (fFcw & X86_FCW_IM)
7437 {
7438 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7439 pFpuResTwo->r80Result1 = g_r80Indefinite;
7440 else
7441 {
7442 pFpuResTwo->r80Result1 = *pr80Val;
7443 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7444 }
7445 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7446 }
7447 else
7448 {
7449 pFpuResTwo->r80Result2 = *pr80Val;
7450 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7451 }
7452 fFsw |= X86_FSW_IE;
7453 }
7454 pFpuResTwo->FSW = fFsw;
7455}
7456#endif /* IEM_WITHOUT_ASSEMBLY */
7457
7458#if defined(IEM_WITHOUT_ASSEMBLY)
7459
7460static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7461{
7462 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7463 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7464 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7465 extFloat80_t v;
7466 (void)fFcw;
7467
7468 v = extF80_ylog2x(y, x, &SoftState);
7469 iemFpuSoftF80ToIprt(pr80Result, v);
7470
7471 return fFsw;
7472}
7473
7474IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7475 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7476{
7477 uint16_t const fFcw = pFpuState->FCW;
7478 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7479
7480 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7481 {
7482 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7483
7484 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7485 if (!(fFcw & X86_FCW_PM))
7486 fFsw |= X86_FSW_ES | X86_FSW_B;
7487 }
7488 else
7489 {
7490 fFsw |= X86_FSW_IE;
7491
7492 if (!(fFcw & X86_FCW_IM))
7493 {
7494 pFpuRes->r80Result = *pr80Val2;
7495 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7496 }
7497 else
7498 {
7499 pFpuRes->r80Result = g_r80Indefinite;
7500 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7501 }
7502 }
7503
7504 pFpuRes->FSW = fFsw;
7505}
7506#endif /* IEM_WITHOUT_ASSEMBLY */
7507
/**
 * The `_intel` suffixed FYL2X entry point.
 *
 * Forwards all four arguments unchanged to iemAImpl_fyl2x_r80_by_r80.  It is
 * defined outside the IEM_WITHOUT_ASSEMBLY conditional blocks around it.
 *
 * @param   pFpuState   Passed through to iemAImpl_fyl2x_r80_by_r80.
 * @param   pFpuRes     Passed through to iemAImpl_fyl2x_r80_by_r80.
 * @param   pr80Val1    Passed through to iemAImpl_fyl2x_r80_by_r80.
 * @param   pr80Val2    Passed through to iemAImpl_fyl2x_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7513
/**
 * The `_amd` suffixed FYL2X entry point.
 *
 * Forwards all four arguments unchanged to iemAImpl_fyl2x_r80_by_r80.  It is
 * defined outside the IEM_WITHOUT_ASSEMBLY conditional blocks around it.
 *
 * @param   pFpuState   Passed through to iemAImpl_fyl2x_r80_by_r80.
 * @param   pFpuRes     Passed through to iemAImpl_fyl2x_r80_by_r80.
 * @param   pr80Val1    Passed through to iemAImpl_fyl2x_r80_by_r80.
 * @param   pr80Val2    Passed through to iemAImpl_fyl2x_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7519
7520#if defined(IEM_WITHOUT_ASSEMBLY)
7521
7522static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7523{
7524 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7525 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7526 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7527 extFloat80_t v;
7528 (void)fFcw;
7529
7530 v = extF80_ylog2xp1(y, x, &SoftState);
7531 iemFpuSoftF80ToIprt(pr80Result, v);
7532
7533 return fFsw;
7534}
7535
7536IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7537 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7538{
7539 uint16_t const fFcw = pFpuState->FCW;
7540 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7541
7542 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7543 {
7544 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7545
7546 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7547 if (!(fFcw & X86_FCW_PM))
7548 fFsw |= X86_FSW_ES | X86_FSW_B;
7549 }
7550 else
7551 {
7552 fFsw |= X86_FSW_IE;
7553
7554 if (!(fFcw & X86_FCW_IM))
7555 {
7556 pFpuRes->r80Result = *pr80Val2;
7557 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7558 }
7559 else
7560 {
7561 pFpuRes->r80Result = g_r80Indefinite;
7562 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7563 }
7564 }
7565
7566 pFpuRes->FSW = fFsw;
7567}
7568
7569#endif /* IEM_WITHOUT_ASSEMBLY */
7570
/**
 * The `_intel` suffixed FYL2XP1 entry point.
 *
 * Forwards all four arguments unchanged to iemAImpl_fyl2xp1_r80_by_r80.  It
 * is defined after the preceding IEM_WITHOUT_ASSEMBLY conditional block ends.
 *
 * @param   pFpuState   Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 * @param   pFpuRes     Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 * @param   pr80Val1    Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 * @param   pr80Val2    Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                           PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7576
/**
 * The `_amd` suffixed FYL2XP1 entry point.
 *
 * Forwards all four arguments unchanged to iemAImpl_fyl2xp1_r80_by_r80.  It
 * is defined after the preceding IEM_WITHOUT_ASSEMBLY conditional block ends.
 *
 * @param   pFpuState   Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 * @param   pFpuRes     Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 * @param   pr80Val1    Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 * @param   pr80Val2    Passed through to iemAImpl_fyl2xp1_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7582
7583
7584/*********************************************************************************************************************************
7585* MMX, SSE & AVX *
7586*********************************************************************************************************************************/
7587
7588#ifdef IEM_WITH_VEX
7589
7590/*
7591 * VMOVSLDUP
7592 */
7593IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7594{
7595 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7596 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7597 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7598 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7599 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7600 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7601 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7602 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7603}
7604
7605
7606IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7607{
7608 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7609 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7610 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7611 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7612 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7613 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7614 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7615 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7616}
7617
7618#endif /* IEM_WITH_VEX */
7619
7620
7621#ifdef IEM_WITH_VEX
7622
7623/*
7624 * VMOVSHDUP
7625 */
7626IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7627{
7628 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7629 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7630 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7631 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7632 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7633 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7634 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7635 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7636}
7637
7638
7639IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7640{
7641 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7642 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7643 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7644 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7645 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7646 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7647 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7648 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7649}
7650
7651#endif /* IEM_WITH_VEX */
7652
7653
7654#ifdef IEM_WITH_VEX
7655
7656/*
7657 * VMOVDDUP
7658 */
7659IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7660{
7661 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7662 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7663 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7664 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7665}
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7668{
7669 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7670 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7671 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7672 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7673}
7674
7675#endif /* IEM_WITH_VEX */
7676
7677
7678/*
7679 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7680 */
7681#ifdef IEM_WITHOUT_ASSEMBLY
7682
7683IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7684{
7685 RT_NOREF(pFpuState);
7686 *puDst &= *puSrc;
7687}
7688
7689
7690IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7691{
7692 RT_NOREF(pFpuState);
7693 puDst->au64[0] &= puSrc->au64[0];
7694 puDst->au64[1] &= puSrc->au64[1];
7695}
7696
7697#endif
7698
7699IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7700 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7701{
7702 RT_NOREF(pExtState);
7703 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7704 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7705}
7706
7707
7708IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7709 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7710{
7711 RT_NOREF(pExtState);
7712 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7713 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7714 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7715 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7716}
7717
7718
7719/*
7720 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7721 */
7722#ifdef IEM_WITHOUT_ASSEMBLY
7723
7724IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7725{
7726 RT_NOREF(pFpuState);
7727 *puDst = ~*puDst & *puSrc;
7728}
7729
7730
7731IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7732{
7733 RT_NOREF(pFpuState);
7734 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7735 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7736}
7737
7738#endif
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7745 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7746}
7747
7748
7749IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7750 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7751{
7752 RT_NOREF(pExtState);
7753 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7754 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7755 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7756 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7757}
7758
7759
7760/*
7761 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7762 */
7763#ifdef IEM_WITHOUT_ASSEMBLY
7764
7765IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7766{
7767 RT_NOREF(pFpuState);
7768 *puDst |= *puSrc;
7769}
7770
7771
7772IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7773{
7774 RT_NOREF(pFpuState);
7775 puDst->au64[0] |= puSrc->au64[0];
7776 puDst->au64[1] |= puSrc->au64[1];
7777}
7778
7779#endif
7780
7781IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7783{
7784 RT_NOREF(pExtState);
7785 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7786 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7787}
7788
7789
7790IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7791 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7792{
7793 RT_NOREF(pExtState);
7794 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7795 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7796 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7797 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7798}
7799
7800
7801/*
7802 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7803 */
7804#ifdef IEM_WITHOUT_ASSEMBLY
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7807{
7808 RT_NOREF(pFpuState);
7809 *puDst ^= *puSrc;
7810}
7811
7812
7813IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7814{
7815 RT_NOREF(pFpuState);
7816 puDst->au64[0] ^= puSrc->au64[0];
7817 puDst->au64[1] ^= puSrc->au64[1];
7818}
7819
7820#endif
7821
7822IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7823 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7824{
7825 RT_NOREF(pExtState);
7826 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7827 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7828}
7829
7830
7831IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7832 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7833{
7834 RT_NOREF(pExtState);
7835 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7836 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7837 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7838 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7839}
7840
7841
7842/*
7843 * PCMPEQB / VPCMPEQB
7844 */
7845#ifdef IEM_WITHOUT_ASSEMBLY
7846
7847IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7848{
7849 RT_NOREF(pFpuState);
7850 RTUINT64U uSrc1 = { *puDst };
7851 RTUINT64U uSrc2 = { *puSrc };
7852 RTUINT64U uDst;
7853 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7854 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7855 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7856 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7857 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7858 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7859 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7860 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7861 *puDst = uDst.u;
7862}
7863
7864
7865IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7866{
7867 RT_NOREF(pFpuState);
7868 RTUINT128U uSrc1 = *puDst;
7869 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7870 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7871 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7872 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7873 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7874 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7875 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7876 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7877 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7878 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7879 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7880 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7881 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7882 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7883 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7884 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7885}
7886
7887#endif
7888
7889IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7890 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7891{
7892 RT_NOREF(pExtState);
7893 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7894 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7895 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7896 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7897 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7898 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7899 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7900 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7901 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7902 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7903 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7904 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7905 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7906 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7907 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7908 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7909}
7910
7911IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7912 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7913{
7914 RT_NOREF(pExtState);
7915 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7916 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7917 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7918 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7919 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7920 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7921 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7922 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7923 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7924 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7925 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7926 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7927 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7928 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7929 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7930 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7931 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7932 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7933 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7934 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7935 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7936 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7937 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7938 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7939 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7940 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7941 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7942 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7943 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7944 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7945 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7946 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7947}
7948
7949
7950/*
7951 * PCMPEQW / VPCMPEQW
7952 */
7953#ifdef IEM_WITHOUT_ASSEMBLY
7954
7955IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7956{
7957 RT_NOREF(pFpuState);
7958 RTUINT64U uSrc1 = { *puDst };
7959 RTUINT64U uSrc2 = { *puSrc };
7960 RTUINT64U uDst;
7961 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7962 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7963 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7964 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7965 *puDst = uDst.u;
7966}
7967
7968
7969IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7970{
7971 RT_NOREF(pFpuState);
7972 RTUINT128U uSrc1 = *puDst;
7973 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7974 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7975 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7976 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7977 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7978 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7979 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7980 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7981}
7982
7983#endif
7984
7985IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7986 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7987{
7988 RT_NOREF(pExtState);
7989 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7990 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7991 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7992 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7993 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7994 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7995 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7996 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7997}
7998
7999IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8000 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8001{
8002 RT_NOREF(pExtState);
8003 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8004 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8005 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8006 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8007 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8008 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8009 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8010 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8011 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8012 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8013 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8014 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8015 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8016 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8017 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8018 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8019}
8020
8021
8022/*
8023 * PCMPEQD / VPCMPEQD.
8024 */
8025#ifdef IEM_WITHOUT_ASSEMBLY
8026
8027IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8028{
8029 RT_NOREF(pFpuState);
8030 RTUINT64U uSrc1 = { *puDst };
8031 RTUINT64U uSrc2 = { *puSrc };
8032 RTUINT64U uDst;
8033 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8034 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8035 *puDst = uDst.u;
8036}
8037
8038
8039IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8040{
8041 RT_NOREF(pFpuState);
8042 RTUINT128U uSrc1 = *puDst;
8043 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8044 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8045 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8046 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8047}
8048
8049#endif /* IEM_WITHOUT_ASSEMBLY */
8050
8051IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8052 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8053{
8054 RT_NOREF(pExtState);
8055 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8056 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8057 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8058 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8059}
8060
8061IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8062 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8063{
8064 RT_NOREF(pExtState);
8065 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8066 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8067 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8068 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8069 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8070 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8071 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8072 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8073}
8074
8075
8076/*
8077 * PCMPEQQ / VPCMPEQQ.
8078 */
8079IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8080{
8081 RT_NOREF(pFpuState);
8082 RTUINT128U uSrc1 = *puDst;
8083 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8084 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8085}
8086
8087IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8088 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8089{
8090 RT_NOREF(pExtState);
8091 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8092 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8093}
8094
8095IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8096 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8097{
8098 RT_NOREF(pExtState);
8099 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8100 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8101 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8102 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8103}
8104
8105
8106/*
8107 * PCMPGTB / VPCMPGTB
8108 */
8109#ifdef IEM_WITHOUT_ASSEMBLY
8110
8111IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8112{
8113 RT_NOREF(pFpuState);
8114 RTUINT64U uSrc1 = { *puDst };
8115 RTUINT64U uSrc2 = { *puSrc };
8116 RTUINT64U uDst;
8117 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8118 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8119 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8120 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8121 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8122 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8123 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8124 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8125 *puDst = uDst.u;
8126}
8127
8128
8129IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8130{
8131 RT_NOREF(pFpuState);
8132 RTUINT128U uSrc1 = *puDst;
8133 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8134 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8135 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8136 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8137 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8138 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8139 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8140 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8141 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8142 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8143 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8144 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8145 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8146 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8147 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8148 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8149}
8150
8151#endif
8152
8153IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8154 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8155{
8156 RT_NOREF(pExtState);
8157 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8158 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8159 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8160 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8161 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8162 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8163 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8164 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8165 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8166 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8167 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8168 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8169 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8170 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8171 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8172 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8173}
8174
8175IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8176 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8177{
8178 RT_NOREF(pExtState);
8179 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8180 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8181 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8182 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8183 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8184 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8185 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8186 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8187 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8188 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8189 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8190 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8191 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8192 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8193 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8194 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8195 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8196 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8197 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8198 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8199 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8200 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8201 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8202 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8203 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8204 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8205 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8206 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8207 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8208 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8209 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8210 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8211}
8212
8213
8214/*
8215 * PCMPGTW / VPCMPGTW
8216 */
8217#ifdef IEM_WITHOUT_ASSEMBLY
8218
8219IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8220{
8221 RT_NOREF(pFpuState);
8222 RTUINT64U uSrc1 = { *puDst };
8223 RTUINT64U uSrc2 = { *puSrc };
8224 RTUINT64U uDst;
8225 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8226 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8227 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8228 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8229 *puDst = uDst.u;
8230}
8231
8232
8233IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8234{
8235 RT_NOREF(pFpuState);
8236 RTUINT128U uSrc1 = *puDst;
8237 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8238 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8239 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8240 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8241 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8242 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8243 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8244 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8245}
8246
8247#endif
8248
8249IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8250 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8251{
8252 RT_NOREF(pExtState);
8253 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8254 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8255 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8256 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8257 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8258 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8259 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8260 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8261}
8262
8263IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8264 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8265{
8266 RT_NOREF(pExtState);
8267 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8268 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8269 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8270 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8271 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8272 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8273 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8274 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8275 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8276 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8277 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8278 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8279 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8280 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8281 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8282 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8283}
8284
8285
8286/*
8287 * PCMPGTD / VPCMPGTD.
8288 */
8289#ifdef IEM_WITHOUT_ASSEMBLY
8290
8291IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8292{
8293 RT_NOREF(pFpuState);
8294 RTUINT64U uSrc1 = { *puDst };
8295 RTUINT64U uSrc2 = { *puSrc };
8296 RTUINT64U uDst;
8297 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8298 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8299 *puDst = uDst.u;
8300}
8301
8302
8303IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8304{
8305 RT_NOREF(pFpuState);
8306 RTUINT128U uSrc1 = *puDst;
8307 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8308 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8309 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8310 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8311}
8312
8313#endif /* IEM_WITHOUT_ASSEMBLY */
8314
8315IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8316 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8317{
8318 RT_NOREF(pExtState);
8319 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8320 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8321 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8322 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8323}
8324
8325IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8326 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8327{
8328 RT_NOREF(pExtState);
8329 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8330 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8331 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8332 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8333 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8334 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8335 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8336 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8337}
8338
8339
8340/*
8341 * PCMPGTQ / VPCMPGTQ.
8342 */
8343IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8344{
8345 RT_NOREF(pFpuState);
8346 RTUINT128U uSrc1 = *puDst;
8347 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8348 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8349}
8350
8351IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8352 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8353{
8354 RT_NOREF(pExtState);
8355 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8356 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8357}
8358
8359IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8360 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8361{
8362 RT_NOREF(pExtState);
8363 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8364 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8365 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8366 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8367}
8368
8369
8370/*
8371 * PADDB / VPADDB
8372 */
8373#ifdef IEM_WITHOUT_ASSEMBLY
8374
8375IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8376{
8377 RT_NOREF(pFpuState);
8378 RTUINT64U uSrc1 = { *puDst };
8379 RTUINT64U uSrc2 = { *puSrc };
8380 RTUINT64U uDst;
8381 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8382 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8383 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8384 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8385 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8386 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8387 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8388 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8389 *puDst = uDst.u;
8390}
8391
8392
8393IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8394{
8395 RT_NOREF(pFpuState);
8396 RTUINT128U uSrc1 = *puDst;
8397 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8398 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8399 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8400 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8401 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8402 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8403 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8404 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8405 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8406 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8407 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8408 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8409 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8410 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8411 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8412 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8413}
8414
8415#endif
8416
8417
8418IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8419 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8420{
8421 RT_NOREF(pExtState);
8422 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8423 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8424 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8425 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8426 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8427 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8428 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8429 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8430 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8431 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8432 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8433 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8434 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8435 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8436 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8437 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8438}
8439
8440IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8441 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8442{
8443 RT_NOREF(pExtState);
8444 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8445 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8446 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8447 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8448 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8449 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8450 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8451 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8452 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8453 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8454 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8455 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8456 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8457 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8458 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8459 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8460 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8461 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8462 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8463 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8464 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8465 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8466 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8467 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8468 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8469 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8470 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8471 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8472 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8473 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8474 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8475 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8476}
8477
8478
8479/*
8480 * PADDSB / VPADDSB
8481 */
/**
 * Clamps a signed 16-bit-range value to the signed byte range and yields the
 * resulting byte pattern.
 *
 * A value that fits (i.e. biasing it by 0x80 lands in 0..0xff) is simply
 * truncated to 8 bits.  Otherwise the result is 0x7f plus the value's bit 15,
 * giving 0x7f when bit 15 is clear and 0x80 when it is set.
 *
 * @note    The argument is evaluated more than once; pass side-effect free
 *          expressions only.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* out of range: 0x7f or 0x80 depending on bit 15 */ \
      : (uint8_t)(a_iWord) )
8486
8487#ifdef IEM_WITHOUT_ASSEMBLY
8488
8489IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8490{
8491 RT_NOREF(pFpuState);
8492 RTUINT64U uSrc1 = { *puDst };
8493 RTUINT64U uSrc2 = { *puSrc };
8494 RTUINT64U uDst;
8495 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8496 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8497 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8498 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8499 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8500 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8501 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8502 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8503 *puDst = uDst.u;
8504}
8505
8506
8507IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8508{
8509 RT_NOREF(pFpuState);
8510 RTUINT128U uSrc1 = *puDst;
8511 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8512 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8513 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8514 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8515 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8516 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8517 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8518 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8519 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8520 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8521 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8522 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8523 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8524 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8525 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8526 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8527}
8528
8529#endif
8530
8531
/*
 * PADDUSB / VPADDUSB
 */
/**
 * Clamps an unsigned 16-bit value to the unsigned byte range.
 *
 * The argument is viewed as a uint16_t; anything above 0xff yields 0xff,
 * everything else is truncated to 8 bits unchanged.
 *
 * @note    The argument is evaluated more than once; pass side-effect free
 *          expressions only.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff /* 0xff = UINT8_MAX */ \
      : (uint8_t)(a_uWord) )
8539
8540#ifdef IEM_WITHOUT_ASSEMBLY
8541
8542IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8543{
8544 RT_NOREF(pFpuState);
8545 RTUINT64U uSrc1 = { *puDst };
8546 RTUINT64U uSrc2 = { *puSrc };
8547 RTUINT64U uDst;
8548 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8549 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8550 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8551 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8552 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8553 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8554 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8555 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8556 *puDst = uDst.u;
8557}
8558
8559
8560IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8561{
8562 RT_NOREF(pFpuState);
8563 RTUINT128U uSrc1 = *puDst;
8564 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8565 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8566 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8567 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8568 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8569 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8570 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8571 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8572 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8573 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8574 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8575 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8576 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8577 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8578 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8579 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8580}
8581
8582#endif
8583
8584
8585/*
8586 * PADDW / VPADDW
8587 */
8588#ifdef IEM_WITHOUT_ASSEMBLY
8589
8590IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8591{
8592 RT_NOREF(pFpuState);
8593 RTUINT64U uSrc1 = { *puDst };
8594 RTUINT64U uSrc2 = { *puSrc };
8595 RTUINT64U uDst;
8596 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8597 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8598 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8599 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8600 *puDst = uDst.u;
8601}
8602
8603
8604IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8605{
8606 RT_NOREF(pFpuState);
8607 RTUINT128U uSrc1 = *puDst;
8608 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8609 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8610 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8611 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8612 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8613 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8614 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8615 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8616}
8617
8618#endif
8619
8620
8621IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8622 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8623{
8624 RT_NOREF(pExtState);
8625 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8626 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8627 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8628 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8629 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8630 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8631 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8632 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8633}
8634
8635IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8636 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8637{
8638 RT_NOREF(pExtState);
8639 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8640 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8641 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8642 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8643 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8644 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8645 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8646 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8647 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8648 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8649 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8650 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8651 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8652 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8653 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8654 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8655}
8656
8657
8658/*
8659 * PADDSW / VPADDSW
8660 */
/*
 * Clamps a signed intermediate to the signed 16-bit range and yields it as an
 * unsigned 16-bit pattern.  Biasing by 0x8000 maps the in-range values onto
 * 0..0xffff; anything else saturates to 0x7fff (INT16_MAX), plus one when the
 * sign bit (bit 31) of the input is set, which gives 0x8000 (INT16_MIN).
 * Note: the argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8665
8666#ifdef IEM_WITHOUT_ASSEMBLY
8667
8668IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8669{
8670 RT_NOREF(pFpuState);
8671 RTUINT64U uSrc1 = { *puDst };
8672 RTUINT64U uSrc2 = { *puSrc };
8673 RTUINT64U uDst;
8674 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8675 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8676 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8677 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8678 *puDst = uDst.u;
8679}
8680
8681
8682IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8683{
8684 RT_NOREF(pFpuState);
8685 RTUINT128U uSrc1 = *puDst;
8686 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8687 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8688 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8689 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8690 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8691 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8692 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8693 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8694}
8695
8696#endif
8697
8698
8699/*
8700 * PADDUSW / VPADDUSW
8701 */
/*
 * Clamps an unsigned intermediate sum to the 16-bit range: values above
 * 0xffff (UINT16_MAX) saturate to 0xffff, everything else passes through.
 * Note: the argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8706
8707#ifdef IEM_WITHOUT_ASSEMBLY
8708
8709IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8710{
8711 RT_NOREF(pFpuState);
8712 RTUINT64U uSrc1 = { *puDst };
8713 RTUINT64U uSrc2 = { *puSrc };
8714 RTUINT64U uDst;
8715 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8716 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8717 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8718 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8719 *puDst = uDst.u;
8720}
8721
8722
8723IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8724{
8725 RT_NOREF(pFpuState);
8726 RTUINT128U uSrc1 = *puDst;
8727 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8728 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8729 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8730 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8731 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8732 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8733 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8734 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8735}
8736
8737#endif
8738
8739
8740/*
8741 * PADDD / VPADDD.
8742 */
8743#ifdef IEM_WITHOUT_ASSEMBLY
8744
8745IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8746{
8747 RT_NOREF(pFpuState);
8748 RTUINT64U uSrc1 = { *puDst };
8749 RTUINT64U uSrc2 = { *puSrc };
8750 RTUINT64U uDst;
8751 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8752 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8753 *puDst = uDst.u;
8754}
8755
8756
8757IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8758{
8759 RT_NOREF(pFpuState);
8760 RTUINT128U uSrc1 = *puDst;
8761 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8762 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8763 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8764 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8765}
8766
8767#endif /* IEM_WITHOUT_ASSEMBLY */
8768
8769IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8770 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8771{
8772 RT_NOREF(pExtState);
8773 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8774 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8775 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8776 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8777}
8778
8779IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8780 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8781{
8782 RT_NOREF(pExtState);
8783 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8784 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8785 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8786 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8787 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8788 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8789 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8790 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8791}
8792
8793
8794/*
8795 * PADDQ / VPADDQ.
8796 */
8797#ifdef IEM_WITHOUT_ASSEMBLY
8798
8799IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8800{
8801 RT_NOREF(pFpuState);
8802 *puDst = *puDst + *puSrc;
8803}
8804
8805IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8806{
8807 RT_NOREF(pFpuState);
8808 RTUINT128U uSrc1 = *puDst;
8809 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8810 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8811}
8812
8813#endif
8814
8815IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8816 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8817{
8818 RT_NOREF(pExtState);
8819 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8820 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8821}
8822
8823IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8824 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8825{
8826 RT_NOREF(pExtState);
8827 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8828 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8829 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8830 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8831}
8832
8833
8834/*
8835 * PSUBB / VPSUBB
8836 */
8837#ifdef IEM_WITHOUT_ASSEMBLY
8838
8839IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8840{
8841 RT_NOREF(pFpuState);
8842 RTUINT64U uSrc1 = { *puDst };
8843 RTUINT64U uSrc2 = { *puSrc };
8844 RTUINT64U uDst;
8845 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8846 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8847 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8848 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8849 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8850 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8851 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8852 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8853 *puDst = uDst.u;
8854}
8855
8856
8857IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8858{
8859 RT_NOREF(pFpuState);
8860 RTUINT128U uSrc1 = *puDst;
8861 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8862 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8863 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8864 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8865 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8866 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8867 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8868 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8869 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8870 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8871 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8872 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8873 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8874 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8875 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8876 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8877}
8878
8879#endif
8880
8881IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8882 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8883{
8884 RT_NOREF(pExtState);
8885 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8886 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8887 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8888 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8889 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8890 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8891 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8892 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8893 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8894 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8895 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8896 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8897 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8898 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8899 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8900 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8901}
8902
8903IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8904 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8905{
8906 RT_NOREF(pExtState);
8907 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8908 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8909 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8910 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8911 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8912 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8913 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8914 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8915 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8916 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8917 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8918 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8919 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8920 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8921 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8922 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8923 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8924 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8925 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8926 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8927 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8928 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8929 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8930 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8931 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8932 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8933 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8934 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8935 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8936 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8937 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8938 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8939}
8940
8941
8942/*
 * PSUBSB / VPSUBSB
8944 */
8945#ifdef IEM_WITHOUT_ASSEMBLY
8946
8947IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8948{
8949 RT_NOREF(pFpuState);
8950 RTUINT64U uSrc1 = { *puDst };
8951 RTUINT64U uSrc2 = { *puSrc };
8952 RTUINT64U uDst;
8953 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8954 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8955 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8956 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8957 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8958 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8959 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8960 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8961 *puDst = uDst.u;
8962}
8963
8964
8965IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8966{
8967 RT_NOREF(pFpuState);
8968 RTUINT128U uSrc1 = *puDst;
8969 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8970 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8971 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8972 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8973 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8974 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8975 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8976 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8977 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8978 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8979 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8980 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8981 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8982 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8983 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8984 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8985}
8986
8987#endif
8988
8989IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
8990 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8991{
8992 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
8993 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
8994 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
8995 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
8996 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
8997 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
8998 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
8999 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9000 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9001 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9002 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9003 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9004 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9005 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9006 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9007 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9008}
9009
9010IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9011 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9012{
9013 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9014 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9015 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9016 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9017 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9018 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9019 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9020 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9021 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9022 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9023 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9024 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9025 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9026 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9027 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9028 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9029 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9030 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9031 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9032 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9033 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9034 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9035 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9036 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9037 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9038 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9039 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9040 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9041 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9042 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9043 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9044 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9045}
9046
9047
9048/*
 * PSUBUSB / VPSUBUSB
9050 */
/*
 * Clamps the result of an unsigned byte subtraction to 0..0xff.  A negative
 * difference turns into a value above 0xff once cast to uint16_t and is
 * therefore saturated to zero; in-range values pass through.
 * Note: the argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9055
9056#ifdef IEM_WITHOUT_ASSEMBLY
9057
9058IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9059{
9060 RT_NOREF(pFpuState);
9061 RTUINT64U uSrc1 = { *puDst };
9062 RTUINT64U uSrc2 = { *puSrc };
9063 RTUINT64U uDst;
9064 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9065 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9066 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9067 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9068 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9069 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9070 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9071 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9072 *puDst = uDst.u;
9073}
9074
9075
9076IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9077{
9078 RT_NOREF(pFpuState);
9079 RTUINT128U uSrc1 = *puDst;
9080 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9081 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9082 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9083 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9084 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9085 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9086 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9087 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9088 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9089 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9090 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9091 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9092 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9093 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9094 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9095 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9096}
9097
9098#endif
9099
9100
9101/*
9102 * PSUBW / VPSUBW
9103 */
9104#ifdef IEM_WITHOUT_ASSEMBLY
9105
9106IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9107{
9108 RT_NOREF(pFpuState);
9109 RTUINT64U uSrc1 = { *puDst };
9110 RTUINT64U uSrc2 = { *puSrc };
9111 RTUINT64U uDst;
9112 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9113 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9114 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9115 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9116 *puDst = uDst.u;
9117}
9118
9119
9120IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9121{
9122 RT_NOREF(pFpuState);
9123 RTUINT128U uSrc1 = *puDst;
9124 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9125 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9126 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9127 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9128 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9129 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9130 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9131 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9132}
9133
9134#endif
9135
9136IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9137 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9138{
9139 RT_NOREF(pExtState);
9140 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9141 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9142 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9143 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9144 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9145 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9146 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9147 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9148}
9149
9150IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9151 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9152{
9153 RT_NOREF(pExtState);
9154 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9155 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9156 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9157 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9158 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9159 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9160 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9161 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9162 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9163 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9164 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9165 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9166 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9167 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9168 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9169 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9170}
9171
9172
9173/*
9174 * PSUBSW / VPSUBSW
9175 */
9176#ifdef IEM_WITHOUT_ASSEMBLY
9177
9178IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9179{
9180 RT_NOREF(pFpuState);
9181 RTUINT64U uSrc1 = { *puDst };
9182 RTUINT64U uSrc2 = { *puSrc };
9183 RTUINT64U uDst;
9184 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9185 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9186 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9187 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9188 *puDst = uDst.u;
9189}
9190
9191
9192IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9193{
9194 RT_NOREF(pFpuState);
9195 RTUINT128U uSrc1 = *puDst;
9196 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9197 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9198 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9199 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9200 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9201 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9202 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9203 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9204}
9205
9206#endif
9207
9208IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9209 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9210{
9211 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9212 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9213 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9214 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9215 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9216 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9217 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9218 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9219}
9220
9221IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9222 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9223{
9224 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9225 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9226 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9227 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9228 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9229 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9230 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9231 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9232 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9233 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9234 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9235 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9236 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9237 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9238 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9239 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9240}
9241
9242
9243/*
9244 * PSUBUSW / VPSUBUSW
9245 */
/**
 * Narrows the int-width result of an unsigned word subtraction to 16 bits.
 *
 * The argument is viewed as a uint32_t: anything that does not fit in 16 bits
 * (which includes every negative difference, since it becomes a huge unsigned
 * value) yields 0, everything else is passed through unchanged.
 *
 * @note The argument is expanded twice; do not pass expressions with side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint32_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9250
9251#ifdef IEM_WITHOUT_ASSEMBLY
9252
9253IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9254{
9255 RT_NOREF(pFpuState);
9256 RTUINT64U uSrc1 = { *puDst };
9257 RTUINT64U uSrc2 = { *puSrc };
9258 RTUINT64U uDst;
9259 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9260 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9261 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9262 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9263 *puDst = uDst.u;
9264}
9265
9266
9267IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9268{
9269 RT_NOREF(pFpuState);
9270 RTUINT128U uSrc1 = *puDst;
9271 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9272 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9273 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9274 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9275 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9276 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9277 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9278 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9279}
9280
9281#endif
9282
9283
9284/*
9285 * PSUBD / VPSUBD.
9286 */
9287#ifdef IEM_WITHOUT_ASSEMBLY
9288
9289IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9290{
9291 RT_NOREF(pFpuState);
9292 RTUINT64U uSrc1 = { *puDst };
9293 RTUINT64U uSrc2 = { *puSrc };
9294 RTUINT64U uDst;
9295 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9296 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9297 *puDst = uDst.u;
9298}
9299
9300
9301IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9302{
9303 RT_NOREF(pFpuState);
9304 RTUINT128U uSrc1 = *puDst;
9305 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9306 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9307 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9308 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9309}
9310
9311#endif /* IEM_WITHOUT_ASSEMBLY */
9312
9313IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9314 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9315{
9316 RT_NOREF(pExtState);
9317 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9318 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9319 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9320 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9321}
9322
9323IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9324 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9325{
9326 RT_NOREF(pExtState);
9327 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9328 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9329 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9330 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9331 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9332 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9333 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9334 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9335}
9336
9337
9338/*
9339 * PSUBQ / VPSUBQ.
9340 */
9341#ifdef IEM_WITHOUT_ASSEMBLY
9342
9343IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9344{
9345 RT_NOREF(pFpuState);
9346 *puDst = *puDst - *puSrc;
9347}
9348
9349IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9350{
9351 RT_NOREF(pFpuState);
9352 RTUINT128U uSrc1 = *puDst;
9353 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9354 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9355}
9356
9357#endif
9358
9359IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9360 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9361{
9362 RT_NOREF(pExtState);
9363 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9364 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9365}
9366
9367IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9368 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9369{
9370 RT_NOREF(pExtState);
9371 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9372 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9373 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9374 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9375}
9376
9377
9378
9379/*
9380 * PMULLW / VPMULLW / PMULLD / VPMULLD
9381 */
9382#ifdef IEM_WITHOUT_ASSEMBLY
9383
9384IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9385{
9386 RT_NOREF(pFpuState);
9387 RTUINT64U uSrc1 = { *puDst };
9388 RTUINT64U uSrc2 = { *puSrc };
9389 RTUINT64U uDst;
9390 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9391 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9392 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9393 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9394 *puDst = uDst.u;
9395}
9396
9397
9398IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9399{
9400 RT_NOREF(pFpuState);
9401 RTUINT128U uSrc1 = *puDst;
9402 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9403 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9404 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9405 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9406 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9407 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9408 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9409 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9410}
9411
9412#endif
9413
9414IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9415{
9416 RTUINT128U uSrc1 = *puDst;
9417
9418 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9419 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9420 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9421 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9422 RT_NOREF(pFpuState);
9423}
9424
9425
9426IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9427{
9428 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9429 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9430 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9431 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9432 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9433 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9434 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9435 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9436}
9437
9438
9439IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9440{
9441 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9442 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9443 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9444 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9445 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9446 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9447 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9448 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9449 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9450 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9451 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9452 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9453 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9454 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9455 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9456 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9457}
9458
9459
9460IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9461{
9462 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9463 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9464 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9465 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9466}
9467
9468
9469IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9470{
9471 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9472 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9473 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9474 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9475 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9476 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9477 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9478 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9479}
9480
9481
9482/*
9483 * PMULHW / VPMULHW
9484 */
9485#ifdef IEM_WITHOUT_ASSEMBLY
9486
9487IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9488{
9489 RT_NOREF(pFpuState);
9490 RTUINT64U uSrc1 = { *puDst };
9491 RTUINT64U uSrc2 = { *puSrc };
9492 RTUINT64U uDst;
9493 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9494 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9495 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9496 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9497 *puDst = uDst.u;
9498}
9499
9500
9501IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9502{
9503 RT_NOREF(pFpuState);
9504 RTUINT128U uSrc1 = *puDst;
9505 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9506 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9507 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9508 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9509 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9510 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9511 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9512 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9513}
9514
9515#endif
9516
9517IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9518{
9519 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9520 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9521 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9522 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9523 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9524 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9525 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9526 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9527}
9528
9529
9530IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9531{
9532 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9533 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9534 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9535 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9536 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9537 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9538 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9539 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9540 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9541 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9542 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9543 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9544 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9545 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9546 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9547 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9548}
9549
9550
9551/*
9552 * PMULHUW / VPMULHUW
9553 */
9554#ifdef IEM_WITHOUT_ASSEMBLY
9555
9556IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9557{
9558 RTUINT64U uSrc1 = { *puDst };
9559 RTUINT64U uSrc2 = { *puSrc };
9560 RTUINT64U uDst;
9561 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9562 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9563 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9564 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9565 *puDst = uDst.u;
9566}
9567
9568
9569IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9570{
9571 RTUINT128U uSrc1 = *puDst;
9572 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9573 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9574 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9575 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9576 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9577 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9578 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9579 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9580}
9581
9582#endif
9583
9584IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9585{
9586 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9587 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9588 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9589 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9590 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9591 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9592 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9593 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9594}
9595
9596
9597IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9598{
9599 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9600 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9601 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9602 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9603 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9604 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9605 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9606 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9607 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9608 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9609 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9610 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9611 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9612 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9613 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9614 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9615}
9616
9617
9618/*
9619 * PSRLW / VPSRLW
9620 */
9621#ifdef IEM_WITHOUT_ASSEMBLY
9622
9623IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9624{
9625 RTUINT64U uSrc1 = { *puDst };
9626 RTUINT64U uSrc2 = { *puSrc };
9627 RTUINT64U uDst;
9628
9629 if (uSrc2.au64[0] <= 15)
9630 {
9631 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9632 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9633 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9634 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9635 }
9636 else
9637 {
9638 uDst.au64[0] = 0;
9639 }
9640 *puDst = uDst.u;
9641}
9642
9643
9644IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9645{
9646 RTUINT64U uSrc1 = { *puDst };
9647 RTUINT64U uDst;
9648
9649 if (uShift <= 15)
9650 {
9651 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9652 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9653 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9654 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9655 }
9656 else
9657 {
9658 uDst.au64[0] = 0;
9659 }
9660 *puDst = uDst.u;
9661}
9662
9663
9664IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9665{
9666 RTUINT128U uSrc1 = *puDst;
9667
9668 if (puSrc->au64[0] <= 15)
9669 {
9670 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9671 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9672 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9673 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9674 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9675 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9676 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9677 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9678 }
9679 else
9680 {
9681 puDst->au64[0] = 0;
9682 puDst->au64[1] = 0;
9683 }
9684}
9685
9686IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9687{
9688 RTUINT128U uSrc1 = *puDst;
9689
9690 if (uShift <= 15)
9691 {
9692 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9693 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9694 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9695 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9696 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9697 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9698 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9699 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9700 }
9701 else
9702 {
9703 puDst->au64[0] = 0;
9704 puDst->au64[1] = 0;
9705 }
9706}
9707
9708#endif
9709
9710
9711/*
9712 * PSRAW / VPSRAW
9713 */
9714#ifdef IEM_WITHOUT_ASSEMBLY
9715
9716IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9717{
9718 RTUINT64U uSrc1 = { *puDst };
9719 RTUINT64U uSrc2 = { *puSrc };
9720 RTUINT64U uDst;
9721
9722 if (uSrc2.au64[0] <= 15)
9723 {
9724 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9725 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9726 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9727 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9728 }
9729 else
9730 {
9731 uDst.au64[0] = 0;
9732 }
9733 *puDst = uDst.u;
9734}
9735
9736
9737IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9738{
9739 RTUINT64U uSrc1 = { *puDst };
9740 RTUINT64U uDst;
9741
9742 if (uShift <= 15)
9743 {
9744 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9745 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9746 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9747 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9748 }
9749 else
9750 {
9751 uDst.au64[0] = 0;
9752 }
9753 *puDst = uDst.u;
9754}
9755
9756
9757IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9758{
9759 RTUINT128U uSrc1 = *puDst;
9760
9761 if (puSrc->au64[0] <= 15)
9762 {
9763 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9764 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9765 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9766 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9767 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9768 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9769 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9770 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9771 }
9772 else
9773 {
9774 puDst->au64[0] = 0;
9775 puDst->au64[1] = 0;
9776 }
9777}
9778
9779IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9780{
9781 RTUINT128U uSrc1 = *puDst;
9782
9783 if (uShift <= 15)
9784 {
9785 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9786 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9787 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9788 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9789 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9790 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9791 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9792 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9793 }
9794 else
9795 {
9796 puDst->au64[0] = 0;
9797 puDst->au64[1] = 0;
9798 }
9799}
9800
9801#endif
9802
9803
9804/*
9805 * PSLLW / VPSLLW
9806 */
9807#ifdef IEM_WITHOUT_ASSEMBLY
9808
9809IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9810{
9811 RTUINT64U uSrc1 = { *puDst };
9812 RTUINT64U uSrc2 = { *puSrc };
9813 RTUINT64U uDst;
9814
9815 if (uSrc2.au64[0] <= 15)
9816 {
9817 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9818 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9819 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9820 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9821 }
9822 else
9823 {
9824 uDst.au64[0] = 0;
9825 }
9826 *puDst = uDst.u;
9827}
9828
9829
9830IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9831{
9832 RTUINT64U uSrc1 = { *puDst };
9833 RTUINT64U uDst;
9834
9835 if (uShift <= 15)
9836 {
9837 uDst.au16[0] = uSrc1.au16[0] << uShift;
9838 uDst.au16[1] = uSrc1.au16[1] << uShift;
9839 uDst.au16[2] = uSrc1.au16[2] << uShift;
9840 uDst.au16[3] = uSrc1.au16[3] << uShift;
9841 }
9842 else
9843 {
9844 uDst.au64[0] = 0;
9845 }
9846 *puDst = uDst.u;
9847}
9848
9849
9850IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9851{
9852 RTUINT128U uSrc1 = *puDst;
9853
9854 if (puSrc->au64[0] <= 15)
9855 {
9856 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9857 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9858 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9859 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9860 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9861 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9862 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9863 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9864 }
9865 else
9866 {
9867 puDst->au64[0] = 0;
9868 puDst->au64[1] = 0;
9869 }
9870}
9871
9872IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9873{
9874 RTUINT128U uSrc1 = *puDst;
9875
9876 if (uShift <= 15)
9877 {
9878 puDst->au16[0] = uSrc1.au16[0] << uShift;
9879 puDst->au16[1] = uSrc1.au16[1] << uShift;
9880 puDst->au16[2] = uSrc1.au16[2] << uShift;
9881 puDst->au16[3] = uSrc1.au16[3] << uShift;
9882 puDst->au16[4] = uSrc1.au16[4] << uShift;
9883 puDst->au16[5] = uSrc1.au16[5] << uShift;
9884 puDst->au16[6] = uSrc1.au16[6] << uShift;
9885 puDst->au16[7] = uSrc1.au16[7] << uShift;
9886 }
9887 else
9888 {
9889 puDst->au64[0] = 0;
9890 puDst->au64[1] = 0;
9891 }
9892}
9893
9894#endif
9895
9896
9897/*
9898 * PSRLD / VPSRLD
9899 */
9900#ifdef IEM_WITHOUT_ASSEMBLY
9901
9902IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9903{
9904 RTUINT64U uSrc1 = { *puDst };
9905 RTUINT64U uSrc2 = { *puSrc };
9906 RTUINT64U uDst;
9907
9908 if (uSrc2.au64[0] <= 31)
9909 {
9910 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9911 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9912 }
9913 else
9914 {
9915 uDst.au64[0] = 0;
9916 }
9917 *puDst = uDst.u;
9918}
9919
9920
9921IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9922{
9923 RTUINT64U uSrc1 = { *puDst };
9924 RTUINT64U uDst;
9925
9926 if (uShift <= 31)
9927 {
9928 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9929 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9930 }
9931 else
9932 {
9933 uDst.au64[0] = 0;
9934 }
9935 *puDst = uDst.u;
9936}
9937
9938
9939IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9940{
9941 RTUINT128U uSrc1 = *puDst;
9942
9943 if (puSrc->au64[0] <= 31)
9944 {
9945 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9946 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9947 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9948 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9949 }
9950 else
9951 {
9952 puDst->au64[0] = 0;
9953 puDst->au64[1] = 0;
9954 }
9955}
9956
9957IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9958{
9959 RTUINT128U uSrc1 = *puDst;
9960
9961 if (uShift <= 31)
9962 {
9963 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9964 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9965 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9966 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9967 }
9968 else
9969 {
9970 puDst->au64[0] = 0;
9971 puDst->au64[1] = 0;
9972 }
9973}
9974
9975#endif
9976
9977
9978/*
9979 * PSRAD / VPSRAD
9980 */
9981#ifdef IEM_WITHOUT_ASSEMBLY
9982
9983IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9984{
9985 RTUINT64U uSrc1 = { *puDst };
9986 RTUINT64U uSrc2 = { *puSrc };
9987 RTUINT64U uDst;
9988
9989 if (uSrc2.au64[0] <= 31)
9990 {
9991 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9992 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9993 }
9994 else
9995 {
9996 uDst.au64[0] = 0;
9997 }
9998 *puDst = uDst.u;
9999}
10000
10001
10002IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10003{
10004 RTUINT64U uSrc1 = { *puDst };
10005 RTUINT64U uDst;
10006
10007 if (uShift <= 31)
10008 {
10009 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10010 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10011 }
10012 else
10013 {
10014 uDst.au64[0] = 0;
10015 }
10016 *puDst = uDst.u;
10017}
10018
10019
10020IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10021{
10022 RTUINT128U uSrc1 = *puDst;
10023
10024 if (puSrc->au64[0] <= 31)
10025 {
10026 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
10027 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
10028 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
10029 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
10030 }
10031 else
10032 {
10033 puDst->au64[0] = 0;
10034 puDst->au64[1] = 0;
10035 }
10036}
10037
10038IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10039{
10040 RTUINT128U uSrc1 = *puDst;
10041
10042 if (uShift <= 31)
10043 {
10044 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10045 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10046 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10047 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10048 }
10049 else
10050 {
10051 puDst->au64[0] = 0;
10052 puDst->au64[1] = 0;
10053 }
10054}
10055
10056#endif
10057
10058
10059/*
10060 * PSLLD / VPSLLD
10061 */
10062#ifdef IEM_WITHOUT_ASSEMBLY
10063
10064IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10065{
10066 RTUINT64U uSrc1 = { *puDst };
10067 RTUINT64U uSrc2 = { *puSrc };
10068 RTUINT64U uDst;
10069
10070 if (uSrc2.au64[0] <= 31)
10071 {
10072 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10073 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10074 }
10075 else
10076 {
10077 uDst.au64[0] = 0;
10078 }
10079 *puDst = uDst.u;
10080}
10081
10082
10083IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10084{
10085 RTUINT64U uSrc1 = { *puDst };
10086 RTUINT64U uDst;
10087
10088 if (uShift <= 31)
10089 {
10090 uDst.au32[0] = uSrc1.au32[0] << uShift;
10091 uDst.au32[1] = uSrc1.au32[1] << uShift;
10092 }
10093 else
10094 {
10095 uDst.au64[0] = 0;
10096 }
10097 *puDst = uDst.u;
10098}
10099
10100
10101IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10102{
10103 RTUINT128U uSrc1 = *puDst;
10104
10105 if (puSrc->au64[0] <= 31)
10106 {
10107 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10108 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10109 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10110 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10111 }
10112 else
10113 {
10114 puDst->au64[0] = 0;
10115 puDst->au64[1] = 0;
10116 }
10117}
10118
10119IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10120{
10121 RTUINT128U uSrc1 = *puDst;
10122
10123 if (uShift <= 31)
10124 {
10125 puDst->au32[0] = uSrc1.au32[0] << uShift;
10126 puDst->au32[1] = uSrc1.au32[1] << uShift;
10127 puDst->au32[2] = uSrc1.au32[2] << uShift;
10128 puDst->au32[3] = uSrc1.au32[3] << uShift;
10129 }
10130 else
10131 {
10132 puDst->au64[0] = 0;
10133 puDst->au64[1] = 0;
10134 }
10135}
10136
10137#endif
10138
10139
10140/*
10141 * PSRLQ / VPSRLQ
10142 */
10143#ifdef IEM_WITHOUT_ASSEMBLY
10144
10145IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10146{
10147 RTUINT64U uSrc1 = { *puDst };
10148 RTUINT64U uSrc2 = { *puSrc };
10149 RTUINT64U uDst;
10150
10151 if (uSrc2.au64[0] <= 63)
10152 {
10153 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10154 }
10155 else
10156 {
10157 uDst.au64[0] = 0;
10158 }
10159 *puDst = uDst.u;
10160}
10161
10162
10163IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10164{
10165 RTUINT64U uSrc1 = { *puDst };
10166 RTUINT64U uDst;
10167
10168 if (uShift <= 63)
10169 {
10170 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10171 }
10172 else
10173 {
10174 uDst.au64[0] = 0;
10175 }
10176 *puDst = uDst.u;
10177}
10178
10179
10180IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10181{
10182 RTUINT128U uSrc1 = *puDst;
10183
10184 if (puSrc->au64[0] <= 63)
10185 {
10186 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10187 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10188 }
10189 else
10190 {
10191 puDst->au64[0] = 0;
10192 puDst->au64[1] = 0;
10193 }
10194}
10195
10196IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10197{
10198 RTUINT128U uSrc1 = *puDst;
10199
10200 if (uShift <= 63)
10201 {
10202 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10203 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10204 }
10205 else
10206 {
10207 puDst->au64[0] = 0;
10208 puDst->au64[1] = 0;
10209 }
10210}
10211
10212#endif
10213
10214
10215/*
10216 * PSLLQ / VPSLLQ
10217 */
10218#ifdef IEM_WITHOUT_ASSEMBLY
10219
10220IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10221{
10222 RTUINT64U uSrc1 = { *puDst };
10223 RTUINT64U uSrc2 = { *puSrc };
10224 RTUINT64U uDst;
10225
10226 if (uSrc2.au64[0] <= 63)
10227 {
10228 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10229 }
10230 else
10231 {
10232 uDst.au64[0] = 0;
10233 }
10234 *puDst = uDst.u;
10235}
10236
10237
10238IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10239{
10240 RTUINT64U uSrc1 = { *puDst };
10241 RTUINT64U uDst;
10242
10243 if (uShift <= 63)
10244 {
10245 uDst.au64[0] = uSrc1.au64[0] << uShift;
10246 }
10247 else
10248 {
10249 uDst.au64[0] = 0;
10250 }
10251 *puDst = uDst.u;
10252}
10253
10254
10255IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10256{
10257 RTUINT128U uSrc1 = *puDst;
10258
10259 if (puSrc->au64[0] <= 63)
10260 {
10261 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10262 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10263 }
10264 else
10265 {
10266 puDst->au64[0] = 0;
10267 puDst->au64[1] = 0;
10268 }
10269}
10270
10271IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10272{
10273 RTUINT128U uSrc1 = *puDst;
10274
10275 if (uShift <= 63)
10276 {
10277 puDst->au64[0] = uSrc1.au64[0] << uShift;
10278 puDst->au64[1] = uSrc1.au64[1] << uShift;
10279 }
10280 else
10281 {
10282 puDst->au64[0] = 0;
10283 puDst->au64[1] = 0;
10284 }
10285}
10286
10287#endif
10288
10289
10290/*
10291 * PSRLDQ / VPSRLDQ
10292 */
10293#ifdef IEM_WITHOUT_ASSEMBLY
10294
10295IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10296{
10297 RTUINT128U uSrc1 = *puDst;
10298
10299 if (uShift < 16)
10300 {
10301 int i;
10302
10303 for (i = 0; i < 16 - uShift; ++i)
10304 puDst->au8[i] = uSrc1.au8[i + uShift];
10305 for (i = 16 - uShift; i < 16; ++i)
10306 puDst->au8[i] = 0;
10307 }
10308 else
10309 {
10310 puDst->au64[0] = 0;
10311 puDst->au64[1] = 0;
10312 }
10313}
10314
10315#endif
10316
10317
10318/*
10319 * PSLLDQ / VPSLLDQ
10320 */
10321#ifdef IEM_WITHOUT_ASSEMBLY
10322
10323IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10324{
10325 RTUINT128U uSrc1 = *puDst;
10326
10327 if (uShift < 16)
10328 {
10329 int i;
10330
10331 for (i = 0; i < uShift; ++i)
10332 puDst->au8[i] = 0;
10333 for (i = uShift; i < 16; ++i)
10334 puDst->au8[i] = uSrc1.au8[i - uShift];
10335 }
10336 else
10337 {
10338 puDst->au64[0] = 0;
10339 puDst->au64[1] = 0;
10340 }
10341}
10342
10343#endif
10344
10345
10346/*
10347 * PMADDWD / VPMADDWD
10348 */
10349#ifdef IEM_WITHOUT_ASSEMBLY
10350
10351IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10352{
10353 RTUINT64U uSrc1 = { *puDst };
10354 RTUINT64U uSrc2 = { *puSrc };
10355 RTUINT64U uDst;
10356
10357 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10358 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10359 *puDst = uDst.u;
10360 RT_NOREF(pFpuState);
10361}
10362
10363
10364IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10365{
10366 RTUINT128U uSrc1 = *puDst;
10367
10368 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10369 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10370 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10371 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10372 RT_NOREF(pFpuState);
10373}
10374
10375#endif
10376
10377
10378/*
10379 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10380 */
10381#ifdef IEM_WITHOUT_ASSEMBLY
10382
10383IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10384{
10385 RTUINT64U uSrc1 = { *puDst };
10386 RTUINT64U uSrc2 = { *puSrc };
10387 RTUINT64U uDst;
10388
10389 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10390 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10391 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10392 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10393 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10394 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10395 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10396 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10397 *puDst = uDst.u;
10398 RT_NOREF(pFpuState);
10399}
10400
10401
10402IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10403{
10404 RTUINT128U uSrc1 = *puDst;
10405
10406 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10407 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10408 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10409 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10410 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10411 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10412 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10413 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10414 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10415 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10416 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10417 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10418 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10419 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10420 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10421 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10422 RT_NOREF(pFpuState);
10423}
10424
10425#endif
10426
10427
10428IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10429{
10430 RTUINT128U uSrc1 = *puDst;
10431
10432 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10433 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10434 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10435 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10436 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10437 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10438 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10439 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10440 RT_NOREF(pFpuState);
10441}
10442
10443
10444IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10445{
10446 RTUINT128U uSrc1 = *puDst;
10447
10448 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10449 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10450 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10451 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10452 RT_NOREF(pFpuState);
10453}
10454
10455
10456IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10457 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10458{
10459 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10460 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10461 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10462 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10463 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10464 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10465 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10466 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10467 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10468 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10469 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10470 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10471 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10472 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10473 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10474 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10475 RT_NOREF(pExtState);
10476}
10477
10478
10479IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10480 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10481{
10482 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10483 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10484 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10485 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10486 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10487 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10488 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10489 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10490 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10491 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10492 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10493 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10494 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10495 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10496 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10497 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10498 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10499 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10500 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10501 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10502 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10503 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10504 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10505 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10506 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10507 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10508 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10509 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10510 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10511 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10512 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10513 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10514 RT_NOREF(pExtState);
10515}
10516
10517
10518IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10519 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10520{
10521 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10522 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10523 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10524 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10525 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10526 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10527 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10528 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10529 RT_NOREF(pExtState);
10530}
10531
10532
10533IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10534 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10535{
10536 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10537 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10538 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10539 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10540 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10541 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10542 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10543 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10544 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10545 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10546 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10547 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10548 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10549 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10550 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10551 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10552 RT_NOREF(pExtState);
10553}
10554
10555
10556IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10557 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10558{
10559 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10560 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10561 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10562 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10563 RT_NOREF(pExtState);
10564}
10565
10566
10567IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10568 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10569{
10570 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10571 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10572 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10573 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10574 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10575 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10576 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10577 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10578 RT_NOREF(pExtState);
10579}
10580
10581
10582/*
10583 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10584 */
10585#ifdef IEM_WITHOUT_ASSEMBLY
10586
10587IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10588{
10589 RTUINT64U uSrc1 = { *puDst };
10590 RTUINT64U uSrc2 = { *puSrc };
10591 RTUINT64U uDst;
10592
10593 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10594 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10595 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10596 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10597 *puDst = uDst.u;
10598 RT_NOREF(pFpuState);
10599}
10600
10601
10602IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10603{
10604 RTUINT128U uSrc1 = *puDst;
10605
10606 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10607 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10608 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10609 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10610 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10611 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10612 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10613 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10614 RT_NOREF(pFpuState);
10615}
10616
10617#endif
10618
10619IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10620{
10621 RTUINT128U uSrc1 = *puDst;
10622
10623 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10624 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10625 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10626 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10627 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10628 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10629 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10630 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10631 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10632 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10633 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10634 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10635 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10636 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10637 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10638 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10639 RT_NOREF(pFpuState);
10640}
10641
10642
10643IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10644{
10645 RTUINT128U uSrc1 = *puDst;
10646
10647 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10648 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10649 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10650 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10651 RT_NOREF(pFpuState);
10652}
10653
10654
10655IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10656 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10657{
10658 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10659 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10660 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10661 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10662 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10663 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10664 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10665 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10666 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10667 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10668 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10669 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10670 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10671 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10672 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10673 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10674 RT_NOREF(pExtState);
10675}
10676
10677
10678IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10679 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10680{
10681 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10682 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10683 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10684 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10685 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10686 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10687 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10688 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10689 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10690 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10691 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10692 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10693 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10694 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10695 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10696 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10697 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10698 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10699 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10700 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10701 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10702 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10703 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10704 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10705 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10706 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10707 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10708 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10709 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10710 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10711 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10712 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10713 RT_NOREF(pExtState);
10714}
10715
10716
10717IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10718 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10719{
10720 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10721 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10722 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10723 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10724 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10725 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10726 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10727 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10728 RT_NOREF(pExtState);
10729}
10730
10731
10732IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10733 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10734{
10735 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10736 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10737 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10738 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10739 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10740 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10741 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10742 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10743 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10744 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10745 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10746 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10747 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10748 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10749 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10750 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10751 RT_NOREF(pExtState);
10752}
10753
10754
10755IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10756 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10757{
10758 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10759 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10760 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10761 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10762 RT_NOREF(pExtState);
10763}
10764
10765
10766IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10767 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10768{
10769 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10770 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10771 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10772 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10773 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10774 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10775 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10776 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10777 RT_NOREF(pExtState);
10778}
10779
10780
10781/*
10782 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10783 */
10784#ifdef IEM_WITHOUT_ASSEMBLY
10785
10786IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10787{
10788 RTUINT64U uSrc1 = { *puDst };
10789 RTUINT64U uSrc2 = { *puSrc };
10790 RTUINT64U uDst;
10791
10792 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10793 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10794 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10795 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10796 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10797 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10798 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10799 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10800 *puDst = uDst.u;
10801 RT_NOREF(pFpuState);
10802}
10803
10804
10805IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10806{
10807 RTUINT128U uSrc1 = *puDst;
10808
10809 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10810 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10811 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10812 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10813 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10814 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10815 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10816 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10817 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10818 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10819 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10820 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10821 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10822 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10823 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10824 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10825 RT_NOREF(pFpuState);
10826}
10827
10828#endif
10829
10830IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10831{
10832 RTUINT128U uSrc1 = *puDst;
10833
10834 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10835 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10836 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10837 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10838 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10839 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10840 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10841 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10842 RT_NOREF(pFpuState);
10843}
10844
10845
10846IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10847{
10848 RTUINT128U uSrc1 = *puDst;
10849
10850 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10851 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10852 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10853 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10854 RT_NOREF(pFpuState);
10855}
10856
10857
10858IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10859 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10860{
10861 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10862 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10863 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10864 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10865 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10866 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10867 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10868 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10869 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10870 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10871 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10872 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10873 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10874 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10875 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10876 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10877 RT_NOREF(pExtState);
10878}
10879
10880
10881IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10882 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10883{
10884 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10885 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10886 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10887 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10888 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10889 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10890 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10891 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10892 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10893 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10894 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10895 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10896 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10897 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10898 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10899 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10900 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10901 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10902 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10903 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10904 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10905 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10906 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10907 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10908 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10909 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10910 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10911 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10912 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10913 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10914 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10915 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10916 RT_NOREF(pExtState);
10917}
10918
10919
10920IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10921 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10922{
10923 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10924 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10925 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10926 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10927 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10928 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10929 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10930 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10931 RT_NOREF(pExtState);
10932}
10933
10934
10935IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10936 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10937{
10938 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10939 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10940 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10941 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10942 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10943 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10944 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10945 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10946 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10947 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10948 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10949 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10950 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10951 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10952 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10953 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10954 RT_NOREF(pExtState);
10955}
10956
10957
10958IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10959 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10960{
10961 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10962 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10963 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10964 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10965 RT_NOREF(pExtState);
10966}
10967
10968
10969IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10970 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10971{
10972 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10973 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10974 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10975 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10976 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10977 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10978 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10979 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10980 RT_NOREF(pExtState);
10981}
10982
10983
10984/*
10985 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10986 */
10987#ifdef IEM_WITHOUT_ASSEMBLY
10988
10989IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10990{
10991 RTUINT64U uSrc1 = { *puDst };
10992 RTUINT64U uSrc2 = { *puSrc };
10993 RTUINT64U uDst;
10994
10995 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10996 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10997 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10998 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10999 *puDst = uDst.u;
11000 RT_NOREF(pFpuState);
11001}
11002
11003
11004IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11005{
11006 RTUINT128U uSrc1 = *puDst;
11007
11008 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11009 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11010 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11011 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11012 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11013 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11014 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11015 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11016 RT_NOREF(pFpuState);
11017}
11018
11019#endif
11020
11021IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11022{
11023 RTUINT128U uSrc1 = *puDst;
11024
11025 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11026 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11027 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11028 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11029 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11030 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11031 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11032 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11033 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11034 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11035 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11036 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11037 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11038 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11039 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11040 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11041 RT_NOREF(pFpuState);
11042}
11043
11044
11045IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11046{
11047 RTUINT128U uSrc1 = *puDst;
11048
11049 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11050 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11051 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11052 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11053 RT_NOREF(pFpuState);
11054}
11055
11056
11057IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11058 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11059{
11060 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11061 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11062 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11063 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11064 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11065 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11066 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11067 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11068 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11069 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11070 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11071 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11072 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11073 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11074 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11075 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11076 RT_NOREF(pExtState);
11077}
11078
11079
11080IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11081 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11082{
11083 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11084 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11085 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11086 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11087 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11088 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11089 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11090 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11091 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11092 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11093 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11094 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11095 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11096 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11097 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11098 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11099 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11100 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11101 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11102 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11103 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11104 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11105 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11106 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11107 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11108 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11109 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11110 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11111 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11112 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11113 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11114 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11115 RT_NOREF(pExtState);
11116}
11117
11118
11119IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11120 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11121{
11122 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11123 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11124 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11125 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11126 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11127 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11128 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11129 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11130 RT_NOREF(pExtState);
11131}
11132
11133
11134IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11135 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11136{
11137 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11138 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11139 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11140 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11141 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11142 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11143 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11144 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11145 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11146 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11147 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11148 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11149 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11150 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11151 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11152 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11153 RT_NOREF(pExtState);
11154}
11155
11156
11157IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11158 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11159{
11160 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11161 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11162 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11163 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11164 RT_NOREF(pExtState);
11165}
11166
11167
11168IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11169 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11170{
11171 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11172 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11173 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11174 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11175 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11176 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11177 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11178 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11179 RT_NOREF(pExtState);
11180}
11181
11182
11183/*
11184 * PAVGB / VPAVGB / PAVGW / VPAVGW
11185 */
/* Rounding average helpers: widen the first operand, add the second plus one,
   halve with a right shift and narrow back to the element width. */
#define PAVGB_EXEC(a_uLeft, a_uRight)   ((uint8_t)( ((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1 ))
#define PAVGW_EXEC(a_uLeft, a_uRight)   ((uint16_t)( ((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1 ))
11188
11189#ifdef IEM_WITHOUT_ASSEMBLY
11190
11191IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11192{
11193 RTUINT64U uSrc1 = { *puDst };
11194 RTUINT64U uSrc2 = { *puSrc };
11195 RTUINT64U uDst;
11196
11197 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11198 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11199 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11200 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11201 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11202 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11203 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11204 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11205 *puDst = uDst.u;
11206}
11207
11208
11209IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11210{
11211 RTUINT128U uSrc1 = *puDst;
11212
11213 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11214 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11215 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11216 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11217 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11218 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11219 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11220 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11221 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11222 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11223 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11224 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11225 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11226 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11227 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11228 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11229}
11230
11231
11232IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11233{
11234 RTUINT64U uSrc1 = { *puDst };
11235 RTUINT64U uSrc2 = { *puSrc };
11236 RTUINT64U uDst;
11237
11238 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11239 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11240 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11241 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11242 *puDst = uDst.u;
11243}
11244
11245
11246IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11247{
11248 RTUINT128U uSrc1 = *puDst;
11249
11250 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11251 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11252 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11253 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11254 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11255 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11256 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11257 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11258}
11259
11260#endif
11261
11262IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11263{
11264 RTUINT128U uSrc1 = *puDst;
11265
11266 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11267 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11268 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11269 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11270 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11271 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11272 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11273 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11274 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11275 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11276 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11277 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11278 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11279 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11280 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11281 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11282}
11283
11284
11285IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11286{
11287 RTUINT128U uSrc1 = *puDst;
11288
11289 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11290 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11291 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11292 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11293 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11294 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11295 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11296 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11297 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11298 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11299 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11300 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11301 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11302 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11303 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11304 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11305}
11306
11307
11308IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11309{
11310 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11311 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11312 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11313 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11314 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11315 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11316 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11317 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11318 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11319 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11320 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11321 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11322 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11323 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11324 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11325 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11326}
11327
11328
11329IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11330{
11331 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11332 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11333 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11334 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11335 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11336 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11337 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11338 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11339 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11340 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11341 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11342 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11343 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11344 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11345 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11346 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11347 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11348 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11349 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11350 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11351 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11352 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11353 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11354 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11355 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11356 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11357 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11358 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11359 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11360 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11361 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11362 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11363}
11364
11365
11366IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11367{
11368 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11369 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11370 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11371 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11372 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11373 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11374 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11375 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11376}
11377
11378
11379IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11380{
11381 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11382 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11383 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11384 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11385 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11386 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11387 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11388 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11389 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11390 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11391 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11392 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11393 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11394 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11395 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11396 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11397}
11398
11399#undef PAVGB_EXEC
11400#undef PAVGW_EXEC
11401
11402
11403/*
11404 * PMOVMSKB / VPMOVMSKB
11405 */
11406#ifdef IEM_WITHOUT_ASSEMBLY
11407
11408IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11409{
11410 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11411 uint64_t const uSrc = *pu64Src;
11412 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11413 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11414 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11415 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11416 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11417 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11418 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11419 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11420}
11421
11422
11423IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11424{
11425 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11426 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11427 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11428 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11429 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11430 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11431 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11432 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11433 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11434 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11435 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11436 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11437 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11438 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11439 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11440 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11441 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11442 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11443 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11444}
11445
11446#endif
11447
11448IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11449{
11450 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11451 uint64_t const uSrc0 = puSrc->QWords.qw0;
11452 uint64_t const uSrc1 = puSrc->QWords.qw1;
11453 uint64_t const uSrc2 = puSrc->QWords.qw2;
11454 uint64_t const uSrc3 = puSrc->QWords.qw3;
11455 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11456 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11457 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11458 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11459 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11460 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11461 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11462 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11463 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11464 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11465 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11466 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11467 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11468 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11469 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11470 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11471 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11472 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11473 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11474 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11475 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11476 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11477 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11478 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11479 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11480 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11481 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11482 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11483 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11484 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11485 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11486 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11487}
11488
11489
11490/*
11491 * [V]PSHUFB
11492 */
11493
11494IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11495{
11496 RTUINT64U const uSrc = { *puSrc };
11497 RTUINT64U const uDstIn = { *puDst };
11498 ASMCompilerBarrier();
11499 RTUINT64U uDstOut = { 0 };
11500 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11501 {
11502 uint8_t idxSrc = uSrc.au8[iByte];
11503 if (!(idxSrc & 0x80))
11504 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11505 }
11506 *puDst = uDstOut.u;
11507 RT_NOREF(pFpuState);
11508}
11509
11510
11511IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11512{
11513 RTUINT128U const uSrc = *puSrc;
11514 RTUINT128U const uDstIn = *puDst;
11515 ASMCompilerBarrier();
11516 puDst->au64[0] = 0;
11517 puDst->au64[1] = 0;
11518 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11519 {
11520 uint8_t idxSrc = uSrc.au8[iByte];
11521 if (!(idxSrc & 0x80))
11522 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11523 }
11524 RT_NOREF(pFpuState);
11525}
11526
11527
11528IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11529 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11530{
11531 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11532 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11533 ASMCompilerBarrier();
11534 puDst->au64[0] = 0;
11535 puDst->au64[1] = 0;
11536 for (unsigned iByte = 0; iByte < 16; iByte++)
11537 {
11538 uint8_t idxSrc = uSrc2.au8[iByte];
11539 if (!(idxSrc & 0x80))
11540 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11541 }
11542 RT_NOREF(pExtState);
11543}
11544
11545
11546IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11547 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11548{
11549 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11550 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11551 ASMCompilerBarrier();
11552 puDst->au64[0] = 0;
11553 puDst->au64[1] = 0;
11554 puDst->au64[2] = 0;
11555 puDst->au64[3] = 0;
11556 for (unsigned iByte = 0; iByte < 16; iByte++)
11557 {
11558 uint8_t idxSrc = uSrc2.au8[iByte];
11559 if (!(idxSrc & 0x80))
11560 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11561 }
11562 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11563 {
11564 uint8_t idxSrc = uSrc2.au8[iByte];
11565 if (!(idxSrc & 0x80))
11566 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11567 }
11568 RT_NOREF(pExtState);
11569}
11570
11571
11572/*
11573 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11574 */
11575#ifdef IEM_WITHOUT_ASSEMBLY
11576
11577IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11578{
11579 uint64_t const uSrc = *puSrc;
11580 ASMCompilerBarrier();
11581 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11582 uSrc >> (((bEvil >> 2) & 3) * 16),
11583 uSrc >> (((bEvil >> 4) & 3) * 16),
11584 uSrc >> (((bEvil >> 6) & 3) * 16));
11585}
11586
11587
11588IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11589{
11590 puDst->QWords.qw0 = puSrc->QWords.qw0;
11591 uint64_t const uSrc = puSrc->QWords.qw1;
11592 ASMCompilerBarrier();
11593 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11594 uSrc >> (((bEvil >> 2) & 3) * 16),
11595 uSrc >> (((bEvil >> 4) & 3) * 16),
11596 uSrc >> (((bEvil >> 6) & 3) * 16));
11597}
11598
11599#endif
11600
11601IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11602{
11603 puDst->QWords.qw0 = puSrc->QWords.qw0;
11604 uint64_t const uSrc1 = puSrc->QWords.qw1;
11605 puDst->QWords.qw2 = puSrc->QWords.qw2;
11606 uint64_t const uSrc3 = puSrc->QWords.qw3;
11607 ASMCompilerBarrier();
11608 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11609 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11610 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11611 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11612 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11613 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11614 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11615 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11616}
11617
11618#ifdef IEM_WITHOUT_ASSEMBLY
11619IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11620{
11621 puDst->QWords.qw1 = puSrc->QWords.qw1;
11622 uint64_t const uSrc = puSrc->QWords.qw0;
11623 ASMCompilerBarrier();
11624 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11625 uSrc >> (((bEvil >> 2) & 3) * 16),
11626 uSrc >> (((bEvil >> 4) & 3) * 16),
11627 uSrc >> (((bEvil >> 6) & 3) * 16));
11628
11629}
11630#endif
11631
11632
11633IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11634{
11635 puDst->QWords.qw3 = puSrc->QWords.qw3;
11636 uint64_t const uSrc2 = puSrc->QWords.qw2;
11637 puDst->QWords.qw1 = puSrc->QWords.qw1;
11638 uint64_t const uSrc0 = puSrc->QWords.qw0;
11639 ASMCompilerBarrier();
11640 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11641 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11642 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11643 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11644 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11645 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11646 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11647 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11648
11649}
11650
11651
11652#ifdef IEM_WITHOUT_ASSEMBLY
11653IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11654{
11655 RTUINT128U const uSrc = *puSrc;
11656 ASMCompilerBarrier();
11657 puDst->au32[0] = uSrc.au32[bEvil & 3];
11658 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11659 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11660 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11661}
11662#endif
11663
11664
11665IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11666{
11667 RTUINT256U const uSrc = *puSrc;
11668 ASMCompilerBarrier();
11669 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11670 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11671 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11672 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11673 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11674 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11675 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11676 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11677}
11678
11679
11680/*
11681 * PUNPCKHBW - high bytes -> words
11682 */
11683#ifdef IEM_WITHOUT_ASSEMBLY
11684
11685IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11686{
11687 RTUINT64U const uSrc2 = { *puSrc };
11688 RTUINT64U const uSrc1 = { *puDst };
11689 ASMCompilerBarrier();
11690 RTUINT64U uDstOut;
11691 uDstOut.au8[0] = uSrc1.au8[4];
11692 uDstOut.au8[1] = uSrc2.au8[4];
11693 uDstOut.au8[2] = uSrc1.au8[5];
11694 uDstOut.au8[3] = uSrc2.au8[5];
11695 uDstOut.au8[4] = uSrc1.au8[6];
11696 uDstOut.au8[5] = uSrc2.au8[6];
11697 uDstOut.au8[6] = uSrc1.au8[7];
11698 uDstOut.au8[7] = uSrc2.au8[7];
11699 *puDst = uDstOut.u;
11700}
11701
11702
11703IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11704{
11705 RTUINT128U const uSrc2 = *puSrc;
11706 RTUINT128U const uSrc1 = *puDst;
11707 ASMCompilerBarrier();
11708 RTUINT128U uDstOut;
11709 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11710 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11711 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11712 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11713 uDstOut.au8[ 4] = uSrc1.au8[10];
11714 uDstOut.au8[ 5] = uSrc2.au8[10];
11715 uDstOut.au8[ 6] = uSrc1.au8[11];
11716 uDstOut.au8[ 7] = uSrc2.au8[11];
11717 uDstOut.au8[ 8] = uSrc1.au8[12];
11718 uDstOut.au8[ 9] = uSrc2.au8[12];
11719 uDstOut.au8[10] = uSrc1.au8[13];
11720 uDstOut.au8[11] = uSrc2.au8[13];
11721 uDstOut.au8[12] = uSrc1.au8[14];
11722 uDstOut.au8[13] = uSrc2.au8[14];
11723 uDstOut.au8[14] = uSrc1.au8[15];
11724 uDstOut.au8[15] = uSrc2.au8[15];
11725 *puDst = uDstOut;
11726}
11727
11728#endif
11729
11730IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11731{
11732 RTUINT128U const uSrc2 = *puSrc2;
11733 RTUINT128U const uSrc1 = *puSrc1;
11734 ASMCompilerBarrier();
11735 RTUINT128U uDstOut;
11736 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11737 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11738 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11739 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11740 uDstOut.au8[ 4] = uSrc1.au8[10];
11741 uDstOut.au8[ 5] = uSrc2.au8[10];
11742 uDstOut.au8[ 6] = uSrc1.au8[11];
11743 uDstOut.au8[ 7] = uSrc2.au8[11];
11744 uDstOut.au8[ 8] = uSrc1.au8[12];
11745 uDstOut.au8[ 9] = uSrc2.au8[12];
11746 uDstOut.au8[10] = uSrc1.au8[13];
11747 uDstOut.au8[11] = uSrc2.au8[13];
11748 uDstOut.au8[12] = uSrc1.au8[14];
11749 uDstOut.au8[13] = uSrc2.au8[14];
11750 uDstOut.au8[14] = uSrc1.au8[15];
11751 uDstOut.au8[15] = uSrc2.au8[15];
11752 *puDst = uDstOut;
11753}
11754
11755
11756IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11757{
11758 RTUINT256U const uSrc2 = *puSrc2;
11759 RTUINT256U const uSrc1 = *puSrc1;
11760 ASMCompilerBarrier();
11761 RTUINT256U uDstOut;
11762 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11763 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11764 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11765 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11766 uDstOut.au8[ 4] = uSrc1.au8[10];
11767 uDstOut.au8[ 5] = uSrc2.au8[10];
11768 uDstOut.au8[ 6] = uSrc1.au8[11];
11769 uDstOut.au8[ 7] = uSrc2.au8[11];
11770 uDstOut.au8[ 8] = uSrc1.au8[12];
11771 uDstOut.au8[ 9] = uSrc2.au8[12];
11772 uDstOut.au8[10] = uSrc1.au8[13];
11773 uDstOut.au8[11] = uSrc2.au8[13];
11774 uDstOut.au8[12] = uSrc1.au8[14];
11775 uDstOut.au8[13] = uSrc2.au8[14];
11776 uDstOut.au8[14] = uSrc1.au8[15];
11777 uDstOut.au8[15] = uSrc2.au8[15];
11778 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11779 uDstOut.au8[16] = uSrc1.au8[24];
11780 uDstOut.au8[17] = uSrc2.au8[24];
11781 uDstOut.au8[18] = uSrc1.au8[25];
11782 uDstOut.au8[19] = uSrc2.au8[25];
11783 uDstOut.au8[20] = uSrc1.au8[26];
11784 uDstOut.au8[21] = uSrc2.au8[26];
11785 uDstOut.au8[22] = uSrc1.au8[27];
11786 uDstOut.au8[23] = uSrc2.au8[27];
11787 uDstOut.au8[24] = uSrc1.au8[28];
11788 uDstOut.au8[25] = uSrc2.au8[28];
11789 uDstOut.au8[26] = uSrc1.au8[29];
11790 uDstOut.au8[27] = uSrc2.au8[29];
11791 uDstOut.au8[28] = uSrc1.au8[30];
11792 uDstOut.au8[29] = uSrc2.au8[30];
11793 uDstOut.au8[30] = uSrc1.au8[31];
11794 uDstOut.au8[31] = uSrc2.au8[31];
11795 *puDst = uDstOut;
11796}
11797
11798
/*
 * PUNPCKHWD - high words -> dwords
 */
11802#ifdef IEM_WITHOUT_ASSEMBLY
11803
11804IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11805{
11806 RTUINT64U const uSrc2 = { *puSrc };
11807 RTUINT64U const uSrc1 = { *puDst };
11808 ASMCompilerBarrier();
11809 RTUINT64U uDstOut;
11810 uDstOut.au16[0] = uSrc1.au16[2];
11811 uDstOut.au16[1] = uSrc2.au16[2];
11812 uDstOut.au16[2] = uSrc1.au16[3];
11813 uDstOut.au16[3] = uSrc2.au16[3];
11814 *puDst = uDstOut.u;
11815}
11816
11817
11818IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11819{
11820 RTUINT128U const uSrc2 = *puSrc;
11821 RTUINT128U const uSrc1 = *puDst;
11822 ASMCompilerBarrier();
11823 RTUINT128U uDstOut;
11824 uDstOut.au16[0] = uSrc1.au16[4];
11825 uDstOut.au16[1] = uSrc2.au16[4];
11826 uDstOut.au16[2] = uSrc1.au16[5];
11827 uDstOut.au16[3] = uSrc2.au16[5];
11828 uDstOut.au16[4] = uSrc1.au16[6];
11829 uDstOut.au16[5] = uSrc2.au16[6];
11830 uDstOut.au16[6] = uSrc1.au16[7];
11831 uDstOut.au16[7] = uSrc2.au16[7];
11832 *puDst = uDstOut;
11833}
11834
11835#endif
11836
11837IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11838{
11839 RTUINT128U const uSrc2 = *puSrc2;
11840 RTUINT128U const uSrc1 = *puSrc1;
11841 ASMCompilerBarrier();
11842 RTUINT128U uDstOut;
11843 uDstOut.au16[0] = uSrc1.au16[4];
11844 uDstOut.au16[1] = uSrc2.au16[4];
11845 uDstOut.au16[2] = uSrc1.au16[5];
11846 uDstOut.au16[3] = uSrc2.au16[5];
11847 uDstOut.au16[4] = uSrc1.au16[6];
11848 uDstOut.au16[5] = uSrc2.au16[6];
11849 uDstOut.au16[6] = uSrc1.au16[7];
11850 uDstOut.au16[7] = uSrc2.au16[7];
11851 *puDst = uDstOut;
11852}
11853
11854
11855IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11856{
11857 RTUINT256U const uSrc2 = *puSrc2;
11858 RTUINT256U const uSrc1 = *puSrc1;
11859 ASMCompilerBarrier();
11860 RTUINT256U uDstOut;
11861 uDstOut.au16[0] = uSrc1.au16[4];
11862 uDstOut.au16[1] = uSrc2.au16[4];
11863 uDstOut.au16[2] = uSrc1.au16[5];
11864 uDstOut.au16[3] = uSrc2.au16[5];
11865 uDstOut.au16[4] = uSrc1.au16[6];
11866 uDstOut.au16[5] = uSrc2.au16[6];
11867 uDstOut.au16[6] = uSrc1.au16[7];
11868 uDstOut.au16[7] = uSrc2.au16[7];
11869
11870 uDstOut.au16[8] = uSrc1.au16[12];
11871 uDstOut.au16[9] = uSrc2.au16[12];
11872 uDstOut.au16[10] = uSrc1.au16[13];
11873 uDstOut.au16[11] = uSrc2.au16[13];
11874 uDstOut.au16[12] = uSrc1.au16[14];
11875 uDstOut.au16[13] = uSrc2.au16[14];
11876 uDstOut.au16[14] = uSrc1.au16[15];
11877 uDstOut.au16[15] = uSrc2.au16[15];
11878 *puDst = uDstOut;
11879}
11880
11881
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
11885#ifdef IEM_WITHOUT_ASSEMBLY
11886
11887IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11888{
11889 RTUINT64U const uSrc2 = { *puSrc };
11890 RTUINT64U const uSrc1 = { *puDst };
11891 ASMCompilerBarrier();
11892 RTUINT64U uDstOut;
11893 uDstOut.au32[0] = uSrc1.au32[1];
11894 uDstOut.au32[1] = uSrc2.au32[1];
11895 *puDst = uDstOut.u;
11896}
11897
11898
11899IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11900{
11901 RTUINT128U const uSrc2 = *puSrc;
11902 RTUINT128U const uSrc1 = *puDst;
11903 ASMCompilerBarrier();
11904 RTUINT128U uDstOut;
11905 uDstOut.au32[0] = uSrc1.au32[2];
11906 uDstOut.au32[1] = uSrc2.au32[2];
11907 uDstOut.au32[2] = uSrc1.au32[3];
11908 uDstOut.au32[3] = uSrc2.au32[3];
11909 *puDst = uDstOut;
11910}
11911
11912#endif
11913
11914IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11915{
11916 RTUINT128U const uSrc2 = *puSrc2;
11917 RTUINT128U const uSrc1 = *puSrc1;
11918 ASMCompilerBarrier();
11919 RTUINT128U uDstOut;
11920 uDstOut.au32[0] = uSrc1.au32[2];
11921 uDstOut.au32[1] = uSrc2.au32[2];
11922 uDstOut.au32[2] = uSrc1.au32[3];
11923 uDstOut.au32[3] = uSrc2.au32[3];
11924 *puDst = uDstOut;
11925}
11926
11927
11928IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11929{
11930 RTUINT256U const uSrc2 = *puSrc2;
11931 RTUINT256U const uSrc1 = *puSrc1;
11932 ASMCompilerBarrier();
11933 RTUINT256U uDstOut;
11934 uDstOut.au32[0] = uSrc1.au32[2];
11935 uDstOut.au32[1] = uSrc2.au32[2];
11936 uDstOut.au32[2] = uSrc1.au32[3];
11937 uDstOut.au32[3] = uSrc2.au32[3];
11938
11939 uDstOut.au32[4] = uSrc1.au32[6];
11940 uDstOut.au32[5] = uSrc2.au32[6];
11941 uDstOut.au32[6] = uSrc1.au32[7];
11942 uDstOut.au32[7] = uSrc2.au32[7];
11943 *puDst = uDstOut;
11944}
11945
11946
11947/*
11948 * PUNPCKHQDQ -> High qwords -> double qword(s).
11949 */
11950#ifdef IEM_WITHOUT_ASSEMBLY
11951IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11952{
11953 RTUINT128U const uSrc2 = *puSrc;
11954 RTUINT128U const uSrc1 = *puDst;
11955 ASMCompilerBarrier();
11956 RTUINT128U uDstOut;
11957 uDstOut.au64[0] = uSrc1.au64[1];
11958 uDstOut.au64[1] = uSrc2.au64[1];
11959 *puDst = uDstOut;
11960}
11961#endif
11962
11963
11964IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11965{
11966 RTUINT128U const uSrc2 = *puSrc2;
11967 RTUINT128U const uSrc1 = *puSrc1;
11968 ASMCompilerBarrier();
11969 RTUINT128U uDstOut;
11970 uDstOut.au64[0] = uSrc1.au64[1];
11971 uDstOut.au64[1] = uSrc2.au64[1];
11972 *puDst = uDstOut;
11973}
11974
11975
11976IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11977{
11978 RTUINT256U const uSrc2 = *puSrc2;
11979 RTUINT256U const uSrc1 = *puSrc1;
11980 ASMCompilerBarrier();
11981 RTUINT256U uDstOut;
11982 uDstOut.au64[0] = uSrc1.au64[1];
11983 uDstOut.au64[1] = uSrc2.au64[1];
11984
11985 uDstOut.au64[2] = uSrc1.au64[3];
11986 uDstOut.au64[3] = uSrc2.au64[3];
11987 *puDst = uDstOut;
11988}
11989
11990
11991/*
11992 * PUNPCKLBW - low bytes -> words
11993 */
11994#ifdef IEM_WITHOUT_ASSEMBLY
11995
11996IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11997{
11998 RTUINT64U const uSrc2 = { *puSrc };
11999 RTUINT64U const uSrc1 = { *puDst };
12000 ASMCompilerBarrier();
12001 RTUINT64U uDstOut;
12002 uDstOut.au8[0] = uSrc1.au8[0];
12003 uDstOut.au8[1] = uSrc2.au8[0];
12004 uDstOut.au8[2] = uSrc1.au8[1];
12005 uDstOut.au8[3] = uSrc2.au8[1];
12006 uDstOut.au8[4] = uSrc1.au8[2];
12007 uDstOut.au8[5] = uSrc2.au8[2];
12008 uDstOut.au8[6] = uSrc1.au8[3];
12009 uDstOut.au8[7] = uSrc2.au8[3];
12010 *puDst = uDstOut.u;
12011}
12012
12013
12014IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12015{
12016 RTUINT128U const uSrc2 = *puSrc;
12017 RTUINT128U const uSrc1 = *puDst;
12018 ASMCompilerBarrier();
12019 RTUINT128U uDstOut;
12020 uDstOut.au8[ 0] = uSrc1.au8[0];
12021 uDstOut.au8[ 1] = uSrc2.au8[0];
12022 uDstOut.au8[ 2] = uSrc1.au8[1];
12023 uDstOut.au8[ 3] = uSrc2.au8[1];
12024 uDstOut.au8[ 4] = uSrc1.au8[2];
12025 uDstOut.au8[ 5] = uSrc2.au8[2];
12026 uDstOut.au8[ 6] = uSrc1.au8[3];
12027 uDstOut.au8[ 7] = uSrc2.au8[3];
12028 uDstOut.au8[ 8] = uSrc1.au8[4];
12029 uDstOut.au8[ 9] = uSrc2.au8[4];
12030 uDstOut.au8[10] = uSrc1.au8[5];
12031 uDstOut.au8[11] = uSrc2.au8[5];
12032 uDstOut.au8[12] = uSrc1.au8[6];
12033 uDstOut.au8[13] = uSrc2.au8[6];
12034 uDstOut.au8[14] = uSrc1.au8[7];
12035 uDstOut.au8[15] = uSrc2.au8[7];
12036 *puDst = uDstOut;
12037}
12038
12039#endif
12040
12041IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12042{
12043 RTUINT128U const uSrc2 = *puSrc2;
12044 RTUINT128U const uSrc1 = *puSrc1;
12045 ASMCompilerBarrier();
12046 RTUINT128U uDstOut;
12047 uDstOut.au8[ 0] = uSrc1.au8[0];
12048 uDstOut.au8[ 1] = uSrc2.au8[0];
12049 uDstOut.au8[ 2] = uSrc1.au8[1];
12050 uDstOut.au8[ 3] = uSrc2.au8[1];
12051 uDstOut.au8[ 4] = uSrc1.au8[2];
12052 uDstOut.au8[ 5] = uSrc2.au8[2];
12053 uDstOut.au8[ 6] = uSrc1.au8[3];
12054 uDstOut.au8[ 7] = uSrc2.au8[3];
12055 uDstOut.au8[ 8] = uSrc1.au8[4];
12056 uDstOut.au8[ 9] = uSrc2.au8[4];
12057 uDstOut.au8[10] = uSrc1.au8[5];
12058 uDstOut.au8[11] = uSrc2.au8[5];
12059 uDstOut.au8[12] = uSrc1.au8[6];
12060 uDstOut.au8[13] = uSrc2.au8[6];
12061 uDstOut.au8[14] = uSrc1.au8[7];
12062 uDstOut.au8[15] = uSrc2.au8[7];
12063 *puDst = uDstOut;
12064}
12065
12066
12067IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12068{
12069 RTUINT256U const uSrc2 = *puSrc2;
12070 RTUINT256U const uSrc1 = *puSrc1;
12071 ASMCompilerBarrier();
12072 RTUINT256U uDstOut;
12073 uDstOut.au8[ 0] = uSrc1.au8[0];
12074 uDstOut.au8[ 1] = uSrc2.au8[0];
12075 uDstOut.au8[ 2] = uSrc1.au8[1];
12076 uDstOut.au8[ 3] = uSrc2.au8[1];
12077 uDstOut.au8[ 4] = uSrc1.au8[2];
12078 uDstOut.au8[ 5] = uSrc2.au8[2];
12079 uDstOut.au8[ 6] = uSrc1.au8[3];
12080 uDstOut.au8[ 7] = uSrc2.au8[3];
12081 uDstOut.au8[ 8] = uSrc1.au8[4];
12082 uDstOut.au8[ 9] = uSrc2.au8[4];
12083 uDstOut.au8[10] = uSrc1.au8[5];
12084 uDstOut.au8[11] = uSrc2.au8[5];
12085 uDstOut.au8[12] = uSrc1.au8[6];
12086 uDstOut.au8[13] = uSrc2.au8[6];
12087 uDstOut.au8[14] = uSrc1.au8[7];
12088 uDstOut.au8[15] = uSrc2.au8[7];
12089 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12090 uDstOut.au8[16] = uSrc1.au8[16];
12091 uDstOut.au8[17] = uSrc2.au8[16];
12092 uDstOut.au8[18] = uSrc1.au8[17];
12093 uDstOut.au8[19] = uSrc2.au8[17];
12094 uDstOut.au8[20] = uSrc1.au8[18];
12095 uDstOut.au8[21] = uSrc2.au8[18];
12096 uDstOut.au8[22] = uSrc1.au8[19];
12097 uDstOut.au8[23] = uSrc2.au8[19];
12098 uDstOut.au8[24] = uSrc1.au8[20];
12099 uDstOut.au8[25] = uSrc2.au8[20];
12100 uDstOut.au8[26] = uSrc1.au8[21];
12101 uDstOut.au8[27] = uSrc2.au8[21];
12102 uDstOut.au8[28] = uSrc1.au8[22];
12103 uDstOut.au8[29] = uSrc2.au8[22];
12104 uDstOut.au8[30] = uSrc1.au8[23];
12105 uDstOut.au8[31] = uSrc2.au8[23];
12106 *puDst = uDstOut;
12107}
12108
12109
/*
 * PUNPCKLWD - low words -> dwords
 */
12113#ifdef IEM_WITHOUT_ASSEMBLY
12114
12115IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12116{
12117 RTUINT64U const uSrc2 = { *puSrc };
12118 RTUINT64U const uSrc1 = { *puDst };
12119 ASMCompilerBarrier();
12120 RTUINT64U uDstOut;
12121 uDstOut.au16[0] = uSrc1.au16[0];
12122 uDstOut.au16[1] = uSrc2.au16[0];
12123 uDstOut.au16[2] = uSrc1.au16[1];
12124 uDstOut.au16[3] = uSrc2.au16[1];
12125 *puDst = uDstOut.u;
12126}
12127
12128
12129IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12130{
12131 RTUINT128U const uSrc2 = *puSrc;
12132 RTUINT128U const uSrc1 = *puDst;
12133 ASMCompilerBarrier();
12134 RTUINT128U uDstOut;
12135 uDstOut.au16[0] = uSrc1.au16[0];
12136 uDstOut.au16[1] = uSrc2.au16[0];
12137 uDstOut.au16[2] = uSrc1.au16[1];
12138 uDstOut.au16[3] = uSrc2.au16[1];
12139 uDstOut.au16[4] = uSrc1.au16[2];
12140 uDstOut.au16[5] = uSrc2.au16[2];
12141 uDstOut.au16[6] = uSrc1.au16[3];
12142 uDstOut.au16[7] = uSrc2.au16[3];
12143 *puDst = uDstOut;
12144}
12145
12146#endif
12147
12148IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12149{
12150 RTUINT128U const uSrc2 = *puSrc2;
12151 RTUINT128U const uSrc1 = *puSrc1;
12152 ASMCompilerBarrier();
12153 RTUINT128U uDstOut;
12154 uDstOut.au16[0] = uSrc1.au16[0];
12155 uDstOut.au16[1] = uSrc2.au16[0];
12156 uDstOut.au16[2] = uSrc1.au16[1];
12157 uDstOut.au16[3] = uSrc2.au16[1];
12158 uDstOut.au16[4] = uSrc1.au16[2];
12159 uDstOut.au16[5] = uSrc2.au16[2];
12160 uDstOut.au16[6] = uSrc1.au16[3];
12161 uDstOut.au16[7] = uSrc2.au16[3];
12162 *puDst = uDstOut;
12163}
12164
12165
12166IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12167{
12168 RTUINT256U const uSrc2 = *puSrc2;
12169 RTUINT256U const uSrc1 = *puSrc1;
12170 ASMCompilerBarrier();
12171 RTUINT256U uDstOut;
12172 uDstOut.au16[0] = uSrc1.au16[0];
12173 uDstOut.au16[1] = uSrc2.au16[0];
12174 uDstOut.au16[2] = uSrc1.au16[1];
12175 uDstOut.au16[3] = uSrc2.au16[1];
12176 uDstOut.au16[4] = uSrc1.au16[2];
12177 uDstOut.au16[5] = uSrc2.au16[2];
12178 uDstOut.au16[6] = uSrc1.au16[3];
12179 uDstOut.au16[7] = uSrc2.au16[3];
12180
12181 uDstOut.au16[8] = uSrc1.au16[8];
12182 uDstOut.au16[9] = uSrc2.au16[8];
12183 uDstOut.au16[10] = uSrc1.au16[9];
12184 uDstOut.au16[11] = uSrc2.au16[9];
12185 uDstOut.au16[12] = uSrc1.au16[10];
12186 uDstOut.au16[13] = uSrc2.au16[10];
12187 uDstOut.au16[14] = uSrc1.au16[11];
12188 uDstOut.au16[15] = uSrc2.au16[11];
12189 *puDst = uDstOut;
12190}
12191
12192
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
12196#ifdef IEM_WITHOUT_ASSEMBLY
12197
12198IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12199{
12200 RTUINT64U const uSrc2 = { *puSrc };
12201 RTUINT64U const uSrc1 = { *puDst };
12202 ASMCompilerBarrier();
12203 RTUINT64U uDstOut;
12204 uDstOut.au32[0] = uSrc1.au32[0];
12205 uDstOut.au32[1] = uSrc2.au32[0];
12206 *puDst = uDstOut.u;
12207}
12208
12209
12210IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12211{
12212 RTUINT128U const uSrc2 = *puSrc;
12213 RTUINT128U const uSrc1 = *puDst;
12214 ASMCompilerBarrier();
12215 RTUINT128U uDstOut;
12216 uDstOut.au32[0] = uSrc1.au32[0];
12217 uDstOut.au32[1] = uSrc2.au32[0];
12218 uDstOut.au32[2] = uSrc1.au32[1];
12219 uDstOut.au32[3] = uSrc2.au32[1];
12220 *puDst = uDstOut;
12221}
12222
12223#endif
12224
12225IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12226{
12227 RTUINT128U const uSrc2 = *puSrc2;
12228 RTUINT128U const uSrc1 = *puSrc1;
12229 ASMCompilerBarrier();
12230 RTUINT128U uDstOut;
12231 uDstOut.au32[0] = uSrc1.au32[0];
12232 uDstOut.au32[1] = uSrc2.au32[0];
12233 uDstOut.au32[2] = uSrc1.au32[1];
12234 uDstOut.au32[3] = uSrc2.au32[1];
12235 *puDst = uDstOut;
12236}
12237
12238
12239IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12240{
12241 RTUINT256U const uSrc2 = *puSrc2;
12242 RTUINT256U const uSrc1 = *puSrc1;
12243 ASMCompilerBarrier();
12244 RTUINT256U uDstOut;
12245 uDstOut.au32[0] = uSrc1.au32[0];
12246 uDstOut.au32[1] = uSrc2.au32[0];
12247 uDstOut.au32[2] = uSrc1.au32[1];
12248 uDstOut.au32[3] = uSrc2.au32[1];
12249
12250 uDstOut.au32[4] = uSrc1.au32[4];
12251 uDstOut.au32[5] = uSrc2.au32[4];
12252 uDstOut.au32[6] = uSrc1.au32[5];
12253 uDstOut.au32[7] = uSrc2.au32[5];
12254 *puDst = uDstOut;
12255}
12256
12257
12258/*
12259 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12260 */
12261#ifdef IEM_WITHOUT_ASSEMBLY
12262IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12263{
12264 RTUINT128U const uSrc2 = *puSrc;
12265 RTUINT128U const uSrc1 = *puDst;
12266 ASMCompilerBarrier();
12267 RTUINT128U uDstOut;
12268 uDstOut.au64[0] = uSrc1.au64[0];
12269 uDstOut.au64[1] = uSrc2.au64[0];
12270 *puDst = uDstOut;
12271}
12272#endif
12273
12274
12275IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12276{
12277 RTUINT128U const uSrc2 = *puSrc2;
12278 RTUINT128U const uSrc1 = *puSrc1;
12279 ASMCompilerBarrier();
12280 RTUINT128U uDstOut;
12281 uDstOut.au64[0] = uSrc1.au64[0];
12282 uDstOut.au64[1] = uSrc2.au64[0];
12283 *puDst = uDstOut;
12284}
12285
12286
12287IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12288{
12289 RTUINT256U const uSrc2 = *puSrc2;
12290 RTUINT256U const uSrc1 = *puSrc1;
12291 ASMCompilerBarrier();
12292 RTUINT256U uDstOut;
12293 uDstOut.au64[0] = uSrc1.au64[0];
12294 uDstOut.au64[1] = uSrc2.au64[0];
12295
12296 uDstOut.au64[2] = uSrc1.au64[2];
12297 uDstOut.au64[3] = uSrc2.au64[2];
12298 *puDst = uDstOut;
12299}
12300
12301
12302/*
12303 * PACKSSWB - signed words -> signed bytes
12304 */
12305
12306#ifdef IEM_WITHOUT_ASSEMBLY
12307
12308IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12309{
12310 RTUINT64U const uSrc2 = { *puSrc };
12311 RTUINT64U const uSrc1 = { *puDst };
12312 ASMCompilerBarrier();
12313 RTUINT64U uDstOut;
12314 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12315 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12316 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12317 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12318 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12319 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12320 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12321 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12322 *puDst = uDstOut.u;
12323}
12324
12325
12326IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12327{
12328 RTUINT128U const uSrc2 = *puSrc;
12329 RTUINT128U const uSrc1 = *puDst;
12330 ASMCompilerBarrier();
12331 RTUINT128U uDstOut;
12332 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12333 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12334 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12335 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12336 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12337 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12338 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12339 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12340 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12341 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12342 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12343 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12344 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12345 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12346 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12347 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12348 *puDst = uDstOut;
12349}
12350
12351#endif
12352
12353IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12354{
12355 RTUINT128U const uSrc2 = *puSrc2;
12356 RTUINT128U const uSrc1 = *puSrc1;
12357 ASMCompilerBarrier();
12358 RTUINT128U uDstOut;
12359 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12360 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12361 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12362 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12363 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12364 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12365 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12366 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12367 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12368 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12369 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12370 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12371 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12372 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12373 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12374 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12375 *puDst = uDstOut;
12376}
12377
12378
12379IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12380{
12381 RTUINT256U const uSrc2 = *puSrc2;
12382 RTUINT256U const uSrc1 = *puSrc1;
12383 ASMCompilerBarrier();
12384 RTUINT256U uDstOut;
12385 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12386 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12387 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12388 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12389 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12390 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12391 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12392 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12393 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12394 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12395 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12396 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12397 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12398 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12399 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12400 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12401
12402 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12403 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12404 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12405 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12406 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12407 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12408 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12409 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12410 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12411 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12412 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12413 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12414 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12415 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12416 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12417 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12418 *puDst = uDstOut;
12419}
12420
12421
12422/*
12423 * PACKUSWB - signed words -> unsigned bytes
12424 */
/** Converts a 16-bit value, interpreted as signed, to an unsigned byte with
 * saturation: values whose low 16 bits are 0x0000..0x00ff pass through,
 * anything else with bit 15 clear yields 0xff (UINT8_MAX) and anything with
 * bit 15 (the sign) set yields 0x00.
 * @note Evaluates @a a_iWord more than once. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) /* sign clear: 0xff; sign set: 0x00 */ \
      : (uint8_t)(a_iWord) )
12429
12430#ifdef IEM_WITHOUT_ASSEMBLY
12431
12432IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12433{
12434 RTUINT64U const uSrc2 = { *puSrc };
12435 RTUINT64U const uSrc1 = { *puDst };
12436 ASMCompilerBarrier();
12437 RTUINT64U uDstOut;
12438 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12439 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12440 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12441 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12442 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12443 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12444 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12445 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12446 *puDst = uDstOut.u;
12447}
12448
12449
12450IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12451{
12452 RTUINT128U const uSrc2 = *puSrc;
12453 RTUINT128U const uSrc1 = *puDst;
12454 ASMCompilerBarrier();
12455 RTUINT128U uDstOut;
12456 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12457 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12458 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12459 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12460 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12461 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12462 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12463 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12464 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12465 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12466 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12467 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12468 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12469 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12470 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12471 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12472 *puDst = uDstOut;
12473}
12474
12475#endif
12476
12477IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12478{
12479 RTUINT128U const uSrc2 = *puSrc2;
12480 RTUINT128U const uSrc1 = *puSrc1;
12481 ASMCompilerBarrier();
12482 RTUINT128U uDstOut;
12483 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12484 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12485 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12486 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12487 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12488 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12489 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12490 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12491 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12492 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12493 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12494 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12495 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12496 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12497 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12498 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12499 *puDst = uDstOut;
12500}
12501
12502
12503IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12504{
12505 RTUINT256U const uSrc2 = *puSrc2;
12506 RTUINT256U const uSrc1 = *puSrc1;
12507 ASMCompilerBarrier();
12508 RTUINT256U uDstOut;
12509 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12510 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12511 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12512 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12513 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12514 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12515 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12516 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12517 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12518 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12519 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12520 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12521 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12522 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12523 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12524 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12525
12526 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12527 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12528 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12529 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12530 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12531 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12532 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12533 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12534 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12535 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12536 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12537 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12538 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12539 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12540 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12541 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12542 *puDst = uDstOut;
12543}
12544
12545
12546/*
12547 * PACKSSDW - signed dwords -> signed words
12548 */
12549
12550#ifdef IEM_WITHOUT_ASSEMBLY
12551
12552IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12553{
12554 RTUINT64U const uSrc2 = { *puSrc };
12555 RTUINT64U const uSrc1 = { *puDst };
12556 ASMCompilerBarrier();
12557 RTUINT64U uDstOut;
12558 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12559 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12560 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12561 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12562 *puDst = uDstOut.u;
12563}
12564
12565
12566IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12567{
12568 RTUINT128U const uSrc2 = *puSrc;
12569 RTUINT128U const uSrc1 = *puDst;
12570 ASMCompilerBarrier();
12571 RTUINT128U uDstOut;
12572 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12573 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12574 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12575 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12576 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12577 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12578 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12579 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12580 *puDst = uDstOut;
12581}
12582
12583#endif
12584
12585IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12586{
12587 RTUINT128U const uSrc2 = *puSrc2;
12588 RTUINT128U const uSrc1 = *puSrc1;
12589 ASMCompilerBarrier();
12590 RTUINT128U uDstOut;
12591 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12592 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12593 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12594 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12595 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12596 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12597 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12598 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12599 *puDst = uDstOut;
12600}
12601
12602
12603IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12604{
12605 RTUINT256U const uSrc2 = *puSrc2;
12606 RTUINT256U const uSrc1 = *puSrc1;
12607 ASMCompilerBarrier();
12608 RTUINT256U uDstOut;
12609 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12610 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12611 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12612 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12613 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12614 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12615 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12616 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12617
12618 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12619 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12620 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12621 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12622 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12623 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12624 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12625 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12626 *puDst = uDstOut;
12627}
12628
12629
12630/*
12631 * PACKUSDW - signed dwords -> unsigned words
12632 */
/**
 * Converts a dword, interpreted as signed, to an unsigned word with saturation.
 *
 * - Upper 16 bits all clear: the value fits and is returned unchanged.
 * - Bit 31 (the sign bit) set: the value is negative and clamps to 0.
 * - Anything else: the value is too large and clamps to 0xffff.
 *
 * @note The argument is evaluated more than once, so it must not have side
 *       effects.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( ((uint32_t)(a_iDword) >> 16) == 0 \
      ? (uint16_t)(a_iDword) \
      : (((uint32_t)(a_iDword) >> 31) != 0 ? 0 : 0xffff) )
12637
12638#ifdef IEM_WITHOUT_ASSEMBLY
12639IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12640{
12641 RTUINT128U const uSrc2 = *puSrc;
12642 RTUINT128U const uSrc1 = *puDst;
12643 ASMCompilerBarrier();
12644 RTUINT128U uDstOut;
12645 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12646 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12647 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12648 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12649 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12650 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12651 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12652 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12653 *puDst = uDstOut;
12654}
12655#endif
12656
12657IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12658{
12659 RTUINT128U const uSrc2 = *puSrc2;
12660 RTUINT128U const uSrc1 = *puSrc1;
12661 ASMCompilerBarrier();
12662 RTUINT128U uDstOut;
12663 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12664 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12665 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12666 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12667 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12668 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12669 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12670 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12671 *puDst = uDstOut;
12672}
12673
12674
12675IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12676{
12677 RTUINT256U const uSrc2 = *puSrc2;
12678 RTUINT256U const uSrc1 = *puSrc1;
12679 ASMCompilerBarrier();
12680 RTUINT256U uDstOut;
12681 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12682 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12683 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12684 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12685 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12686 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12687 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12688 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12689
12690 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12691 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12692 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12693 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12694 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12695 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12696 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12697 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12698 *puDst = uDstOut;
12699}
12700
12701
12702/*
12703 * [V]PABSB / [V]PABSW / [V]PABSD
12704 */
12705
12706IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12707{
12708 RTUINT64U const uSrc = { *puSrc };
12709 RTUINT64U uDstOut = { 0 };
12710
12711 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12712 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12713 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12714 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12715 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12716 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12717 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12718 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12719 *puDst = uDstOut.u;
12720 RT_NOREF(pFpuState);
12721}
12722
12723
12724IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12725{
12726 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12727 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12728 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12729 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12730 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12731 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12732 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12733 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12734 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12735 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12736 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12737 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12738 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12739 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12740 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12741 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12742 RT_NOREF(pFpuState);
12743}
12744
12745
12746IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12747{
12748 RTUINT64U const uSrc = { *puSrc };
12749 RTUINT64U uDstOut = { 0 };
12750
12751 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12752 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12753 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12754 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12755 *puDst = uDstOut.u;
12756 RT_NOREF(pFpuState);
12757}
12758
12759
12760IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12761{
12762 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12763 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12764 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12765 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12766 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12767 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12768 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12769 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12770 RT_NOREF(pFpuState);
12771}
12772
12773
12774IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12775{
12776 RTUINT64U const uSrc = { *puSrc };
12777 RTUINT64U uDstOut = { 0 };
12778
12779 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12780 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12781 *puDst = uDstOut.u;
12782 RT_NOREF(pFpuState);
12783}
12784
12785
12786IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12787{
12788 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12789 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12790 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12791 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12792 RT_NOREF(pFpuState);
12793}
12794
12795
12796IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12797{
12798 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12799 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12800 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12801 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12802 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12803 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12804 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12805 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12806 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12807 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12808 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12809 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12810 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12811 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12812 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12813 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12814}
12815
12816
12817IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12818{
12819 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12820 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12821 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12822 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12823 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12824 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12825 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12826 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12827 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12828 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12829 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12830 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12831 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12832 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12833 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12834 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12835 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12836 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12837 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12838 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12839 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12840 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12841 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12842 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12843 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12844 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12845 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12846 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12847 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12848 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12849 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12850 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12851}
12852
12853
12854IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12855{
12856 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12857 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12858 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12859 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12860 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12861 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12862 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12863 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12864}
12865
12866
12867IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12868{
12869 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12870 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12871 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12872 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12873 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12874 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12875 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12876 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12877 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12878 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12879 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12880 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12881 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12882 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12883 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12884 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12885}
12886
12887
12888IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12889{
12890 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12891 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12892 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12893 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12894}
12895
12896
12897IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12898{
12899 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12900 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12901 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12902 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12903 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12904 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12905 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12906 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12907}
12908
12909
12910/*
12911 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12912 */
12913IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12914{
12915 RTUINT64U uSrc1 = { *puDst };
12916 RTUINT64U uSrc2 = { *puSrc };
12917 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12918
12919 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12920 {
12921 if (uSrc2.ai8[i] < 0)
12922 uDst.ai8[i] = -uSrc1.ai8[i];
12923 else if (uSrc2.ai8[i] == 0)
12924 uDst.ai8[i] = 0;
12925 else /* uSrc2.ai8[i] > 0 */
12926 uDst.ai8[i] = uSrc1.ai8[i];
12927 }
12928
12929 *puDst = uDst.u;
12930 RT_NOREF(pFpuState);
12931}
12932
12933
12934IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12935{
12936 RTUINT128U uSrc1 = *puDst;
12937
12938 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12939 {
12940 if (puSrc->ai8[i] < 0)
12941 puDst->ai8[i] = -uSrc1.ai8[i];
12942 else if (puSrc->ai8[i] == 0)
12943 puDst->ai8[i] = 0;
12944 else /* puSrc->ai8[i] > 0 */
12945 puDst->ai8[i] = uSrc1.ai8[i];
12946 }
12947
12948 RT_NOREF(pFpuState);
12949}
12950
12951
12952IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12953{
12954 RTUINT64U uSrc1 = { *puDst };
12955 RTUINT64U uSrc2 = { *puSrc };
12956 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12957
12958 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12959 {
12960 if (uSrc2.ai16[i] < 0)
12961 uDst.ai16[i] = -uSrc1.ai16[i];
12962 else if (uSrc2.ai16[i] == 0)
12963 uDst.ai16[i] = 0;
12964 else /* uSrc2.ai16[i] > 0 */
12965 uDst.ai16[i] = uSrc1.ai16[i];
12966 }
12967
12968 *puDst = uDst.u;
12969 RT_NOREF(pFpuState);
12970}
12971
12972
12973IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12974{
12975 RTUINT128U uSrc1 = *puDst;
12976
12977 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12978 {
12979 if (puSrc->ai16[i] < 0)
12980 puDst->ai16[i] = -uSrc1.ai16[i];
12981 else if (puSrc->ai16[i] == 0)
12982 puDst->ai16[i] = 0;
12983 else /* puSrc->ai16[i] > 0 */
12984 puDst->ai16[i] = uSrc1.ai16[i];
12985 }
12986
12987 RT_NOREF(pFpuState);
12988}
12989
12990
12991IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12992{
12993 RTUINT64U uSrc1 = { *puDst };
12994 RTUINT64U uSrc2 = { *puSrc };
12995 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12996
12997 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12998 {
12999 if (uSrc2.ai32[i] < 0)
13000 uDst.ai32[i] = -uSrc1.ai32[i];
13001 else if (uSrc2.ai32[i] == 0)
13002 uDst.ai32[i] = 0;
13003 else /* uSrc2.ai32[i] > 0 */
13004 uDst.ai32[i] = uSrc1.ai32[i];
13005 }
13006
13007 *puDst = uDst.u;
13008 RT_NOREF(pFpuState);
13009}
13010
13011
13012IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13013{
13014 RTUINT128U uSrc1 = *puDst;
13015
13016 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13017 {
13018 if (puSrc->ai32[i] < 0)
13019 puDst->ai32[i] = -uSrc1.ai32[i];
13020 else if (puSrc->ai32[i] == 0)
13021 puDst->ai32[i] = 0;
13022 else /* puSrc->ai32[i] > 0 */
13023 puDst->ai32[i] = uSrc1.ai32[i];
13024 }
13025
13026 RT_NOREF(pFpuState);
13027}
13028
13029
13030IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13031{
13032 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13033 {
13034 if (puSrc2->ai8[i] < 0)
13035 puDst->ai8[i] = -puSrc1->ai8[i];
13036 else if (puSrc2->ai8[i] == 0)
13037 puDst->ai8[i] = 0;
13038 else /* puSrc2->ai8[i] > 0 */
13039 puDst->ai8[i] = puSrc1->ai8[i];
13040 }
13041}
13042
13043
13044IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13045{
13046 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13047 {
13048 if (puSrc2->ai8[i] < 0)
13049 puDst->ai8[i] = -puSrc1->ai8[i];
13050 else if (puSrc2->ai8[i] == 0)
13051 puDst->ai8[i] = 0;
13052 else /* puSrc2->ai8[i] > 0 */
13053 puDst->ai8[i] = puSrc1->ai8[i];
13054 }
13055}
13056
13057
13058IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13059{
13060 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13061 {
13062 if (puSrc2->ai16[i] < 0)
13063 puDst->ai16[i] = -puSrc1->ai16[i];
13064 else if (puSrc2->ai16[i] == 0)
13065 puDst->ai16[i] = 0;
13066 else /* puSrc2->ai16[i] > 0 */
13067 puDst->ai16[i] = puSrc1->ai16[i];
13068 }
13069}
13070
13071
13072IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13073{
13074 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13075 {
13076 if (puSrc2->ai16[i] < 0)
13077 puDst->ai16[i] = -puSrc1->ai16[i];
13078 else if (puSrc2->ai16[i] == 0)
13079 puDst->ai16[i] = 0;
13080 else /* puSrc2->ai16[i] > 0 */
13081 puDst->ai16[i] = puSrc1->ai16[i];
13082 }
13083}
13084
13085
13086IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13087{
13088 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13089 {
13090 if (puSrc2->ai32[i] < 0)
13091 puDst->ai32[i] = -puSrc1->ai32[i];
13092 else if (puSrc2->ai32[i] == 0)
13093 puDst->ai32[i] = 0;
13094 else /* puSrc2->ai32[i] > 0 */
13095 puDst->ai32[i] = puSrc1->ai32[i];
13096 }
13097}
13098
13099
13100IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13101{
13102 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13103 {
13104 if (puSrc2->ai32[i] < 0)
13105 puDst->ai32[i] = -puSrc1->ai32[i];
13106 else if (puSrc2->ai32[i] == 0)
13107 puDst->ai32[i] = 0;
13108 else /* puSrc2->ai32[i] > 0 */
13109 puDst->ai32[i] = puSrc1->ai32[i];
13110 }
13111}
13112
13113
13114/*
13115 * PHADDW / VPHADDW / PHADDD / VPHADDD
13116 */
13117IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13118{
13119 RTUINT64U uSrc1 = { *puDst };
13120 RTUINT64U uSrc2 = { *puSrc };
13121 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13122
13123 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13124 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13125 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13126 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13127 *puDst = uDst.u;
13128 RT_NOREF(pFpuState);
13129}
13130
13131
13132IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13133{
13134 RTUINT128U uSrc1 = *puDst;
13135
13136 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13137 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13138 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13139 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13140
13141 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13142 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13143 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13144 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13145 RT_NOREF(pFpuState);
13146}
13147
13148
13149IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13150{
13151 RTUINT64U uSrc1 = { *puDst };
13152 RTUINT64U uSrc2 = { *puSrc };
13153 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13154
13155 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13156 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13157 *puDst = uDst.u;
13158 RT_NOREF(pFpuState);
13159}
13160
13161
13162IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13163{
13164 RTUINT128U uSrc1 = *puDst;
13165
13166 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13167 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13168
13169 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13170 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13171 RT_NOREF(pFpuState);
13172}
13173
13174
13175IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13176{
13177 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13178
13179 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13180 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13181 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13182 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13183
13184 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13185 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13186 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13187 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13188
13189 puDst->au64[0] = uDst.au64[0];
13190 puDst->au64[1] = uDst.au64[1];
13191}
13192
13193
13194IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13195{
13196 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13197
13198 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13199 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13200 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13201 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13202 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13203 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13204 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13205 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13206
13207 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13208 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13209 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13210 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13211 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13212 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13213 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13214 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13215
13216 puDst->au64[0] = uDst.au64[0];
13217 puDst->au64[1] = uDst.au64[1];
13218 puDst->au64[2] = uDst.au64[2];
13219 puDst->au64[3] = uDst.au64[3];
13220}
13221
13222
13223IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13224{
13225 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13226
13227 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13228 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13229
13230 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13231 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13232
13233 puDst->au64[0] = uDst.au64[0];
13234 puDst->au64[1] = uDst.au64[1];
13235}
13236
13237
13238IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13239{
13240 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13241
13242 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13243 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13244 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13245 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13246
13247 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13248 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13249 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13250 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13251
13252 puDst->au64[0] = uDst.au64[0];
13253 puDst->au64[1] = uDst.au64[1];
13254 puDst->au64[2] = uDst.au64[2];
13255 puDst->au64[3] = uDst.au64[3];
13256}
13257
13258
13259/*
13260 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13261 */
13262IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13263{
13264 RTUINT64U uSrc1 = { *puDst };
13265 RTUINT64U uSrc2 = { *puSrc };
13266 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13267
13268 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13269 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13270 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13271 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13272 *puDst = uDst.u;
13273 RT_NOREF(pFpuState);
13274}
13275
13276
13277IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13278{
13279 RTUINT128U uSrc1 = *puDst;
13280
13281 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13282 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13283 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13284 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13285
13286 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13287 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13288 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13289 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13290 RT_NOREF(pFpuState);
13291}
13292
13293
13294IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13295{
13296 RTUINT64U uSrc1 = { *puDst };
13297 RTUINT64U uSrc2 = { *puSrc };
13298 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13299
13300 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13301 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13302 *puDst = uDst.u;
13303 RT_NOREF(pFpuState);
13304}
13305
13306
13307IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13308{
13309 RTUINT128U uSrc1 = *puDst;
13310
13311 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13312 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13313
13314 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13315 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13316 RT_NOREF(pFpuState);
13317}
13318
13319
13320IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13321{
13322 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13323
13324 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13325 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13326 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13327 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13328
13329 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13330 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13331 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13332 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13333
13334 puDst->au64[0] = uDst.au64[0];
13335 puDst->au64[1] = uDst.au64[1];
13336}
13337
13338
13339IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13340{
13341 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13342
13343 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13344 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13345 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13346 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13347 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13348 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13349 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13350 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13351
13352 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13353 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13354 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13355 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13356 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13357 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13358 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13359 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13360
13361 puDst->au64[0] = uDst.au64[0];
13362 puDst->au64[1] = uDst.au64[1];
13363 puDst->au64[2] = uDst.au64[2];
13364 puDst->au64[3] = uDst.au64[3];
13365}
13366
13367
13368IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13369{
13370 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13371
13372 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13373 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13374
13375 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13376 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13377
13378 puDst->au64[0] = uDst.au64[0];
13379 puDst->au64[1] = uDst.au64[1];
13380}
13381
13382
13383IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13384{
13385 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13386
13387 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13388 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13389 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13390 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13391
13392 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13393 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13394 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13395 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13396
13397 puDst->au64[0] = uDst.au64[0];
13398 puDst->au64[1] = uDst.au64[1];
13399 puDst->au64[2] = uDst.au64[2];
13400 puDst->au64[3] = uDst.au64[3];
13401}
13402
13403
13404/*
13405 * PHADDSW / VPHADDSW
13406 */
13407IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13408{
13409 RTUINT64U uSrc1 = { *puDst };
13410 RTUINT64U uSrc2 = { *puSrc };
13411 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13412
13413 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13414 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13415 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13416 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13417 *puDst = uDst.u;
13418 RT_NOREF(pFpuState);
13419}
13420
13421
13422IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13423{
13424 RTUINT128U uSrc1 = *puDst;
13425
13426 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13427 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13428 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13429 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13430
13431 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13432 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13433 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13434 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13435 RT_NOREF(pFpuState);
13436}
13437
13438
13439IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13440{
13441 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13442
13443 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13444 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13445 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13446 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13447
13448 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13449 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13450 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13451 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13452
13453 puDst->au64[0] = uDst.au64[0];
13454 puDst->au64[1] = uDst.au64[1];
13455}
13456
13457
13458IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13459{
13460 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13461
13462 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13463 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13464 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13465 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13466 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13467 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13468 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13469 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13470
13471 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13472 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13473 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13474 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13475 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13476 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13477 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13478 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13479
13480 puDst->au64[0] = uDst.au64[0];
13481 puDst->au64[1] = uDst.au64[1];
13482 puDst->au64[2] = uDst.au64[2];
13483 puDst->au64[3] = uDst.au64[3];
13484}
13485
13486
13487/*
13488 * PHSUBSW / VPHSUBSW
13489 */
13490IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13491{
13492 RTUINT64U uSrc1 = { *puDst };
13493 RTUINT64U uSrc2 = { *puSrc };
13494 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13495
13496 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13497 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13498 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13499 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13500 *puDst = uDst.u;
13501 RT_NOREF(pFpuState);
13502}
13503
13504
13505IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13506{
13507 RTUINT128U uSrc1 = *puDst;
13508
13509 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13510 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13511 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13512 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13513
13514 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13515 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13516 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13517 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13518 RT_NOREF(pFpuState);
13519}
13520
13521
13522IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13523{
13524 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13525
13526 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13527 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13528 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13529 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13530
13531 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13532 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13533 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13534 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13535
13536 puDst->au64[0] = uDst.au64[0];
13537 puDst->au64[1] = uDst.au64[1];
13538}
13539
13540
13541IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13542{
13543 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13544
13545 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13546 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13547 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13548 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13549 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13550 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13551 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13552 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13553
13554 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13555 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13556 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13557 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13558 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13559 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13560 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13561 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13562
13563 puDst->au64[0] = uDst.au64[0];
13564 puDst->au64[1] = uDst.au64[1];
13565 puDst->au64[2] = uDst.au64[2];
13566 puDst->au64[3] = uDst.au64[3];
13567}
13568
13569
13570/*
13571 * PMADDUBSW / VPMADDUBSW
13572 */
13573IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13574{
13575 RTUINT64U uSrc1 = { *puDst };
13576 RTUINT64U uSrc2 = { *puSrc };
13577 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13578
13579 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13580 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13581 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13582 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13583 *puDst = uDst.u;
13584 RT_NOREF(pFpuState);
13585}
13586
13587
13588IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13589{
13590 RTUINT128U uSrc1 = *puDst;
13591
13592 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13593 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13594 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13595 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13596 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13597 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13598 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13599 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13600 RT_NOREF(pFpuState);
13601}
13602
13603
13604IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13605{
13606 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13607
13608 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13609 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13610 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13611 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13612 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13613 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13614 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13615 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13616
13617 puDst->au64[0] = uDst.au64[0];
13618 puDst->au64[1] = uDst.au64[1];
13619}
13620
13621
13622IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13623{
13624 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13625
13626 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13627 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13628 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13629 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13630 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13631 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13632 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13633 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13634 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13635 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13636 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13637 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13638 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13639 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13640 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13641 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13642
13643 puDst->au64[0] = uDst.au64[0];
13644 puDst->au64[1] = uDst.au64[1];
13645 puDst->au64[2] = uDst.au64[2];
13646 puDst->au64[3] = uDst.au64[3];
13647}
13648
13649
13650/*
13651 * PMULHRSW / VPMULHRSW
13652 */
/** Computes one PMULHRSW result word: the 32-bit product of the two inputs is
 *  shifted right by 14, incremented by one (rounding) and shifted right by one
 *  more bit; the low 16 bits of that value are the result. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
13655
13656IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13657{
13658 RTUINT64U uSrc1 = { *puDst };
13659 RTUINT64U uSrc2 = { *puSrc };
13660 RTUINT64U uDst;
13661
13662 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13663 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13664 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13665 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13666 *puDst = uDst.u;
13667 RT_NOREF(pFpuState);
13668}
13669
13670
13671IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13672{
13673 RTUINT128U uSrc1 = *puDst;
13674
13675 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13676 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13677 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13678 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13679 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13680 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13681 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13682 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13683 RT_NOREF(pFpuState);
13684}
13685
13686
13687IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13688{
13689 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13690
13691 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13692 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13693 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13694 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13695 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13696 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13697 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13698 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13699
13700 puDst->au64[0] = uDst.au64[0];
13701 puDst->au64[1] = uDst.au64[1];
13702}
13703
13704
13705IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13706{
13707 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13708
13709 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13710 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13711 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13712 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13713 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13714 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13715 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13716 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13717 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13718 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13719 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13720 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13721 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13722 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13723 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13724 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13725
13726 puDst->au64[0] = uDst.au64[0];
13727 puDst->au64[1] = uDst.au64[1];
13728 puDst->au64[2] = uDst.au64[2];
13729 puDst->au64[3] = uDst.au64[3];
13730}
13731
13732
13733/*
13734 * PSADBW / VPSADBW
13735 */
13736#ifdef IEM_WITHOUT_ASSEMBLY
13737
13738IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13739{
13740 RTUINT64U uSrc1 = { *puDst };
13741 RTUINT64U uSrc2 = { *puSrc };
13742 RTUINT64U uDst;
13743 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13744 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13745 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13746 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13747 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13748 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13749 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13750 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13751
13752 uDst.au64[0] = 0;
13753 uDst.au16[0] = uSum;
13754 *puDst = uDst.u;
13755}
13756
13757
13758IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13759{
13760 RTUINT128U uSrc1 = *puDst;
13761
13762 puDst->au64[0] = 0;
13763 puDst->au64[1] = 0;
13764
13765 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13766 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13767 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13768 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13769 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13770 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13771 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13772 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13773 puDst->au16[0] = uSum;
13774
13775 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13776 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13777 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13778 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13779 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13780 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13781 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13782 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13783 puDst->au16[4] = uSum;
13784}
13785
13786#endif
13787
13788IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13789{
13790 RTUINT128U uSrc1 = *puSrc1;
13791 RTUINT128U uSrc2 = *puSrc2;
13792
13793 puDst->au64[0] = 0;
13794 puDst->au64[1] = 0;
13795
13796 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13797 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13798 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13799 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13800 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13801 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13802 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13803 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13804 puDst->au16[0] = uSum;
13805
13806 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13807 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13808 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13809 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13810 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13811 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13812 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13813 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13814 puDst->au16[4] = uSum;
13815}
13816
13817IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13818{
13819 RTUINT256U uSrc1 = *puSrc1;
13820 RTUINT256U uSrc2 = *puSrc2;
13821
13822 puDst->au64[0] = 0;
13823 puDst->au64[1] = 0;
13824 puDst->au64[2] = 0;
13825 puDst->au64[3] = 0;
13826
13827 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13828 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13829 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13830 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13831 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13832 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13833 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13834 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13835 puDst->au16[0] = uSum;
13836
13837 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13838 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13839 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13840 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13841 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13842 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13843 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13844 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13845 puDst->au16[4] = uSum;
13846
13847 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13848 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13849 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13850 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13851 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13852 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13853 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13854 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13855 puDst->au16[8] = uSum;
13856
13857 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13858 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13859 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13860 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13861 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13862 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13863 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13864 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13865 puDst->au16[12] = uSum;
13866}
13867
13868
13869/*
13870 * PMULDQ / VPMULDQ
13871 */
13872IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13873{
13874 RTUINT128U uSrc1 = *puDst;
13875
13876 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13877 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13878}
13879
13880IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13881{
13882 RTUINT128U uSrc1 = *puSrc1;
13883 RTUINT128U uSrc2 = *puSrc2;
13884
13885 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13886 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13887}
13888
13889IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13890{
13891 RTUINT256U uSrc1 = *puSrc1;
13892 RTUINT256U uSrc2 = *puSrc2;
13893
13894 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13895 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13896 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13897 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13898}
13899
13900
13901/*
13902 * PMULUDQ / VPMULUDQ
13903 */
13904#ifdef IEM_WITHOUT_ASSEMBLY
13905
13906IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13907{
13908 RTUINT64U uSrc1 = { *puDst };
13909 RTUINT64U uSrc2 = { *puSrc };
13910 ASMCompilerBarrier();
13911 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13912 RT_NOREF(pFpuState);
13913}
13914
13915
13916IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13917{
13918 RTUINT128U uSrc1 = *puDst;
13919 RTUINT128U uSrc2 = *puSrc;
13920 ASMCompilerBarrier();
13921 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13922 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13923 RT_NOREF(pFpuState);
13924}
13925
13926#endif
13927
13928IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13929{
13930 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13931 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13932 ASMCompilerBarrier();
13933 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13934 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13935}
13936
13937
13938IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13939{
13940 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13941 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13942 ASMCompilerBarrier();
13943 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13944 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13945 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13946 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13947}
13948
13949
13950/*
13951 * UNPCKLPS / VUNPCKLPS
13952 */
13953#ifdef IEM_WITHOUT_ASSEMBLY
13954IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13955{
13956 RTUINT128U uSrc1 = *puDst;
13957 RTUINT128U uSrc2 = *puSrc;
13958 ASMCompilerBarrier();
13959 puDst->au32[0] = uSrc1.au32[0];
13960 puDst->au32[1] = uSrc2.au32[0];
13961 puDst->au32[2] = uSrc1.au32[1];
13962 puDst->au32[3] = uSrc2.au32[1];
13963}
13964
13965#endif
13966
13967IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13968{
13969 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13970 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13971 ASMCompilerBarrier();
13972 puDst->au32[0] = uSrc1.au32[0];
13973 puDst->au32[1] = uSrc2.au32[0];
13974 puDst->au32[2] = uSrc1.au32[1];
13975 puDst->au32[3] = uSrc2.au32[1];
13976}
13977
13978
13979IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13980{
13981 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13982 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13983 ASMCompilerBarrier();
13984 puDst->au32[0] = uSrc1.au32[0];
13985 puDst->au32[1] = uSrc2.au32[0];
13986 puDst->au32[2] = uSrc1.au32[1];
13987 puDst->au32[3] = uSrc2.au32[1];
13988
13989 puDst->au32[4] = uSrc1.au32[4];
13990 puDst->au32[5] = uSrc2.au32[4];
13991 puDst->au32[6] = uSrc1.au32[5];
13992 puDst->au32[7] = uSrc2.au32[5];
13993}
13994
13995
13996/*
13997 * UNPCKLPD / VUNPCKLPD
13998 */
13999#ifdef IEM_WITHOUT_ASSEMBLY
14000IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14001{
14002 RTUINT128U uSrc1 = *puDst;
14003 RTUINT128U uSrc2 = *puSrc;
14004 ASMCompilerBarrier();
14005 puDst->au64[0] = uSrc1.au64[0];
14006 puDst->au64[1] = uSrc2.au64[0];
14007}
14008
14009#endif
14010
14011IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14012{
14013 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14014 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14015 ASMCompilerBarrier();
14016 puDst->au64[0] = uSrc1.au64[0];
14017 puDst->au64[1] = uSrc2.au64[0];
14018}
14019
14020
14021IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14022{
14023 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14024 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14025 ASMCompilerBarrier();
14026 puDst->au64[0] = uSrc1.au64[0];
14027 puDst->au64[1] = uSrc2.au64[0];
14028 puDst->au64[2] = uSrc1.au64[2];
14029 puDst->au64[3] = uSrc2.au64[2];
14030}
14031
14032
14033/*
14034 * UNPCKHPS / VUNPCKHPS
14035 */
14036#ifdef IEM_WITHOUT_ASSEMBLY
14037IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14038{
14039 RTUINT128U uSrc1 = *puDst;
14040 RTUINT128U uSrc2 = *puSrc;
14041 ASMCompilerBarrier();
14042 puDst->au32[0] = uSrc1.au32[2];
14043 puDst->au32[1] = uSrc2.au32[2];
14044 puDst->au32[2] = uSrc1.au32[3];
14045 puDst->au32[3] = uSrc2.au32[3];
14046}
14047
14048#endif
14049
14050IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14051{
14052 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14053 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14054 ASMCompilerBarrier();
14055 puDst->au32[0] = uSrc1.au32[2];
14056 puDst->au32[1] = uSrc2.au32[2];
14057 puDst->au32[2] = uSrc1.au32[3];
14058 puDst->au32[3] = uSrc2.au32[3];
14059}
14060
14061
14062IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14063{
14064 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14065 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14066 ASMCompilerBarrier();
14067 puDst->au32[0] = uSrc1.au32[2];
14068 puDst->au32[1] = uSrc2.au32[2];
14069 puDst->au32[2] = uSrc1.au32[3];
14070 puDst->au32[3] = uSrc2.au32[3];
14071
14072 puDst->au32[4] = uSrc1.au32[6];
14073 puDst->au32[5] = uSrc2.au32[6];
14074 puDst->au32[6] = uSrc1.au32[7];
14075 puDst->au32[7] = uSrc2.au32[7];
14076}
14077
14078
14079/*
14080 * UNPCKHPD / VUNPCKHPD
14081 */
14082#ifdef IEM_WITHOUT_ASSEMBLY
14083IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14084{
14085 RTUINT128U uSrc1 = *puDst;
14086 RTUINT128U uSrc2 = *puSrc;
14087 ASMCompilerBarrier();
14088 puDst->au64[0] = uSrc1.au64[1];
14089 puDst->au64[1] = uSrc2.au64[1];
14090}
14091
14092#endif
14093
14094IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14095{
14096 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14097 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14098 ASMCompilerBarrier();
14099 puDst->au64[0] = uSrc1.au64[1];
14100 puDst->au64[1] = uSrc2.au64[1];
14101}
14102
14103
14104IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14105{
14106 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14107 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14108 ASMCompilerBarrier();
14109 puDst->au64[0] = uSrc1.au64[1];
14110 puDst->au64[1] = uSrc2.au64[1];
14111 puDst->au64[2] = uSrc1.au64[3];
14112 puDst->au64[3] = uSrc2.au64[3];
14113}
14114
14115
/*
 * CRC32 (SSE 4.2).
 */
14119
14120IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14121{
14122 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14123}
14124
14125
14126IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14127{
14128 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14129}
14130
14131IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14132{
14133 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14134}
14135
14136IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14137{
14138 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14139}
14140
14141
/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
 */
14145#ifdef IEM_WITHOUT_ASSEMBLY
14146IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14147{
14148 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14149 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14150 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14151 fEfl |= X86_EFL_ZF;
14152 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14153 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14154 fEfl |= X86_EFL_CF;
14155 *pfEFlags = fEfl;
14156}
14157#endif
14158
14159IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14160{
14161 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14162 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14163 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14164 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14165 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14166 fEfl |= X86_EFL_ZF;
14167 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14168 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14169 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14170 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14171 fEfl |= X86_EFL_CF;
14172 *pfEFlags = fEfl;
14173}
14174
14175
14176/*
14177 * PMOVSXBW / VPMOVSXBW
14178 */
14179IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14180{
14181 RTUINT64U uSrc1 = { uSrc };
14182 puDst->ai16[0] = uSrc1.ai8[0];
14183 puDst->ai16[1] = uSrc1.ai8[1];
14184 puDst->ai16[2] = uSrc1.ai8[2];
14185 puDst->ai16[3] = uSrc1.ai8[3];
14186 puDst->ai16[4] = uSrc1.ai8[4];
14187 puDst->ai16[5] = uSrc1.ai8[5];
14188 puDst->ai16[6] = uSrc1.ai8[6];
14189 puDst->ai16[7] = uSrc1.ai8[7];
14190}
14191
14192
14193IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14194{
14195 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14196 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14197 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14198 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14199 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14200 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14201 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14202 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14203 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14204 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14205 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14206 puDst->ai16[10] = uSrc1.ai8[10];
14207 puDst->ai16[11] = uSrc1.ai8[11];
14208 puDst->ai16[12] = uSrc1.ai8[12];
14209 puDst->ai16[13] = uSrc1.ai8[13];
14210 puDst->ai16[14] = uSrc1.ai8[14];
14211 puDst->ai16[15] = uSrc1.ai8[15];
14212}
14213
14214
14215/*
14216 * PMOVSXBD / VPMOVSXBD
14217 */
14218IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14219{
14220 RTUINT32U uSrc1 = { uSrc };
14221 puDst->ai32[0] = uSrc1.ai8[0];
14222 puDst->ai32[1] = uSrc1.ai8[1];
14223 puDst->ai32[2] = uSrc1.ai8[2];
14224 puDst->ai32[3] = uSrc1.ai8[3];
14225}
14226
14227
14228IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14229{
14230 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14231 puDst->ai32[0] = uSrc1.ai8[0];
14232 puDst->ai32[1] = uSrc1.ai8[1];
14233 puDst->ai32[2] = uSrc1.ai8[2];
14234 puDst->ai32[3] = uSrc1.ai8[3];
14235 puDst->ai32[4] = uSrc1.ai8[4];
14236 puDst->ai32[5] = uSrc1.ai8[5];
14237 puDst->ai32[6] = uSrc1.ai8[6];
14238 puDst->ai32[7] = uSrc1.ai8[7];
14239}
14240
14241
14242/*
14243 * PMOVSXBQ / VPMOVSXBQ
14244 */
14245IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14246{
14247 RTUINT16U uSrc1 = { uSrc };
14248 puDst->ai64[0] = uSrc1.ai8[0];
14249 puDst->ai64[1] = uSrc1.ai8[1];
14250}
14251
14252
14253IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14254{
14255 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14256 puDst->ai64[0] = uSrc1.ai8[0];
14257 puDst->ai64[1] = uSrc1.ai8[1];
14258 puDst->ai64[2] = uSrc1.ai8[2];
14259 puDst->ai64[3] = uSrc1.ai8[3];
14260}
14261
14262
14263/*
14264 * PMOVSXWD / VPMOVSXWD
14265 */
14266IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14267{
14268 RTUINT64U uSrc1 = { uSrc };
14269 puDst->ai32[0] = uSrc1.ai16[0];
14270 puDst->ai32[1] = uSrc1.ai16[1];
14271 puDst->ai32[2] = uSrc1.ai16[2];
14272 puDst->ai32[3] = uSrc1.ai16[3];
14273}
14274
14275
14276IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14277{
14278 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14279 puDst->ai32[0] = uSrc1.ai16[0];
14280 puDst->ai32[1] = uSrc1.ai16[1];
14281 puDst->ai32[2] = uSrc1.ai16[2];
14282 puDst->ai32[3] = uSrc1.ai16[3];
14283 puDst->ai32[4] = uSrc1.ai16[4];
14284 puDst->ai32[5] = uSrc1.ai16[5];
14285 puDst->ai32[6] = uSrc1.ai16[6];
14286 puDst->ai32[7] = uSrc1.ai16[7];
14287}
14288
14289
14290/*
14291 * PMOVSXWQ / VPMOVSXWQ
14292 */
14293IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14294{
14295 RTUINT32U uSrc1 = { uSrc };
14296 puDst->ai64[0] = uSrc1.ai16[0];
14297 puDst->ai64[1] = uSrc1.ai16[1];
14298}
14299
14300
14301IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14302{
14303 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14304 puDst->ai64[0] = uSrc1.ai16[0];
14305 puDst->ai64[1] = uSrc1.ai16[1];
14306 puDst->ai64[2] = uSrc1.ai16[2];
14307 puDst->ai64[3] = uSrc1.ai16[3];
14308}
14309
14310
14311/*
14312 * PMOVSXDQ / VPMOVSXDQ
14313 */
14314IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14315{
14316 RTUINT64U uSrc1 = { uSrc };
14317 puDst->ai64[0] = uSrc1.ai32[0];
14318 puDst->ai64[1] = uSrc1.ai32[1];
14319}
14320
14321
14322IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14323{
14324 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14325 puDst->ai64[0] = uSrc1.ai32[0];
14326 puDst->ai64[1] = uSrc1.ai32[1];
14327 puDst->ai64[2] = uSrc1.ai32[2];
14328 puDst->ai64[3] = uSrc1.ai32[3];
14329}
14330
14331
14332/*
14333 * PMOVZXBW / VPMOVZXBW
14334 */
14335IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14336{
14337 RTUINT64U uSrc1 = { uSrc };
14338 puDst->au16[0] = uSrc1.au8[0];
14339 puDst->au16[1] = uSrc1.au8[1];
14340 puDst->au16[2] = uSrc1.au8[2];
14341 puDst->au16[3] = uSrc1.au8[3];
14342 puDst->au16[4] = uSrc1.au8[4];
14343 puDst->au16[5] = uSrc1.au8[5];
14344 puDst->au16[6] = uSrc1.au8[6];
14345 puDst->au16[7] = uSrc1.au8[7];
14346}
14347
14348
14349IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14350{
14351 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14352 puDst->au16[ 0] = uSrc1.au8[ 0];
14353 puDst->au16[ 1] = uSrc1.au8[ 1];
14354 puDst->au16[ 2] = uSrc1.au8[ 2];
14355 puDst->au16[ 3] = uSrc1.au8[ 3];
14356 puDst->au16[ 4] = uSrc1.au8[ 4];
14357 puDst->au16[ 5] = uSrc1.au8[ 5];
14358 puDst->au16[ 6] = uSrc1.au8[ 6];
14359 puDst->au16[ 7] = uSrc1.au8[ 7];
14360 puDst->au16[ 8] = uSrc1.au8[ 8];
14361 puDst->au16[ 9] = uSrc1.au8[ 9];
14362 puDst->au16[10] = uSrc1.au8[10];
14363 puDst->au16[11] = uSrc1.au8[11];
14364 puDst->au16[12] = uSrc1.au8[12];
14365 puDst->au16[13] = uSrc1.au8[13];
14366 puDst->au16[14] = uSrc1.au8[14];
14367 puDst->au16[15] = uSrc1.au8[15];
14368}
14369
14370
14371/*
14372 * PMOVZXBD / VPMOVZXBD
14373 */
14374IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14375{
14376 RTUINT32U uSrc1 = { uSrc };
14377 puDst->au32[0] = uSrc1.au8[0];
14378 puDst->au32[1] = uSrc1.au8[1];
14379 puDst->au32[2] = uSrc1.au8[2];
14380 puDst->au32[3] = uSrc1.au8[3];
14381}
14382
14383
14384IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14385{
14386 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14387 puDst->au32[0] = uSrc1.au8[0];
14388 puDst->au32[1] = uSrc1.au8[1];
14389 puDst->au32[2] = uSrc1.au8[2];
14390 puDst->au32[3] = uSrc1.au8[3];
14391 puDst->au32[4] = uSrc1.au8[4];
14392 puDst->au32[5] = uSrc1.au8[5];
14393 puDst->au32[6] = uSrc1.au8[6];
14394 puDst->au32[7] = uSrc1.au8[7];
14395}
14396
14397
14398/*
14399 * PMOVZXBQ / VPMOVZXBQ
14400 */
14401IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14402{
14403 RTUINT16U uSrc1 = { uSrc };
14404 puDst->au64[0] = uSrc1.au8[0];
14405 puDst->au64[1] = uSrc1.au8[1];
14406}
14407
14408
14409IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14410{
14411 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14412 puDst->au64[0] = uSrc1.au8[0];
14413 puDst->au64[1] = uSrc1.au8[1];
14414 puDst->au64[2] = uSrc1.au8[2];
14415 puDst->au64[3] = uSrc1.au8[3];
14416}
14417
14418
14419/*
14420 * PMOVZXWD / VPMOVZXWD
14421 */
14422IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14423{
14424 RTUINT64U uSrc1 = { uSrc };
14425 puDst->au32[0] = uSrc1.au16[0];
14426 puDst->au32[1] = uSrc1.au16[1];
14427 puDst->au32[2] = uSrc1.au16[2];
14428 puDst->au32[3] = uSrc1.au16[3];
14429}
14430
14431
14432IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14433{
14434 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14435 puDst->au32[0] = uSrc1.au16[0];
14436 puDst->au32[1] = uSrc1.au16[1];
14437 puDst->au32[2] = uSrc1.au16[2];
14438 puDst->au32[3] = uSrc1.au16[3];
14439 puDst->au32[4] = uSrc1.au16[4];
14440 puDst->au32[5] = uSrc1.au16[5];
14441 puDst->au32[6] = uSrc1.au16[6];
14442 puDst->au32[7] = uSrc1.au16[7];
14443}
14444
14445
14446/*
14447 * PMOVZXWQ / VPMOVZXWQ
14448 */
14449IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14450{
14451 RTUINT32U uSrc1 = { uSrc };
14452 puDst->au64[0] = uSrc1.au16[0];
14453 puDst->au64[1] = uSrc1.au16[1];
14454}
14455
14456
14457IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14458{
14459 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14460 puDst->au64[0] = uSrc1.au16[0];
14461 puDst->au64[1] = uSrc1.au16[1];
14462 puDst->au64[2] = uSrc1.au16[2];
14463 puDst->au64[3] = uSrc1.au16[3];
14464}
14465
14466
14467/*
14468 * PMOVZXDQ / VPMOVZXDQ
14469 */
14470IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14471{
14472 RTUINT64U uSrc1 = { uSrc };
14473 puDst->au64[0] = uSrc1.au32[0];
14474 puDst->au64[1] = uSrc1.au32[1];
14475}
14476
14477
14478IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14479{
14480 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14481 puDst->au64[0] = uSrc1.au32[0];
14482 puDst->au64[1] = uSrc1.au32[1];
14483 puDst->au64[2] = uSrc1.au32[2];
14484 puDst->au64[3] = uSrc1.au32[3];
14485}
14486
14487/**
14488 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14489 * the SoftFloat 32-bit floating point format (float32_t).
14490 *
14491 * This is only a structure format conversion, nothing else.
14492 */
14493DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14494{
14495 float32_t Tmp;
14496 Tmp.v = pr32Val->u;
14497 return Tmp;
14498}
14499
14500
14501/**
14502 * Converts from SoftFloat 32-bit floating point format (float32_t)
14503 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14504 *
14505 * This is only a structure format conversion, nothing else.
14506 */
14507DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14508{
14509 pr32Dst->u = r32XSrc.v;
14510 return pr32Dst;
14511}
14512
14513
14514/**
14515 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14516 * the SoftFloat 64-bit floating point format (float64_t).
14517 *
14518 * This is only a structure format conversion, nothing else.
14519 */
14520DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14521{
14522 float64_t Tmp;
14523 Tmp.v = pr64Val->u;
14524 return Tmp;
14525}
14526
14527
14528/**
14529 * Converts from SoftFloat 64-bit floating point format (float64_t)
14530 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14531 *
14532 * This is only a structure format conversion, nothing else.
14533 */
14534DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14535{
14536 pr64Dst->u = r64XSrc.v;
14537 return pr64Dst;
14538}
14539
14540
/**
 * Initializer for the SoftFloat state structure, derived from a MXCSR value.
 *
 * The rounding mode is selected from the MXCSR.RC field and the exception mask
 * from the MXCSR exception mask bits; the remaining members are constants.
 *
 * @param   a_fMxcsr    The MXCSR value to derive the state from.
 */
#define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_fMxcsr) \
    { \
        /* Tininess detection mode: */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode, mapped from MXCSR.RC: */ \
          ((a_fMxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fMxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fMxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                            (uint8_t)softfloat_round_minMag, \
        /* Starts out as zero (presumably the accumulated exception flags): */ \
        0, \
        /* Exception mask bits shifted down; matches X86_FSW_?E: */ \
        (uint8_t)(((a_fMxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
14553
14554#ifdef IEM_WITHOUT_ASSEMBLY
14555
14556/**
14557 * Helper for transfering exception to MXCSR and setting the result value
14558 * accordingly.
14559 *
14560 * @returns Updated MXCSR.
14561 * @param pSoftState The SoftFloat state following the operation.
14562 * @param r32Result The result of the SoftFloat operation.
14563 * @param pr32Result Where to store the result for IEM.
14564 * @param fMxcsr The original MXCSR value.
14565 */
14566DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14567 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14568{
14569 iemFpSoftF32ToIprt(pr32Result, r32Result);
14570
14571 uint8_t fXcpt = pSoftState->exceptionFlags;
14572 if ( (fMxcsr & X86_MXCSR_FZ)
14573 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14574 {
14575 /* Underflow masked and flush to zero is set. */
14576 pr32Result->s.uFraction = 0;
14577 pr32Result->s.uExponent = 0;
14578 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14579 }
14580
14581 /* If DAZ is set \#DE is never set. */
14582 if ( fMxcsr & X86_MXCSR_DAZ
14583 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14584 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14585 fXcpt &= ~X86_MXCSR_DE;
14586
14587 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14588}
14589
14590
14591/**
14592 * Helper for transfering exception to MXCSR and setting the result value
14593 * accordingly - ignores Flush-to-Zero.
14594 *
14595 * @returns Updated MXCSR.
14596 * @param pSoftState The SoftFloat state following the operation.
14597 * @param r32Result The result of the SoftFloat operation.
14598 * @param pr32Result Where to store the result for IEM.
14599 * @param fMxcsr The original MXCSR value.
14600 */
14601DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14602 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14603{
14604 iemFpSoftF32ToIprt(pr32Result, r32Result);
14605
14606 uint8_t fXcpt = pSoftState->exceptionFlags;
14607 /* If DAZ is set \#DE is never set. */
14608 if ( fMxcsr & X86_MXCSR_DAZ
14609 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14610 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14611 fXcpt &= ~X86_MXCSR_DE;
14612
14613 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14614}
14615
14616
14617/**
14618 * Helper for transfering exception to MXCSR and setting the result value
14619 * accordingly.
14620 *
14621 * @returns Updated MXCSR.
14622 * @param pSoftState The SoftFloat state following the operation.
14623 * @param r64Result The result of the SoftFloat operation.
14624 * @param pr64Result Where to store the result for IEM.
14625 * @param fMxcsr The original MXCSR value.
14626 */
14627DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14628 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14629{
14630 iemFpSoftF64ToIprt(pr64Result, r64Result);
14631 uint8_t fXcpt = pSoftState->exceptionFlags;
14632 if ( (fMxcsr & X86_MXCSR_FZ)
14633 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14634 {
14635 /* Underflow masked and flush to zero is set. */
14636 iemFpSoftF64ToIprt(pr64Result, r64Result);
14637 pr64Result->s.uFractionHigh = 0;
14638 pr64Result->s.uFractionLow = 0;
14639 pr64Result->s.uExponent = 0;
14640 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14641 }
14642
14643 /* If DAZ is set \#DE is never set. */
14644 if ( fMxcsr & X86_MXCSR_DAZ
14645 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14646 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14647 fXcpt &= ~X86_MXCSR_DE;
14648
14649 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14650}
14651
14652
14653/**
14654 * Helper for transfering exception to MXCSR and setting the result value
14655 * accordingly - ignores Flush-to-Zero.
14656 *
14657 * @returns Updated MXCSR.
14658 * @param pSoftState The SoftFloat state following the operation.
14659 * @param r64Result The result of the SoftFloat operation.
14660 * @param pr64Result Where to store the result for IEM.
14661 * @param fMxcsr The original MXCSR value.
14662 */
14663DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14664 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14665{
14666 iemFpSoftF64ToIprt(pr64Result, r64Result);
14667
14668 uint8_t fXcpt = pSoftState->exceptionFlags;
14669 /* If DAZ is set \#DE is never set. */
14670 if ( fMxcsr & X86_MXCSR_DAZ
14671 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14672 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14673 fXcpt &= ~X86_MXCSR_DE;
14674
14675 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14676}
14677
14678#endif /* IEM_WITHOUT_ASSEMBLY */
14679
14680
14681/**
14682 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14683 * in MXCSR into account.
14684 *
14685 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14686 * @param pr32Val Where to store the result.
14687 * @param fMxcsr The input MXCSR value.
14688 * @param pr32Src The value to use.
14689 */
14690DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14691{
14692 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14693 {
14694 if (fMxcsr & X86_MXCSR_DAZ)
14695 {
14696 /* De-normals are changed to 0. */
14697 pr32Val->s.fSign = pr32Src->s.fSign;
14698 pr32Val->s.uFraction = 0;
14699 pr32Val->s.uExponent = 0;
14700 return 0;
14701 }
14702
14703 *pr32Val = *pr32Src;
14704 return X86_MXCSR_DE;
14705 }
14706
14707 *pr32Val = *pr32Src;
14708 return 0;
14709}
14710
14711
14712/**
14713 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14714 * in MXCSR into account.
14715 *
14716 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14717 * @param pr64Val Where to store the result.
14718 * @param fMxcsr The input MXCSR value.
14719 * @param pr64Src The value to use.
14720 */
14721DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14722{
14723 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14724 {
14725 if (fMxcsr & X86_MXCSR_DAZ)
14726 {
14727 /* De-normals are changed to 0. */
14728 pr64Val->s64.fSign = pr64Src->s.fSign;
14729 pr64Val->s64.uFraction = 0;
14730 pr64Val->s64.uExponent = 0;
14731 return 0;
14732 }
14733
14734 *pr64Val = *pr64Src;
14735 return X86_MXCSR_DE;
14736 }
14737
14738 *pr64Val = *pr64Src;
14739 return 0;
14740}
14741
14742#ifdef IEM_WITHOUT_ASSEMBLY
14743
14744/**
14745 * Validates the given input operands returning whether the operation can continue or whether one
14746 * of the source operands contains a NaN value, setting the output accordingly.
14747 *
14748 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14749 * @param pr32Res Where to store the result in case the operation can't continue.
14750 * @param pr32Val1 The first input operand.
14751 * @param pr32Val2 The second input operand.
14752 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14753 */
14754DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14755{
14756 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14757 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14758 if (cSNan + cQNan == 2)
14759 {
14760 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14761 *pr32Res = *pr32Val1;
14762 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14763 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14764 return true;
14765 }
14766 if (cSNan)
14767 {
14768 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14769 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14770 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14771 *pfMxcsr |= X86_MXCSR_IE;
14772 return true;
14773 }
14774 if (cQNan)
14775 {
14776 /* The QNan operand is placed into the result. */
14777 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14778 return true;
14779 }
14780
14781 Assert(!cQNan && !cSNan);
14782 return false;
14783}
14784
14785
14786/**
14787 * Validates the given double precision input operands returning whether the operation can continue or whether one
14788 * of the source operands contains a NaN value, setting the output accordingly.
14789 *
14790 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14791 * @param pr64Res Where to store the result in case the operation can't continue.
14792 * @param pr64Val1 The first input operand.
14793 * @param pr64Val2 The second input operand.
14794 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14795 */
14796DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14797{
14798 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14799 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14800 if (cSNan + cQNan == 2)
14801 {
14802 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14803 *pr64Res = *pr64Val1;
14804 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14805 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14806 return true;
14807 }
14808 if (cSNan)
14809 {
14810 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14811 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14812 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14813 *pfMxcsr |= X86_MXCSR_IE;
14814 return true;
14815 }
14816 if (cQNan)
14817 {
14818 /* The QNan operand is placed into the result. */
14819 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14820 return true;
14821 }
14822
14823 Assert(!cQNan && !cSNan);
14824 return false;
14825}
14826
14827
14828/**
14829 * Validates the given single input operand returning whether the operation can continue or whether
14830 * contains a NaN value, setting the output accordingly.
14831 *
14832 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14833 * @param pr32Res Where to store the result in case the operation can't continue.
14834 * @param pr32Val The input operand.
14835 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14836 */
14837DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14838{
14839 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14840 {
14841 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14842 *pr32Res = *pr32Val;
14843 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14844 *pfMxcsr |= X86_MXCSR_IE;
14845 return true;
14846 }
14847 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14848 {
14849 /* The QNan operand is placed into the result. */
14850 *pr32Res = *pr32Val;
14851 return true;
14852 }
14853
14854 return false;
14855}
14856
14857
14858/**
14859 * Validates the given double input operand returning whether the operation can continue or whether
14860 * contains a NaN value, setting the output accordingly.
14861 *
14862 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14863 * @param pr64Res Where to store the result in case the operation can't continue.
14864 * @param pr64Val The input operand.
14865 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14866 */
14867DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14868{
14869 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14870 {
14871 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14872 *pr64Res = *pr64Val;
14873 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14874 *pfMxcsr |= X86_MXCSR_IE;
14875 return true;
14876 }
14877 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14878 {
14879 /* The QNan operand is placed into the result. */
14880 *pr64Res = *pr64Val;
14881 return true;
14882 }
14883
14884 return false;
14885}
14886
14887#endif /* IEM_WITHOUT_ASSEMBLY */
14888
14889/**
14890 * ADDPS
14891 */
14892#ifdef IEM_WITHOUT_ASSEMBLY
14893static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14894{
14895 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14896 return fMxcsr;
14897
14898 RTFLOAT32U r32Src1, r32Src2;
14899 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14900 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14901 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14902 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14903 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14904}
14905
14906
14907IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14908{
14909 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14910 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14911 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14912 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14913}
14914#endif
14915
14916
14917/**
14918 * ADDSS
14919 */
14920#ifdef IEM_WITHOUT_ASSEMBLY
14921IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14922{
14923 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14924 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14925 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14926 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14927}
14928#endif
14929
14930
14931/**
14932 * ADDPD
14933 */
14934#ifdef IEM_WITHOUT_ASSEMBLY
14935static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14936{
14937 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14938 return fMxcsr;
14939
14940 RTFLOAT64U r64Src1, r64Src2;
14941 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14942 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14943 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14944 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14945 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14946}
14947
14948
14949IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14950{
14951 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14952 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14953}
14954#endif
14955
14956
14957/**
14958 * ADDSD
14959 */
14960#ifdef IEM_WITHOUT_ASSEMBLY
14961IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14962{
14963 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14964 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14965}
14966#endif
14967
14968
14969/**
14970 * MULPS
14971 */
14972#ifdef IEM_WITHOUT_ASSEMBLY
14973static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14974{
14975 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14976 return fMxcsr;
14977
14978 RTFLOAT32U r32Src1, r32Src2;
14979 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14980 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14981 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14982 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14983 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14984}
14985
14986
14987IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14988{
14989 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14990 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14991 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14992 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14993}
14994#endif
14995
14996
14997/**
14998 * MULSS
14999 */
15000#ifdef IEM_WITHOUT_ASSEMBLY
15001IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15002{
15003 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15004 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15005 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15006 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15007}
15008#endif
15009
15010
15011/**
15012 * MULPD
15013 */
15014#ifdef IEM_WITHOUT_ASSEMBLY
15015static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15016{
15017 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15018 return fMxcsr;
15019
15020 RTFLOAT64U r64Src1, r64Src2;
15021 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15022 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15023 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15024 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15025 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15026}
15027
15028
15029IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15030{
15031 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15032 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15033}
15034#endif
15035
15036
15037/**
15038 * MULSD
15039 */
15040#ifdef IEM_WITHOUT_ASSEMBLY
15041IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15042{
15043 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15044 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15045}
15046#endif
15047
15048
15049/**
15050 * SUBPS
15051 */
15052#ifdef IEM_WITHOUT_ASSEMBLY
15053static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15054{
15055 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15056 return fMxcsr;
15057
15058 RTFLOAT32U r32Src1, r32Src2;
15059 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15060 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15061 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15062 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15063 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15064}
15065
15066
15067IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15068{
15069 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15070 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15071 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15072 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15073}
15074#endif
15075
15076
15077/**
15078 * SUBSS
15079 */
15080#ifdef IEM_WITHOUT_ASSEMBLY
15081IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15082{
15083 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15084 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15085 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15086 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15087}
15088#endif
15089
15090
15091/**
15092 * SUBPD
15093 */
15094#ifdef IEM_WITHOUT_ASSEMBLY
15095static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15096{
15097 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15098 return fMxcsr;
15099
15100 RTFLOAT64U r64Src1, r64Src2;
15101 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15102 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15103 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15104 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15105 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15106}
15107
15108
15109IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15110{
15111 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15112 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15113}
15114#endif
15115
15116
15117/**
15118 * SUBSD
15119 */
15120#ifdef IEM_WITHOUT_ASSEMBLY
15121IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15122{
15123 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15124 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15125}
15126#endif
15127
15128
15129/**
15130 * MINPS
15131 */
15132#ifdef IEM_WITHOUT_ASSEMBLY
15133static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15134{
15135 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15136 {
15137 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15138 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15139 return fMxcsr | X86_MXCSR_IE;
15140 }
15141
15142 RTFLOAT32U r32Src1, r32Src2;
15143 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15144 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15145 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15146 {
15147 *pr32Res = r32Src2;
15148 return fMxcsr;
15149 }
15150
15151 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15152 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15153 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15154 fLe
15155 ? iemFpSoftF32FromIprt(&r32Src1)
15156 : iemFpSoftF32FromIprt(&r32Src2),
15157 pr32Res, fMxcsr);
15158}
15159
15160
15161IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15162{
15163 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15164 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15165 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15166 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15167}
15168#endif
15169
15170
15171/**
15172 * MINSS
15173 */
15174#ifdef IEM_WITHOUT_ASSEMBLY
15175IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15176{
15177 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15178 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15179 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15180 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15181}
15182#endif
15183
15184
15185/**
15186 * MINPD
15187 */
15188#ifdef IEM_WITHOUT_ASSEMBLY
15189static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15190{
15191 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15192 {
15193 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15194 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15195 return fMxcsr | X86_MXCSR_IE;
15196 }
15197
15198 RTFLOAT64U r64Src1, r64Src2;
15199 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15200 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15201 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15202 {
15203 *pr64Res = r64Src2;
15204 return fMxcsr;
15205 }
15206
15207 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15208 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15209 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15210 fLe
15211 ? iemFpSoftF64FromIprt(&r64Src1)
15212 : iemFpSoftF64FromIprt(&r64Src2),
15213 pr64Res, fMxcsr);
15214}
15215
15216
15217IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15218{
15219 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15220 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15221}
15222#endif
15223
15224
15225/**
15226 * MINSD
15227 */
15228#ifdef IEM_WITHOUT_ASSEMBLY
15229IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15230{
15231 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15232 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15233}
15234#endif
15235
15236
15237/**
15238 * DIVPS
15239 */
15240#ifdef IEM_WITHOUT_ASSEMBLY
15241static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15242{
15243 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15244 return fMxcsr;
15245
15246 RTFLOAT32U r32Src1, r32Src2;
15247 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15248 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15249 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15250 {
15251 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15252 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15253 {
15254 *pr32Res = g_ar32QNaN[1];
15255 return fMxcsr | X86_MXCSR_IE;
15256 }
15257 else if (RTFLOAT32U_IS_INF(&r32Src1))
15258 {
15259 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15260 return fMxcsr;
15261 }
15262 else
15263 {
15264 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15265 return fMxcsr | X86_MXCSR_ZE;
15266 }
15267 }
15268
15269 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15270 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15271 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15272}
15273
15274
15275IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15276{
15277 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15278 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15279 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15280 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15281}
15282#endif
15283
15284
15285/**
15286 * DIVSS
15287 */
15288#ifdef IEM_WITHOUT_ASSEMBLY
15289IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15290{
15291 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15292 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15293 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15294 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15295}
15296#endif
15297
15298
15299/**
15300 * DIVPD
15301 */
15302#ifdef IEM_WITHOUT_ASSEMBLY
15303static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15304{
15305 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15306 return fMxcsr;
15307
15308 RTFLOAT64U r64Src1, r64Src2;
15309 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15310 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15311 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15312 {
15313 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15314 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15315 {
15316 *pr64Res = g_ar64QNaN[1];
15317 return fMxcsr | X86_MXCSR_IE;
15318 }
15319 else if (RTFLOAT64U_IS_INF(&r64Src1))
15320 {
15321 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15322 return fMxcsr;
15323 }
15324 else
15325 {
15326 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15327 return fMxcsr | X86_MXCSR_ZE;
15328 }
15329 }
15330
15331 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15332 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15333 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15334}
15335
15336
15337IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15338{
15339 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15340 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15341}
15342#endif
15343
15344
15345/**
15346 * DIVSD
15347 */
15348#ifdef IEM_WITHOUT_ASSEMBLY
15349IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15350{
15351 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15352 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15353}
15354#endif
15355
15356
15357/**
15358 * MAXPS
15359 */
15360#ifdef IEM_WITHOUT_ASSEMBLY
15361static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15362{
15363 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15364 {
15365 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15366 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15367 return fMxcsr | X86_MXCSR_IE;
15368 }
15369
15370 RTFLOAT32U r32Src1, r32Src2;
15371 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15372 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15373 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15374 {
15375 *pr32Res = r32Src2;
15376 return fMxcsr;
15377 }
15378
15379 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15380 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15381 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15382 fLe
15383 ? iemFpSoftF32FromIprt(&r32Src2)
15384 : iemFpSoftF32FromIprt(&r32Src1),
15385 pr32Res, fMxcsr);
15386}
15387
15388
15389IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15390{
15391 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15392 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15393 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15394 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15395}
15396#endif
15397
15398
15399/**
15400 * MAXSS
15401 */
15402#ifdef IEM_WITHOUT_ASSEMBLY
15403IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15404{
15405 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15406 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15407 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15408 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15409}
15410#endif
15411
15412
15413/**
15414 * MAXPD
15415 */
15416#ifdef IEM_WITHOUT_ASSEMBLY
15417static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15418{
15419 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15420 {
15421 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15422 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15423 return fMxcsr | X86_MXCSR_IE;
15424 }
15425
15426 RTFLOAT64U r64Src1, r64Src2;
15427 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15428 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15429 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15430 {
15431 *pr64Res = r64Src2;
15432 return fMxcsr;
15433 }
15434
15435 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15436 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15437 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15438 fLe
15439 ? iemFpSoftF64FromIprt(&r64Src2)
15440 : iemFpSoftF64FromIprt(&r64Src1),
15441 pr64Res, fMxcsr);
15442}
15443
15444
15445IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15446{
15447 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15448 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15449}
15450#endif
15451
15452
15453/**
15454 * MAXSD
15455 */
15456#ifdef IEM_WITHOUT_ASSEMBLY
15457IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15458{
15459 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15460 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15461}
15462#endif
15463
15464
15465/**
15466 * CVTSS2SD
15467 */
15468#ifdef IEM_WITHOUT_ASSEMBLY
15469static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15470{
15471 RTFLOAT32U r32Src1;
15472 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15473
15474 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15475 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15476 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15477}
15478
15479
15480IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15481{
15482 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15483 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15484}
15485#endif
15486
15487
15488/**
15489 * CVTSD2SS
15490 */
15491#ifdef IEM_WITHOUT_ASSEMBLY
15492static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15493{
15494 RTFLOAT64U r64Src1;
15495 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15496
15497 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15498 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15499 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15500}
15501
15502
15503IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15504{
15505 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15506 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15507 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15508 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15509}
15510#endif
15511
15512
15513/**
15514 * HADDPS
15515 */
15516#ifdef IEM_WITHOUT_ASSEMBLY
15517IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15518{
15519 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15520 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15521 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15522 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15523}
15524#endif
15525
15526
15527/**
15528 * HADDPD
15529 */
15530#ifdef IEM_WITHOUT_ASSEMBLY
15531IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15532{
15533 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15534 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15535}
15536#endif
15537
15538
15539/**
15540 * HSUBPS
15541 */
15542#ifdef IEM_WITHOUT_ASSEMBLY
15543IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15544{
15545 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15546 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15547 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15548 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15549}
15550#endif
15551
15552
15553/**
15554 * HSUBPD
15555 */
15556#ifdef IEM_WITHOUT_ASSEMBLY
15557IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15558{
15559 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15560 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15561}
15562#endif
15563
15564
15565/**
15566 * SQRTPS
15567 */
15568#ifdef IEM_WITHOUT_ASSEMBLY
15569static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15570{
15571 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15572 return fMxcsr;
15573
15574 RTFLOAT32U r32Src;
15575 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15576 if (RTFLOAT32U_IS_ZERO(&r32Src))
15577 {
15578 *pr32Res = r32Src;
15579 return fMxcsr;
15580 }
15581 else if (r32Src.s.fSign)
15582 {
15583 *pr32Res = g_ar32QNaN[1];
15584 return fMxcsr | X86_MXCSR_IE;
15585 }
15586
15587 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15588 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15589 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15590}
15591
15592
15593IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15594{
15595 RT_NOREF(puSrc1);
15596
15597 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15598 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15599 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15600 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15601}
15602#endif
15603
15604
15605/**
15606 * SQRTSS
15607 */
15608#ifdef IEM_WITHOUT_ASSEMBLY
15609IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15610{
15611 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15612 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15613 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15614 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15615}
15616#endif
15617
15618
15619/**
15620 * SQRTPD
15621 */
15622#ifdef IEM_WITHOUT_ASSEMBLY
15623static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15624{
15625 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15626 return fMxcsr;
15627
15628 RTFLOAT64U r64Src;
15629 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15630 if (RTFLOAT64U_IS_ZERO(&r64Src))
15631 {
15632 *pr64Res = r64Src;
15633 return fMxcsr;
15634 }
15635 else if (r64Src.s.fSign)
15636 {
15637 *pr64Res = g_ar64QNaN[1];
15638 return fMxcsr | X86_MXCSR_IE;
15639 }
15640
15641 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15642 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15643 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15644}
15645
15646
15647IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15648{
15649 RT_NOREF(puSrc1);
15650
15651 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15652 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15653}
15654#endif
15655
15656
15657/**
15658 * SQRTSD
15659 */
15660#ifdef IEM_WITHOUT_ASSEMBLY
15661IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15662{
15663 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15664 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15665}
15666#endif
15667
15668
15669#ifdef IEM_WITHOUT_ASSEMBLY
15670/**
15671 * RSQRTPS
15672 */
15673static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15674{
15675 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15676 return fMxcsr;
15677
15678 RTFLOAT32U r32Src;
15679 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
15680 if (RTFLOAT32U_IS_ZERO(&r32Src))
15681 {
15682 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
15683 return fMxcsr;
15684 }
15685 else if (r32Src.s.fSign)
15686 {
15687 *pr32Res = g_ar32QNaN[1];
15688 return fMxcsr | X86_MXCSR_IE;
15689 }
15690
15691 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15692 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15693 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15694}
15695
15696
15697IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15698{
15699 RT_NOREF(puSrc1);
15700
15701 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15702 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15703 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15704 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15705}
15706
15707
15708/**
15709 * RSQRTSS
15710 */
15711IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15712{
15713 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15714 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15715 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15716 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15717}
15718#endif
15719
15720
15721/**
15722 * ADDSUBPS
15723 */
15724#ifdef IEM_WITHOUT_ASSEMBLY
15725IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15726{
15727 RT_NOREF(puSrc1);
15728
15729 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15730 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15731 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15732 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15733}
15734#endif
15735
15736
15737/**
15738 * ADDSUBPD
15739 */
15740#ifdef IEM_WITHOUT_ASSEMBLY
15741IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15742{
15743 RT_NOREF(puSrc1);
15744
15745 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15746 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15747}
15748#endif
15749
15750
15751/**
15752 * CVTPD2PS
15753 */
15754#ifdef IEM_WITHOUT_ASSEMBLY
15755static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15756{
15757 RTFLOAT64U r64Src1;
15758 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15759
15760 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15761 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15762 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15763}
15764
15765
15766IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15767{
15768 RT_NOREF(puSrc1);
15769
15770 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15771 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15772 pResult->uResult.au32[2] = 0;
15773 pResult->uResult.au32[3] = 0;
15774}
15775#endif
15776
15777
15778/**
15779 * CVTPS2PD
15780 */
15781#ifdef IEM_WITHOUT_ASSEMBLY
15782static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15783{
15784 RTFLOAT32U r32Src1;
15785 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15786
15787 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15788 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15789 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15790}
15791
15792
15793IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15794{
15795 RT_NOREF(puSrc1);
15796
15797 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15798 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15799}
15800#endif
15801
15802
15803/**
15804 * CVTDQ2PS
15805 */
15806#ifdef IEM_WITHOUT_ASSEMBLY
15807static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15808{
15809 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15810 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15811 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15812}
15813
15814
15815IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15816{
15817 RT_NOREF(puSrc1);
15818
15819 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15820 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15821 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
15822 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
15823}
15824#endif
15825
15826
15827/**
15828 * CVTPS2DQ
15829 */
15830#ifdef IEM_WITHOUT_ASSEMBLY
15831static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15832{
15833 RTFLOAT32U r32Src;
15834 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15835
15836 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15837 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15838 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15839}
15840
15841
15842IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15843{
15844 RT_NOREF(puSrc1);
15845
15846 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15847 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15848 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15849 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15850}
15851#endif
15852
15853
15854/**
15855 * CVTTPS2DQ
15856 */
15857#ifdef IEM_WITHOUT_ASSEMBLY
15858static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15859{
15860 RTFLOAT32U r32Src;
15861 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15862
15863 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15864 SoftState.roundingMode = softfloat_round_minMag;
15865 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
15866 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15867}
15868
15869
15870IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15871{
15872 RT_NOREF(puSrc1);
15873
15874 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15875 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15876 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15877 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15878}
15879#endif
15880
15881
15882/**
15883 * CVTTPD2DQ
15884 */
15885#ifdef IEM_WITHOUT_ASSEMBLY
15886static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15887{
15888 RTFLOAT64U r64Src;
15889 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15890
15891 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15892 SoftState.roundingMode = softfloat_round_minMag;
15893 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15894 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15895}
15896
15897
15898IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15899{
15900 RT_NOREF(puSrc1);
15901
15902 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15903 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15904 pResult->uResult.au64[1] = 0;
15905}
15906#endif
15907
15908
15909/**
15910 * CVTDQ2PD
15911 */
15912#ifdef IEM_WITHOUT_ASSEMBLY
15913static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
15914{
15915 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15916 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
15917 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15918}
15919
15920
15921IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15922{
15923 RT_NOREF(puSrc1);
15924
15925 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15926 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15927}
15928#endif
15929
15930
15931/**
15932 * CVTPD2DQ
15933 */
15934#ifdef IEM_WITHOUT_ASSEMBLY
15935static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15936{
15937 RTFLOAT64U r64Src;
15938 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15939
15940 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15941 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15942 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15943}
15944
15945
15946IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15947{
15948 RT_NOREF(puSrc1);
15949
15950 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15951 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15952 pResult->uResult.au64[1] = 0;
15953}
15954#endif
15955
15956
15957/**
15958 * [V]SHUFPS
15959 */
15960#ifdef IEM_WITHOUT_ASSEMBLY
15961IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15962{
15963 RTUINT128U const uSrc1 = *puDst;
15964 RTUINT128U const uSrc2 = *puSrc;
15965 ASMCompilerBarrier();
15966 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15967 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15968 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15969 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15970}
15971#endif
15972
15973
15974IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15975{
15976 RTUINT128U const uSrc1 = *puSrc1;
15977 RTUINT128U const uSrc2 = *puSrc2;
15978 ASMCompilerBarrier();
15979 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15980 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15981 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15982 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15983}
15984
15985
15986IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15987{
15988 RTUINT256U const uSrc1 = *puSrc1;
15989 RTUINT256U const uSrc2 = *puSrc2;
15990 ASMCompilerBarrier();
15991 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15992 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15993 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15994 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15995
15996 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15997 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15998 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15999 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16000}
16001
16002
16003/**
16004 * [V]SHUFPD
16005 */
16006#ifdef IEM_WITHOUT_ASSEMBLY
16007IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16008{
16009 RTUINT128U const uSrc1 = *puDst;
16010 RTUINT128U const uSrc2 = *puSrc;
16011 ASMCompilerBarrier();
16012 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16013 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16014}
16015#endif
16016
16017
16018IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16019{
16020 RTUINT128U const uSrc1 = *puSrc1;
16021 RTUINT128U const uSrc2 = *puSrc2;
16022 ASMCompilerBarrier();
16023 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16024 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16025}
16026
16027
16028IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16029{
16030 RTUINT256U const uSrc1 = *puSrc1;
16031 RTUINT256U const uSrc2 = *puSrc2;
16032 ASMCompilerBarrier();
16033 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16034 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16035 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16036 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16037}
16038
16039
16040/*
16041 * PHMINPOSUW / VPHMINPOSUW
16042 */
16043IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16044{
16045 uint16_t u16Min = puSrc->au16[0];
16046 uint8_t idxMin = 0;
16047
16048 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16049 if (puSrc->au16[i] < u16Min)
16050 {
16051 u16Min = puSrc->au16[i];
16052 idxMin = i;
16053 }
16054
16055 puDst->au64[0] = 0;
16056 puDst->au64[1] = 0;
16057 puDst->au16[0] = u16Min;
16058 puDst->au16[1] = idxMin;
16059}
16060
16061
16062IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16063{
16064 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16065}
16066
16067
16068/*
16069 * [V]PBLENDVB
16070 */
16071IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16072{
16073 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16074 if (puMask->au8[i] & RT_BIT(7))
16075 puDst->au8[i] = puSrc->au8[i];
16076}
16077
16078
16079IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16080{
16081 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16082 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16083}
16084
16085
16086IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16087{
16088 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16089 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16090}
16091
16092
16093/*
16094 * [V]BLENDVPS
16095 */
16096IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16097{
16098 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16099 if (puMask->au32[i] & RT_BIT_32(31))
16100 puDst->au32[i] = puSrc->au32[i];
16101}
16102
16103
16104IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16105{
16106 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16107 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16108}
16109
16110
16111IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16112{
16113 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16114 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16115}
16116
16117
16118/*
16119 * [V]BLENDVPD
16120 */
16121IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16122{
16123 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
16124 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
16125}
16126
16127
16128IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16129{
16130 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16131 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16132}
16133
16134
16135IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16136{
16137 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16138 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16139}
16140
16141
16142/**
16143 * [V]PALIGNR
16144 */
16145IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
16146{
16147 uint64_t const u64Src1 = *pu64Dst;
16148 ASMCompilerBarrier();
16149
16150 if (bEvil >= 16)
16151 *pu64Dst = 0;
16152 else if (bEvil >= 8)
16153 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16154 else
16155 {
16156 uint8_t cShift = bEvil * 8;
16157 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16158 | (u64Src2 >> cShift);
16159 }
16160}
16161
16162
16163IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16164{
16165 RTUINT128U const uSrc1 = *puDst;
16166 RTUINT128U const uSrc2 = *puSrc;
16167 ASMCompilerBarrier();
16168
16169 puDst->au64[0] = 0;
16170 puDst->au64[1] = 0;
16171 if (bEvil >= 32)
16172 { /* Everything stays 0. */ }
16173 else if (bEvil >= 16)
16174 {
16175 bEvil -= 16;
16176 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16177 puDst->au8[i - bEvil] = uSrc1.au8[i];
16178 }
16179 else
16180 {
16181 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16182 puDst->au8[i] = uSrc2.au8[i + bEvil];
16183 for (uint8_t i = 0; i < bEvil; i++)
16184 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16185 }
16186}
16187
16188
16189IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16190{
16191 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16192 RTUINT128U const uSrc2 = *puSrc2;
16193 ASMCompilerBarrier();
16194
16195 puDst->au64[0] = 0;
16196 puDst->au64[1] = 0;
16197 if (bEvil >= 32)
16198 { /* Everything stays 0. */ }
16199 else if (bEvil >= 16)
16200 {
16201 bEvil -= 16;
16202 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16203 puDst->au8[i - bEvil] = uSrc1.au8[i];
16204 }
16205 else
16206 {
16207 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16208 puDst->au8[i] = uSrc2.au8[i + bEvil];
16209 for (uint8_t i = 0; i < bEvil; i++)
16210 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16211 }
16212}
16213
16214
16215IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16216{
16217 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16218 RTUINT256U const uSrc2 = *puSrc2;
16219 ASMCompilerBarrier();
16220
16221 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16222 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16223}
16224
16225
16226/**
16227 * [V]PBLENDW
16228 */
16229IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16230{
16231 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16232 if (bEvil & RT_BIT(i))
16233 puDst->au16[i] = puSrc->au16[i];
16234}
16235
16236
16237IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16238{
16239 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16240 if (bEvil & RT_BIT(i))
16241 puDst->au16[i] = puSrc2->au16[i];
16242 else
16243 puDst->au16[i] = puSrc1->au16[i];
16244}
16245
16246
16247IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16248{
16249 for (uint8_t i = 0; i < 8; i++)
16250 if (bEvil & RT_BIT(i))
16251 {
16252 puDst->au16[ i] = puSrc2->au16[ i];
16253 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16254 }
16255 else
16256 {
16257 puDst->au16[ i] = puSrc1->au16[ i];
16258 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16259 }
16260}
16261
16262
16263/**
16264 * [V]BLENDPS
16265 */
16266IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16267{
16268 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16269 if (bEvil & RT_BIT(i))
16270 puDst->au32[i] = puSrc->au32[i];
16271}
16272
16273
16274IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16275{
16276 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16277 if (bEvil & RT_BIT(i))
16278 puDst->au32[i] = puSrc2->au32[i];
16279 else
16280 puDst->au32[i] = puSrc1->au32[i];
16281}
16282
16283
16284IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16285{
16286 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16287 if (bEvil & RT_BIT(i))
16288 puDst->au32[i] = puSrc2->au32[i];
16289 else
16290 puDst->au32[i] = puSrc1->au32[i];
16291}
16292
16293
16294/**
16295 * [V]BLENDPD
16296 */
16297IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16298{
16299 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16300 if (bEvil & RT_BIT(i))
16301 puDst->au64[i] = puSrc->au64[i];
16302}
16303
16304
16305IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16306{
16307 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16308 if (bEvil & RT_BIT(i))
16309 puDst->au64[i] = puSrc2->au64[i];
16310 else
16311 puDst->au64[i] = puSrc1->au64[i];
16312}
16313
16314
16315IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16316{
16317 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16318 if (bEvil & RT_BIT(i))
16319 puDst->au64[i] = puSrc2->au64[i];
16320 else
16321 puDst->au64[i] = puSrc1->au64[i];
16322}
16323
16324
16325/**
16326 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16327 */
16328
/* The S-Box lookup table (256 entries; the row comment gives the index of the row's first entry). */
static uint8_t iemAImpl_aes_sbox[] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, /* 0x00 */
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, /* 0x10 */
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, /* 0x20 */
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, /* 0x30 */
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, /* 0x40 */
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, /* 0x50 */
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, /* 0x60 */
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, /* 0x70 */
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, /* 0x80 */
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, /* 0x90 */
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, /* 0xa0 */
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, /* 0xb0 */
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, /* 0xc0 */
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, /* 0xd0 */
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, /* 0xe0 */
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16  /* 0xf0 */
};
16347
16348/* The InvS-Box lookup table. */
/* 256 entries; the row comment gives the index of the row's first entry. */
static uint8_t iemAImpl_aes_inv_sbox[] = {
    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, /* 0x00 */
    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, /* 0x10 */
    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, /* 0x20 */
    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, /* 0x30 */
    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, /* 0x40 */
    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, /* 0x50 */
    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, /* 0x60 */
    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, /* 0x70 */
    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, /* 0x80 */
    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, /* 0x90 */
    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, /* 0xa0 */
    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, /* 0xb0 */
    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, /* 0xc0 */
    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, /* 0xd0 */
    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, /* 0xe0 */
    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d  /* 0xf0 */
};
16367
16368/* The ShiftRows lookup table. */
/* Entry i is the index of the source byte that ends up in result byte i. */
static uint8_t iemAImpl_aes_shift_rows_tbl[] = {
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
16372
16373/* The InvShiftRows lookup table. */
/* Entry i is the index of the source byte that ends up in result byte i. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] = {
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
16377
16378static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16379{
16380 RTUINT128U uVal;
16381 int i;
16382
16383 for (i = 0; i < 16; ++i)
16384 uVal.au8[i] = abSubst[puSrc->au8[i]];
16385
16386 return uVal;
16387}
16388
/**
 * Doubles @a u in the AES finite field: shifts left by one and, when bit 7
 * of the input was set, XORs in 0x1b (27) to reduce the result.
 *
 * @returns The low 8 bits of the result.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bReduce = (u & 0x80) ? 0x1b : 0x00;
    return (uint8_t)((u << 1) ^ bReduce);
}
16393
16394static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16395{
16396 RTUINT128U uVal;
16397 int i;
16398 uint8_t tmp;
16399
16400 for (i = 0; i < 16; i += 4) {
16401 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16402 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16403 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16404 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16405 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16406 }
16407
16408 return uVal;
16409}
16410
16411static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16412{
16413 RTUINT128U uVal;
16414 int i;
16415
16416 for (i = 0; i < 16; ++i)
16417 uVal.au8[i] = puSrc->au8[abShift[i]];
16418
16419 return uVal;
16420}
16421
16422static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16423{
16424 uint8_t val;
16425
16426 val = ((b >> 0) & 1) * a;
16427 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16428 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16429 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16430 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16431
16432 return val;
16433}
16434
16435static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16436{
16437 RTUINT128U uVal;
16438 int i;
16439
16440 for (i = 0; i < 16; i += 4) {
16441 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16442 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16443 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16444 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16445 }
16446
16447 return uVal;
16448}
16449
16450static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16451{
16452 RTUINT32U uTmp;
16453
16454 uTmp.au32[0] = w;
16455 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16456 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16457 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16458 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16459
16460 return uTmp.au32[0];
16461}
16462
/**
 * Rotates a 32-bit word right by eight bits, i.e. the lowest byte becomes the
 * highest one.
 *
 * @returns The rotated word.
 * @param   w           The word to rotate.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByte = w & UINT32_C(0xff);
    uint32_t const uRest    = w >> 8;
    return (uLowByte << 24) | uRest;
}
16467
16468/**
16469 * [V]AESKEYGENASSIST
16470 */
16471IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16472{
16473 RTUINT128U uTmp;
16474 uint32_t uRCon = bImm; /* Round constant. */
16475
16476 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16477 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16478 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16479 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16480
16481 *puDst = uTmp;
16482}
16483
16484
16485/**
16486 * [V]AESIMC
16487 */
16488IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16489{
16490 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16491}
16492
16493
16494/**
16495 * [V]AESENC
16496 */
16497IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16498{
16499 RTUINT128U uTmp;
16500
16501 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16502 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16503 uTmp = iemAImpl_aes_mix_col(&uTmp);
16504 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16505 uTmp.au64[1] ^= puSrc->au64[1];
16506
16507 *puDst = uTmp;
16508}
16509
16510
16511/**
16512 * [V]AESENCLAST
16513 */
16514IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16515{
16516 RTUINT128U uTmp;
16517
16518 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16519 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16520 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16521 uTmp.au64[1] ^= puSrc->au64[1];
16522
16523 *puDst = uTmp;
16524}
16525
16526
16527/**
16528 * [V]AESDEC
16529 */
16530IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16531{
16532 RTUINT128U uTmp;
16533
16534 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16535 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16536 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16537 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16538 uTmp.au64[1] ^= puSrc->au64[1];
16539
16540 *puDst = uTmp;
16541}
16542
16543
16544/**
16545 * [V]AESDECLAST
16546 */
16547IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16548{
16549 RTUINT128U uTmp;
16550
16551 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16552 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16553 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16554 uTmp.au64[1] ^= puSrc->au64[1];
16555
16556 *puDst = uTmp;
16557}
16558
16559
16560/**
16561 * [V]PCMPISTRI
16562 */
16563
16564/**
16565 * Does the comparisons based on the mode and source input format.
16566 */
16567static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
16568{
16569#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
16570 do \
16571 { \
16572 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
16573 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
16574 { \
16575 switch (a_bAggOp) \
16576 { \
16577 case 0: \
16578 case 2: \
16579 case 3: \
16580 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16581 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16582 break; \
16583 case 1: \
16584 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16585 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16586 break; \
16587 default: \
16588 AssertReleaseFailed(); \
16589 } \
16590 } \
16591 } while(0)
16592
16593 uint8_t bAggOp = (bImm >> 2) & 0x3;
16594 switch (bImm & 0x3)
16595 {
16596 case 0:
16597 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
16598 break;
16599 case 1:
16600 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
16601 break;
16602 case 2:
16603 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
16604 break;
16605 case 3:
16606 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
16607 break;
16608 default:
16609 AssertReleaseFailed();
16610 }
16611#undef PCMPXSTRX_CMP_CASE
16612}
16613
16614static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
16615{
16616 if (bImm & 0x1)
16617 {
16618 /* Words -> 8 elements. */
16619 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
16620 if (puSrc->au16[i] == 0)
16621 return i;
16622
16623 return 8;
16624 }
16625 else
16626 {
16627 /* Bytes -> 16 elements. */
16628 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
16629 if (puSrc->au8[i] == 0)
16630 return i;
16631
16632 return 16;
16633 }
16634}
16635
16636static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
16637{
16638 if (bImm & 0x1)
16639 {
16640 if (i64Len > -8 && i64Len < 8)
16641 return RT_ABS(i64Len);
16642
16643 return 8;
16644 }
16645 else
16646 {
16647 if (i64Len > -16 && i64Len < 16)
16648 return RT_ABS(i64Len);
16649
16650 return 16;
16651 }
16652}
16653
16654/**
16655 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
16656 */
16657static const bool g_afCmpOverride[4][3] =
16658{
16659 /* xmm1 AND xmm2/m128 invalid xmm1 invalid, xmm2/m128 valid xmm1 valid, xmm2/m128 invalid */
16660 { false, false, false }, /* Imm8[3:2] = 00b (equal any) */
16661 { false, false, false }, /* Imm8[3:2] = 01b (ranges) */
16662 { true, false, false }, /* Imm8[3:2] = 10b (equal each) */
16663 { true, true, false }, /* Imm8[3:2] = 11b (equal ordered) */
16664};
16665
16666DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
16667{
16668 if (fSrc1Valid && fSrc2Valid)
16669 return fCmpRes;
16670
16671 uint8_t bSrc1Valid = fSrc1Valid ? 2 : 0;
16672 uint8_t bSrc2Valid = fSrc2Valid ? 1 : 0;
16673 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
16674}
16675
16676static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
16677{
16678 uint8_t bAggOp = (bImm >> 2) & 0x3;
16679 uint16_t u16Result = 0;
16680
16681 switch (bAggOp)
16682 {
16683 case 0: /* Equal any */
16684 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16685 {
16686 uint16_t u16Res = 0;
16687 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
16688 {
16689 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16690 idxSrc1 < idxLen1,
16691 idxSrc2 < idxLen2,
16692 bAggOp))
16693 {
16694 u16Res = RT_BIT(idxSrc2);
16695 break;
16696 }
16697 }
16698
16699 u16Result |= u16Res;
16700 }
16701 break;
16702
16703 case 1: /* Ranges */
16704 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16705 {
16706 uint16_t u16Res = 0;
16707 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
16708 {
16709 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16710 idxSrc1 < idxLen1,
16711 idxSrc2 < idxLen2,
16712 bAggOp)
16713 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
16714 (idxSrc1 + 1) < idxLen1,
16715 idxSrc2 < idxLen2,
16716 bAggOp))
16717 {
16718 u16Res = RT_BIT(idxSrc2);
16719 break;
16720 }
16721 }
16722
16723 u16Result |= u16Res;
16724 }
16725 break;
16726
16727 case 2: /* Equal each */
16728 for (uint8_t i = 0; i < cElems; i++)
16729 {
16730 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
16731 i < idxLen1,
16732 i < idxLen2,
16733 bAggOp))
16734 u16Result |= RT_BIT(i);
16735 }
16736 break;
16737
16738 case 3: /* Equal ordered */
16739 u16Result = 0;
16740 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16741 {
16742 uint16_t u16Res = RT_BIT(idxSrc2);
16743 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
16744 {
16745 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
16746 idxSrc1 < idxLen1,
16747 k < idxLen2,
16748 bAggOp))
16749 {
16750 u16Res = 0;
16751 break;
16752 }
16753 }
16754
16755 u16Result |= u16Res;
16756 }
16757 break;
16758 }
16759
16760 /* Polarity selection. */
16761 switch ((bImm >> 4) & 0x3)
16762 {
16763 case 0:
16764 case 2:
16765 /* Nothing to do. */
16766 break;
16767 case 1:
16768 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
16769 break;
16770 case 3:
16771 u16Result ^= RT_BIT(idxLen2) - 1;
16772 break;
16773 default:
16774 AssertReleaseFailed();
16775 }
16776
16777 return u16Result;
16778}
16779
16780DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
16781{
16782 uint32_t fEFlags = 0;
16783
16784 if (u16Result)
16785 fEFlags |= X86_EFL_CF;
16786 if (cLen2 < cElems)
16787 fEFlags |= X86_EFL_ZF;
16788 if (cLen1 < cElems)
16789 fEFlags |= X86_EFL_SF;
16790 if (u16Result & 0x1)
16791 fEFlags |= X86_EFL_OF;
16792 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
16793}
16794
16795DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
16796 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
16797{
16798 bool afCmpRes[16][16];
16799 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16800
16801 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
16802 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
16803 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
16804
16805 return u16Result;
16806}
16807
16808DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
16809{
16810 if (bImm & RT_BIT(6))
16811 {
16812 /* Index for MSB set. */
16813 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
16814 if (idxMsb)
16815 *pu32Ecx = idxMsb - 1;
16816 else
16817 *pu32Ecx = cElems;
16818 }
16819 else
16820 {
16821 /* Index for LSB set. */
16822 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
16823 if (idxLsb)
16824 *pu32Ecx = idxLsb - 1;
16825 else
16826 *pu32Ecx = cElems;
16827 }
16828}
16829
16830IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
16831{
16832 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16833 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
16834 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
16835
16836 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16837 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
16838}
16839
16840
16841/**
16842 * [V]PCMPESTRI
16843 */
16844IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
16845{
16846 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16847 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
16848 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
16849
16850 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16851 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
16852}
16853
16854
16855/**
16856 * [V]PCMPISTRM
16857 */
16858DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
16859{
16860 if (bImm & RT_BIT(6))
16861 {
16862 /* Generate a mask. */
16863 if (cElems == 8)
16864 {
16865 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16866 if (u16Result & RT_BIT(i))
16867 puDst->au16[i] = 0xffff;
16868 else
16869 puDst->au16[i] = 0;
16870 }
16871 else
16872 {
16873 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16874 if (u16Result & RT_BIT(i))
16875 puDst->au8[i] = 0xff;
16876 else
16877 puDst->au8[i] = 0;
16878 }
16879 }
16880 else
16881 {
16882 /* Store the result. */
16883 puDst->au64[0] = u16Result;
16884 puDst->au64[1] = 0;
16885 }
16886}
16887
16888IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
16889{
16890 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16891 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
16892 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
16893
16894 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16895 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
16896}
16897
16898
16899/**
16900 * [V]PCMPESTRM
16901 */
16902IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
16903{
16904 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16905 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
16906 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
16907
16908 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16909 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
16910}
16911
16912
16913/*
16914 * [V]PCLMULQDQ
16915 */
16916IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16917{
16918 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
16919}
16920
16921
16922IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16923{
16924 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
16925 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
16926
16927 puDst->au64[0] = 0;
16928 puDst->au64[1] = 0;
16929
16930 /*
16931 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
16932 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
16933 * and squeeze out some optimizations.
16934 */
16935 if (uSrc1 & 0x1)
16936 puDst->au64[0] = uSrc2;
16937
16938 uSrc1 >>= 1;
16939
16940 uint8_t iDigit = 1;
16941 while (uSrc1)
16942 {
16943 if (uSrc1 & 0x1)
16944 {
16945 puDst->au64[0] ^= (uSrc2 << iDigit);
16946 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
16947 }
16948
16949 uSrc1 >>= 1;
16950 iDigit++;
16951 }
16952}
16953
16954
16955/**
16956 * [V]PINSRW
16957 */
16958#ifdef IEM_WITHOUT_ASSEMBLY
16959IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
16960{
16961 uint8_t cShift = (bEvil & 0x3) * 16;
16962 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
16963}
16964
16965
16966IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
16967{
16968 puDst->au16[bEvil & 0x7] = u16Src;
16969}
16970#endif
16971
16972
16973IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
16974{
16975 *puDst = *puSrc;
16976 puDst->au16[bEvil & 0x7] = u16Src;
16977}
16978
16979
16980/**
16981 * [V]PEXTRW
16982 */
16983#ifdef IEM_WITHOUT_ASSEMBLY
16984IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
16985{
16986 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
16987}
16988
16989
16990IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16991{
16992 *pu16Dst = puSrc->au16[bEvil & 0x7];
16993}
16994
16995#endif
16996
16997IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16998{
16999 *pu16Dst = puSrc->au16[bEvil & 0x7];
17000}
17001
17002
17003/**
17004 * [V]MOVMSKPS
17005 */
17006#ifdef IEM_WITHOUT_ASSEMBLY
17007IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17008{
17009 *pu8Dst = puSrc->au32[0] >> 31;
17010 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17011 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17012 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17013}
17014
17015#endif
17016
17017IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17018{
17019 *pu8Dst = puSrc->au32[0] >> 31;
17020 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17021 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17022 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17023}
17024
17025
17026IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17027{
17028 *pu8Dst = puSrc->au32[0] >> 31;
17029 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17030 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17031 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17032 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
17033 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
17034 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
17035 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
17036}
17037
17038
17039/**
17040 * [V]MOVMSKPD
17041 */
17042#ifdef IEM_WITHOUT_ASSEMBLY
17043IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17044{
17045 *pu8Dst = puSrc->au64[0] >> 63;
17046 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17047}
17048
17049#endif
17050
17051IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17052{
17053 *pu8Dst = puSrc->au64[0] >> 63;
17054 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17055}
17056
17057
17058IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17059{
17060 *pu8Dst = puSrc->au64[0] >> 63;
17061 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17062 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
17063 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
17064}
17065
17066
17067/**
17068 * CVTTSD2SI
17069 */
17070#ifdef IEM_WITHOUT_ASSEMBLY
17071IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17072{
17073 RTFLOAT64U r64Src;
17074
17075 r64Src.u = *pu64Src;
17076 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17077
17078 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17079 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17080 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17081}
17082
17083
17084IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17085{
17086 RTFLOAT64U r64Src;
17087
17088 r64Src.u = *pu64Src;
17089 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17090
17091 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17092 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17093 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17094}
17095#endif
17096
17097
17098/**
17099 * CVTSD2SI
17100 */
17101#ifdef IEM_WITHOUT_ASSEMBLY
17102IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17103{
17104 RTFLOAT64U r64Src;
17105
17106 r64Src.u = *pu64Src;
17107 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17108
17109 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17110 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17111 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17112}
17113
17114
17115IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17116{
17117 RTFLOAT64U r64Src;
17118
17119 r64Src.u = *pu64Src;
17120 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17121
17122 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17123 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17124 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17125}
17126#endif
17127
17128
17129/**
17130 * CVTTSS2SI
17131 */
17132#ifdef IEM_WITHOUT_ASSEMBLY
17133IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17134{
17135 RTFLOAT32U r32Src;
17136
17137 r32Src.u = *pu32Src;
17138 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17139
17140 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17141 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17142 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17143}
17144
17145
17146IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17147{
17148 RTFLOAT32U r32Src;
17149
17150 r32Src.u = *pu32Src;
17151 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17152
17153 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17154 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17155 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17156}
17157#endif
17158
17159
17160/**
17161 * CVTSS2SI
17162 */
17163#ifdef IEM_WITHOUT_ASSEMBLY
17164IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17165{
17166 RTFLOAT32U r32Src;
17167
17168 r32Src.u = *pu32Src;
17169 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17170
17171 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17172 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17173 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17174}
17175
17176
17177IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17178{
17179 RTFLOAT32U r32Src;
17180
17181 r32Src.u = *pu32Src;
17182 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17183
17184 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17185 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17186 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17187}
17188#endif
17189
17190
17191/**
17192 * CVTSI2SD
17193 */
17194#ifdef IEM_WITHOUT_ASSEMBLY
17195IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
17196{
17197 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17198 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
17199 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17200}
17201
17202
17203IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
17204{
17205 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17206 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
17207 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17208}
17209#endif
17210
17211
17212/**
17213 * CVTSI2SS
17214 */
17215#ifdef IEM_WITHOUT_ASSEMBLY
17216IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
17217{
17218 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17219 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
17220 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17221}
17222
17223
17224IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
17225{
17226 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17227 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
17228 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17229}
17230#endif
17231
17232
17233/**
17234 * [V]UCOMISS
17235 */
17236#ifdef IEM_WITHOUT_ASSEMBLY
17237IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17238{
17239 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17240
17241 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
17242 {
17243 *pfMxcsr |= X86_MXCSR_IE;
17244 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17245 }
17246 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17247 {
17248 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17249 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17250 }
17251 else
17252 {
17253 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17254
17255 RTFLOAT32U r32Src1, r32Src2;
17256 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17257 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17258
17259 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17260 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17261 if (f32_eq(f32Src1, f32Src2, &SoftState))
17262 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17263 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17264 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17265 /* else: GREATER_THAN 000 */
17266
17267 *pfMxcsr |= fDe;
17268 }
17269
17270 *pfEFlags = fEFlagsNew;
17271}
17272#endif
17273
17274IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17275{
17276 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17277}
17278
17279
17280/**
17281 * [V]UCOMISD
17282 */
17283#ifdef IEM_WITHOUT_ASSEMBLY
17284IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17285{
17286 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17287
17288 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
17289 {
17290 *pfMxcsr |= X86_MXCSR_IE;
17291 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17292 }
17293 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17294 {
17295 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17296 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17297 }
17298 else
17299 {
17300 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17301
17302 RTFLOAT64U r64Src1, r64Src2;
17303 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17304 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17305
17306 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17307 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17308 if (f64_eq(f64Src1, f64Src2, &SoftState))
17309 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17310 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17311 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17312 /* else: GREATER_THAN 000 */
17313
17314 *pfMxcsr |= fDe;
17315 }
17316
17317 *pfEFlags = fEFlagsNew;
17318}
17319#endif
17320
17321IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17322{
17323 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17324}
17325
17326
17327/**
17328 * [V]COMISS
17329 */
17330#ifdef IEM_WITHOUT_ASSEMBLY
17331IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17332{
17333 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17334
17335 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
17336 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17337 {
17338 *pfMxcsr |= X86_MXCSR_IE;
17339 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17340 }
17341 else
17342 {
17343 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17344
17345 RTFLOAT32U r32Src1, r32Src2;
17346 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17347 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17348
17349 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17350 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17351 if (f32_eq(f32Src1, f32Src2, &SoftState))
17352 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17353 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17354 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17355 /* else: GREATER_THAN 000 */
17356
17357 *pfMxcsr |= fDe;
17358 }
17359
17360 *pfEFlags = fEFlagsNew;
17361}
17362#endif
17363
17364
17365IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17366{
17367 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17368}
17369
17370
17371/**
17372 * [V]COMISD
17373 */
17374#ifdef IEM_WITHOUT_ASSEMBLY
17375IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17376{
17377 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17378
17379 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
17380 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17381 {
17382 *pfMxcsr |= X86_MXCSR_IE;
17383 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17384 }
17385 else
17386 {
17387 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17388
17389 RTFLOAT64U r64Src1, r64Src2;
17390 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17391 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17392
17393 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17394 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17395 if (f64_eq(f64Src1, f64Src2, &SoftState))
17396 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17397 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17398 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17399 /* else: GREATER_THAN 000 */
17400
17401 *pfMxcsr |= fDe;
17402 }
17403
17404 *pfEFlags = fEFlagsNew;
17405}
17406#endif
17407
17408IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17409{
17410 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17411}
17412
17413
17414/**
17415 * CMPPS / CMPPD / CMPSS / CMPSD
17416 */
17417#ifdef IEM_WITHOUT_ASSEMBLY
17418/**
17419 * A compare truth table entry.
17420 */
17421typedef struct CMPTRUTHTBLENTRY
17422{
17423 /** Flag whether the \#IA is signalled when one of the source oeprans is a QNaN */
17424 bool fSignalsOnQNan;
17425 /** The boolean result when the input operands are unordered. */
17426 bool fUnordered;
17427 /** The boolean result when A = B. */
17428 bool fEqual;
17429 /** The boolean result when A < B. */
17430 bool fLowerThan;
17431 /** The boolean result when A > B. */
17432 bool fGreaterThan;
17433} CMPTRUTHTBLENTRY;
17434/** Pointer to a const truth table entry. */
17435typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
17436
17437
17438/** The compare truth table (indexed by immediate). */
17439static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
17440{
17441 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
17442 /* 00H (EQ_OQ) */ { false, false, true, false, false },
17443 /* 01H (LT_OS) */ { true, false, false, true, false },
17444 /* 02H (LE_OS) */ { true, false, true, true, false },
17445 /* 03H (UNORD_Q) */ { false, true, false, false, false },
17446 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
17447 /* 05H (NLT_US) */ { true, true, true, false, true },
17448 /* 06H (NLE_US) */ { true, true, false, false, true },
17449 /* 07H (ORQ_Q) */ { false, false, true, true, true },
17450 /** @todo AVX variants. */
17451};
17452
17453
17454static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
17455{
17456 bool fRes;
17457 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17458
17459 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
17460 {
17461 *pfMxcsr |= X86_MXCSR_IE;
17462 fRes = g_aCmpTbl[bEvil].fUnordered;
17463 }
17464 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
17465 {
17466 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17467 *pfMxcsr |= X86_MXCSR_IE;
17468 fRes = g_aCmpTbl[bEvil].fUnordered;
17469 }
17470 else
17471 {
17472 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17473
17474 RTFLOAT32U r32Src1, r32Src2;
17475 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
17476 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
17477
17478 *pfMxcsr |= fDe;
17479 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17480 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17481 if (f32_eq(f32Src1, f32Src2, &SoftState))
17482 fRes = g_aCmpTbl[bEvil].fEqual;
17483 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17484 fRes = g_aCmpTbl[bEvil].fLowerThan;
17485 else
17486 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17487 }
17488
17489 return fRes;
17490}
17491
17492
17493static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
17494{
17495 bool fRes;
17496 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17497
17498 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
17499 {
17500 *pfMxcsr |= X86_MXCSR_IE;
17501 fRes = g_aCmpTbl[bEvil].fUnordered;
17502 }
17503 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
17504 {
17505 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17506 *pfMxcsr |= X86_MXCSR_IE;
17507 fRes = g_aCmpTbl[bEvil].fUnordered;
17508 }
17509 else
17510 {
17511 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17512
17513 RTFLOAT64U r64Src1, r64Src2;
17514 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
17515 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17516
17517 *pfMxcsr |= fDe;
17518 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17519 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17520 if (f64_eq(f64Src1, f64Src2, &SoftState))
17521 fRes = g_aCmpTbl[bEvil].fEqual;
17522 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17523 fRes = g_aCmpTbl[bEvil].fLowerThan;
17524 else
17525 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17526 }
17527
17528 return fRes;
17529}
17530
17531
17532IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17533{
17534 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17535 {
17536 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
17537 puDst->au32[i] = UINT32_MAX;
17538 else
17539 puDst->au32[i] = 0;
17540 }
17541}
17542
17543
17544IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17545{
17546 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17547 {
17548 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
17549 puDst->au64[i] = UINT64_MAX;
17550 else
17551 puDst->au64[i] = 0;
17552 }
17553}
17554
17555
17556IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17557{
17558 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17559 puDst->au32[0] = UINT32_MAX;
17560 else
17561 puDst->au32[0] = 0;
17562
17563 puDst->au32[1] = pSrc->uSrc1.au32[1];
17564 puDst->au64[1] = pSrc->uSrc1.au64[1];
17565}
17566
17567
17568IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17569{
17570 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17571 puDst->au64[0] = UINT64_MAX;
17572 else
17573 puDst->au64[0] = 0;
17574
17575 puDst->au64[1] = pSrc->uSrc1.au64[1];
17576}
17577#endif
17578
17579
17580/**
17581 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
17582 */
17583
/** ROUNDxx immediate: the rounding control field (bits 1:0). */
#define X86_SSE_ROUNDXX_IMM_RC_MASK     UINT8_C(0x03)
/** ROUNDxx immediate: when set the rounding mode is taken from MXCSR rather
 *  than from the immediate's rounding control field. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL   UINT8_C(0x04)
/** ROUNDxx immediate: the precision bit; when set the round workers request a
 *  non-exact softfloat rounding. */
#define X86_SSE_ROUNDXX_IMM_PRECISION   UINT8_C(0x08)

/** ROUNDxx immediate: all the bits the implementation looks at. */
#define X86_SSE_ROUNDXX_IMM_MASK        UINT8_C(0x0F)
17589
17590DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
17591{
17592 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
17593 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17594
17595 fMxcsr &= ~X86_MXCSR_RC_MASK;
17596 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
17597 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17598}
17599
17600static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
17601{
17602 RTFLOAT32U r32Src, r32Dst;
17603 float32_t f32Src;
17604 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17605 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17606
17607 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
17608 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
17609
17610 iemFpSoftF32ToIprt(&r32Dst, f32Src);
17611 return r32Dst;
17612}
17613
17614static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
17615{
17616 RTFLOAT64U r64Src, r64Dst;
17617 float64_t f64Src;
17618 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17619 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17620
17621 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
17622 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
17623
17624 iemFpSoftF64ToIprt(&r64Dst, f64Src);
17625 return r64Dst;
17626}
17627
17628#ifdef IEM_WITHOUT_ASSEMBLY
17629IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17630{
17631 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17632 puDst->au32[1] = pSrc->uSrc1.au32[1];
17633 puDst->au64[1] = pSrc->uSrc1.au64[1];
17634}
17635
17636
17637IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17638{
17639 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17640 puDst->au64[1] = pSrc->uSrc1.au64[1];
17641}
17642#endif
17643
17644IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17645{
17646 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17647 {
17648 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17649 }
17650}
17651
17652
17653IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17654{
17655 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17656 {
17657 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17658 }
17659}
17660
17661/**
17662 * CVTPD2PI
17663 */
17664#ifdef IEM_WITHOUT_ASSEMBLY
17665static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17666{
17667 RTFLOAT64U r64Src;
17668 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17669
17670 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17671 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17672 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17673}
17674
17675
17676IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17677{
17678 RTUINT64U u64Res;
17679 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17680 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17681
17682 *pu64Dst = u64Res.u;
17683 *pfMxcsr = fMxcsrOut;
17684}
17685#endif
17686
17687
17688/**
17689 * CVTTPD2PI
17690 */
17691#ifdef IEM_WITHOUT_ASSEMBLY
17692static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17693{
17694 RTFLOAT64U r64Src;
17695 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17696
17697 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17698 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17699 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17700}
17701
17702
17703IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17704{
17705 RTUINT64U u64Res;
17706 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17707 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17708
17709 *pu64Dst = u64Res.u;
17710 *pfMxcsr = fMxcsrOut;
17711}
17712#endif
17713
17714
17715/**
17716 * CVTPI2PS
17717 */
17718#ifdef IEM_WITHOUT_ASSEMBLY
17719static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17720{
17721 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17722 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
17723 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
17724}
17725
17726
17727IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17728{
17729 RTUINT64U uSrc = { u64Src };
17730 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
17731 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
17732 *pfMxcsr = fMxcsrOut;
17733}
17734#endif
17735
17736
17737/**
17738 * CVTPI2PD
17739 */
17740#ifdef IEM_WITHOUT_ASSEMBLY
17741static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
17742{
17743 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17744 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
17745 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
17746}
17747
17748
17749IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17750{
17751 RTUINT64U uSrc = { u64Src };
17752 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
17753 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
17754 *pfMxcsr = fMxcsrOut;
17755}
17756#endif
17757
17758
17759/**
17760 * CVTPS2PI
17761 */
17762#ifdef IEM_WITHOUT_ASSEMBLY
17763static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17764{
17765 RTFLOAT32U r32Src;
17766 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17767
17768 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17769 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17770 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17771}
17772
17773
17774IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17775{
17776 RTUINT64U uDst;
17777 RTUINT64U uSrc = { u64Src };
17778 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17779 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17780 *pu64Dst = uDst.u;
17781 *pfMxcsr = fMxcsrOut;
17782}
17783#endif
17784
17785
17786/**
17787 * CVTTPS2PI
17788 */
17789#ifdef IEM_WITHOUT_ASSEMBLY
17790static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17791{
17792 RTFLOAT32U r32Src;
17793 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17794
17795 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17796 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17797 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17798}
17799
17800
17801IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17802{
17803 RTUINT64U uDst;
17804 RTUINT64U uSrc = { u64Src };
17805 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17806 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17807 *pu64Dst = uDst.u;
17808 *pfMxcsr = fMxcsrOut;
17809}
17810#endif
17811
17812/**
17813 * RDRAND
17814 */
17815IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
17816{
17817 *puDst = 0;
17818 *pEFlags &= ~X86_EFL_STATUS_BITS;
17819 *pEFlags |= X86_EFL_CF;
17820}
17821
17822IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
17823{
17824 *puDst = 0;
17825 *pEFlags &= ~X86_EFL_STATUS_BITS;
17826 *pEFlags |= X86_EFL_CF;
17827}
17828
17829IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
17830{
17831 *puDst = 0;
17832 *pEFlags &= ~X86_EFL_STATUS_BITS;
17833 *pEFlags |= X86_EFL_CF;
17834}
17835
17836/**
17837 * RDSEED
17838 */
17839IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
17840{
17841 *puDst = 0;
17842 *pEFlags &= ~X86_EFL_STATUS_BITS;
17843 *pEFlags |= X86_EFL_CF;
17844}
17845
17846IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
17847{
17848 *puDst = 0;
17849 *pEFlags &= ~X86_EFL_STATUS_BITS;
17850 *pEFlags |= X86_EFL_CF;
17851}
17852
17853IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
17854{
17855 *puDst = 0;
17856 *pEFlags &= ~X86_EFL_STATUS_BITS;
17857 *pEFlags |= X86_EFL_CF;
17858}
17859
17860
17861/**
17862 * SHA1NEXTE
17863 */
17864IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17865{
17866 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
17867
17868 puDst->au32[0] = puSrc->au32[0];
17869 puDst->au32[1] = puSrc->au32[1];
17870 puDst->au32[2] = puSrc->au32[2];
17871 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
17872}
17873
17874/**
17875 * SHA1MSG1
17876 */
17877IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17878{
17879 uint32_t u32W0 = puDst->au32[3];
17880 uint32_t u32W1 = puDst->au32[2];
17881 uint32_t u32W2 = puDst->au32[1];
17882 uint32_t u32W3 = puDst->au32[0];
17883 uint32_t u32W4 = puSrc->au32[3];
17884 uint32_t u32W5 = puSrc->au32[2];
17885
17886 puDst->au32[3] = u32W2 ^ u32W0;
17887 puDst->au32[2] = u32W3 ^ u32W1;
17888 puDst->au32[1] = u32W4 ^ u32W2;
17889 puDst->au32[0] = u32W5 ^ u32W3;
17890}
17891
17892/**
17893 * SHA1MSG2
17894 */
17895IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17896{
17897 uint32_t u32W13 = puSrc->au32[2];
17898 uint32_t u32W14 = puSrc->au32[1];
17899 uint32_t u32W15 = puSrc->au32[0];
17900 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
17901 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
17902 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
17903 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
17904
17905 puDst->au32[3] = u32W16;
17906 puDst->au32[2] = u32W17;
17907 puDst->au32[1] = u32W18;
17908 puDst->au32[0] = u32W19;
17909}
17910
17911/**
17912 * SHA1RNDS4
17913 */
17914typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
17915typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
17916
17917static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
17918{
17919 return (u32B & u32C) ^ (~u32B & u32D);
17920}
17921
17922static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
17923{
17924 return u32B ^ u32C ^ u32D;
17925}
17926
17927static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
17928{
17929 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
17930}
17931
17932static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
17933{
17934 return u32B ^ u32C ^ u32D;
17935}
17936
17937IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17938{
17939 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
17940 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
17941
17942 uint32_t au32A[5];
17943 uint32_t au32B[5];
17944 uint32_t au32C[5];
17945 uint32_t au32D[5];
17946 uint32_t au32E[5];
17947 uint32_t au32W[4];
17948 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
17949 uint32_t u32K = s_au32K[bEvil & 0x3];
17950
17951 au32A[0] = puDst->au32[3];
17952 au32B[0] = puDst->au32[2];
17953 au32C[0] = puDst->au32[1];
17954 au32D[0] = puDst->au32[0];
17955 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
17956 au32W[i] = puSrc->au32[3 - i];
17957
17958 /* Round 0 is a bit different than the other rounds. */
17959 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
17960 au32B[1] = au32A[0];
17961 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
17962 au32D[1] = au32C[0];
17963 au32E[1] = au32D[0];
17964
17965 for (uint32_t i = 1; i <= 3; i++)
17966 {
17967 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
17968 au32B[i + 1] = au32A[i];
17969 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
17970 au32D[i + 1] = au32C[i];
17971 au32E[i + 1] = au32D[i];
17972 }
17973
17974 puDst->au32[3] = au32A[4];
17975 puDst->au32[2] = au32B[4];
17976 puDst->au32[1] = au32C[4];
17977 puDst->au32[0] = au32D[4];
17978}
17979
17980
17981/**
17982 * SHA256MSG1
17983 */
17984DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
17985{
17986 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
17987}
17988
17989IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17990{
17991 uint32_t u32W4 = puSrc->au32[0];
17992 uint32_t u32W3 = puDst->au32[3];
17993 uint32_t u32W2 = puDst->au32[2];
17994 uint32_t u32W1 = puDst->au32[1];
17995 uint32_t u32W0 = puDst->au32[0];
17996
17997 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
17998 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
17999 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
18000 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
18001}
18002
18003/**
18004 * SHA256MSG2
18005 */
18006DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
18007{
18008 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
18009}
18010
18011IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18012{
18013 uint32_t u32W14 = puSrc->au32[2];
18014 uint32_t u32W15 = puSrc->au32[3];
18015 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
18016 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
18017 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
18018 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
18019
18020 puDst->au32[3] = u32W19;
18021 puDst->au32[2] = u32W18;
18022 puDst->au32[1] = u32W17;
18023 puDst->au32[0] = u32W16;
18024}
18025
18026/**
18027 * SHA256RNDS2
18028 */
18029DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18030{
18031 return (u32X & u32Y) ^ (~u32X & u32Z);
18032}
18033
18034DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18035{
18036 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
18037}
18038
18039DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
18040{
18041 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
18042}
18043
18044DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
18045{
18046 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
18047}
18048
18049IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
18050{
18051 uint32_t au32A[3];
18052 uint32_t au32B[3];
18053 uint32_t au32C[3];
18054 uint32_t au32D[3];
18055 uint32_t au32E[3];
18056 uint32_t au32F[3];
18057 uint32_t au32G[3];
18058 uint32_t au32H[3];
18059 uint32_t au32WK[2];
18060
18061 au32A[0] = puSrc->au32[3];
18062 au32B[0] = puSrc->au32[2];
18063 au32C[0] = puDst->au32[3];
18064 au32D[0] = puDst->au32[2];
18065 au32E[0] = puSrc->au32[1];
18066 au32F[0] = puSrc->au32[0];
18067 au32G[0] = puDst->au32[1];
18068 au32H[0] = puDst->au32[0];
18069
18070 au32WK[0] = puXmm0Constants->au32[0];
18071 au32WK[1] = puXmm0Constants->au32[1];
18072
18073 for (uint32_t i = 0; i < 2; i++)
18074 {
18075 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18076 + iemAImpl_sha256_upper_sigma1(au32E[i])
18077 + au32WK[i]
18078 + au32H[i]
18079 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
18080 + iemAImpl_sha256_upper_sigma0(au32A[i]);
18081 au32B[i + 1] = au32A[i];
18082 au32C[i + 1] = au32B[i];
18083 au32D[i + 1] = au32C[i];
18084 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18085 + iemAImpl_sha256_upper_sigma1(au32E[i])
18086 + au32WK[i]
18087 + au32H[i]
18088 + au32D[i];
18089 au32F[i + 1] = au32E[i];
18090 au32G[i + 1] = au32F[i];
18091 au32H[i + 1] = au32G[i];
18092 }
18093
18094 puDst->au32[3] = au32A[2];
18095 puDst->au32[2] = au32B[2];
18096 puDst->au32[1] = au32E[2];
18097 puDst->au32[0] = au32F[2];
18098}
18099
18100
18101/**
18102 * ADCX
18103 */
/**
 * Emits the body of an ADCX/ADOX fallback: adds @c uSrc and the incoming
 * carry (the @a a_Flag bit of @c *pfEFlags) to @c *puDst and stores the
 * carry-out back in that same flag bit.  No other flag is modified.
 *
 * Expects @c puDst, @c pfEFlags and @c uSrc to be in scope at the point of use.
 *
 * @param   a_Flag  The EFLAGS bit serving as carry-in and carry-out.
 * @param   a_Type  The unsigned operand type.
 * @param   a_Max   The maximum value of @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn  = (*pfEFlags & (a_Flag)) != 0; \
        a_Type const uSum      = *puDst + uSrc; \
        /* Carry-out: either the plain add wrapped, or adding the carry-in will wrap. */ \
        bool const   fCarryOut = uSum < uSrc || (fCarryIn && uSum == (a_Max)); \
        if (fCarryOut) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = (a_Type)(uSum + fCarryIn); \
    } \
    while (0)
18121
18122IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18123{
18124 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
18125}
18126
18127IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18128{
18129 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
18130}
18131
18132
18133/**
18134 * ADOX
18135 */
18136IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18137{
18138 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
18139}
18140
18141IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18142{
18143 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
18144}
18145
18146
18147/**
18148 * MPSADBW
18149 */
18150IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18151{
18152 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18153 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18154 int16_t ai16Src1[11];
18155 int16_t ai16Src2[4];
18156
18157 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18158 ai16Src1[i] = puDst->au8[idxSrc1 + i];
18159
18160 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18161 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
18162
18163 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18164 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18165 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18166 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18167 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18168}
18169
18170
18171/**
18172 * DPPS
18173 */
18174IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18175{
18176 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18177 AssertReleaseFailed();
18178}
18179
18180
18181/**
18182 * DPPD
18183 */
18184IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18185{
18186 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18187 AssertReleaseFailed();
18188}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette