VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 104188

Last change on this file since 104188 was 104188, checked in by vboxsync, 8 months ago

VMM/IEM: Implement vpslldq, vpsrldq, instruction dispatch & emulation, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 724.5 KB
Line 
1/* $Id: IEMAllAImplC.cpp 104188 2024-04-05 13:16:50Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defines it, it is switched on for ARM32, ARM64 and
 * doxygen runs.  IEM_WITH_ASSEMBLY (for tstIEMAImplAsm purposes) trumps it and
 * removes the define again.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag (SF) value given a result and its bit width.
 *
 * SF duplicates the most significant bit of the result, so the result is
 * shifted right until bit (width - 1) sits at the SF bit position and is then
 * masked with X86_EFL_SF.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( ((uint32_t)((a_uResult) >> ((a_cBitsWidth) - 1 - X86_EFL_SF_BIT))) & X86_EFL_SF )
73
/**
 * Calculates the zero flag (ZF) value given a result.
 *
 * ZF is set when the result is zero and clear otherwise.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Calculates the parity flag (PF) from the low 8 bits of a result.
 *
 * The table lookup variant is currently always selected (note the "|| 1" in
 * the condition); the inline helper in the other branch is kept as an
 * alternative for ARM64.
 *
 * @returns X86_EFL_PF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#if !defined(RT_ARCH_ARM64) || 1 /** @todo profile this... micro benching in tstIEMAImpl indicates no gain, but it may be skewed. */
# define IEM_EFL_CALC_PARITY(a_uResult) (g_afParity[(a_uResult) & 0xff])
#else
# define IEM_EFL_CALC_PARITY(a_uResult) iemAImplCalcParity(a_uResult)
DECL_FORCE_INLINE(uint32_t) iemAImplCalcParity(uint32_t uResult)
{
    /* Fold the low byte onto itself with XOR (8 -> 4 -> 2 -> 1 bits) so that
       bit 0 ends up holding the XOR of all eight bits.  The original note says
       this translates to 4 EOR instructions on ARM64 as they can shift the 2nd
       source operand. */
    uint8_t bFolded = (uint8_t)(uResult ^ (uResult >> 4));
    bFolded = (uint8_t)(bFolded ^ (bFolded >> 2));
    bFolded = (uint8_t)(bFolded ^ (bFolded >> 1));
    /* PF is set for an even number of one bits, hence the inversion of bit 0. */
    return (uint32_t)(~bFolded & 1) << X86_EFL_PF_BIT;
}
#endif
106
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * There is one variant per operand width so that the macro name can be formed
 * by concatenating the bit count (X86_EFL_GET_OF_ ## cBits).  Each one moves
 * bit (width - 1) of the value to the OF bit position and masks it with
 * X86_EFL_OF.  The 8-bit variant has to shift left since its top bit sits
 * below the OF bit, while all the others shift right.
 *
 * Every expansion is fully parenthesized so the macros can be used safely
 * inside larger expressions (comparisons, further masking, ...).
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        The OF calculation result with the overflow
 *                          indicator in its most significant bit.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF )
#define X86_EFL_GET_OF_16(a_uValue) ( (uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF )
#define X86_EFL_GET_OF_32(a_uValue) ( (uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF )
#define X86_EFL_GET_OF_64(a_uValue) ( (uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF )
117
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro: it produces no value, the recalculated flags are
 * written back through @a a_pfEFlags.  Bits outside X86_EFL_STATUS_BITS are
 * left as they were.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF+OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal since it is pasted into type and
 *                          macro names.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        \
        /* CF, PF, AF (carry between bit 3 and 4), ZF and SF. */ \
        fEflTmp |=   ((a_CfExpr) << X86_EFL_CF_BIT) \
                   | (IEM_EFL_CALC_PARITY(a_uResult)) \
                   | (((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF) \
                   | (X86_EFL_CALC_ZF(a_uResult)) \
                   | (X86_EFL_CALC_SF(a_uResult, a_cBitsWidth)); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit differ from the first input. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(   ((a_uResult) ^ (a_uDst)) \
                                                    & RT_BIT_64(a_cBitsWidth - 1) \
                                                    & (uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
154
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined; it is cleared here as that seems to make the most sense and
 * also seems to be the correct behavior on current CPUs.
 *
 * This is a statement macro: it produces no value, the recalculated flags are
 * written back through @a a_pfEFlags.  Bits outside X86_EFL_STATUS_BITS are
 * left as they were (apart from whatever @a a_fExtra adds).
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; /* this is what zeroes CF, AF and OF */ \
        fEflTmp |=   (IEM_EFL_CALC_PARITY(a_uResult)) \
                   | (X86_EFL_CALC_ZF(a_uResult)) \
                   | (X86_EFL_CALC_SF(a_uResult, a_cBitsWidth)) \
                   | (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
178
179
180/*********************************************************************************************************************************
181* Global Variables *
182*********************************************************************************************************************************/
183/**
184 * Parity calculation table.
185 *
186 * This is also used by iemAllAImpl.asm.
187 *
188 * The generator code:
189 * @code
190 * #include <stdio.h>
191 *
192 * int main()
193 * {
194 * unsigned b;
195 * for (b = 0; b < 256; b++)
196 * {
197 * int cOnes = ( b & 1)
198 * + ((b >> 1) & 1)
199 * + ((b >> 2) & 1)
200 * + ((b >> 3) & 1)
201 * + ((b >> 4) & 1)
202 * + ((b >> 5) & 1)
203 * + ((b >> 6) & 1)
204 * + ((b >> 7) & 1);
205 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
206 * b,
207 * (b >> 7) & 1,
208 * (b >> 6) & 1,
209 * (b >> 5) & 1,
210 * (b >> 4) & 1,
211 * (b >> 3) & 1,
212 * (b >> 2) & 1,
213 * (b >> 1) & 1,
214 * b & 1,
215 * cOnes & 1 ? "0" : "X86_EFL_PF");
216 * }
217 * return 0;
218 * }
219 * @endcode
220 */
221uint8_t const g_afParity[256] =
222{
223 /* 0000 = 00000000b */ X86_EFL_PF,
224 /* 0x01 = 00000001b */ 0,
225 /* 0x02 = 00000010b */ 0,
226 /* 0x03 = 00000011b */ X86_EFL_PF,
227 /* 0x04 = 00000100b */ 0,
228 /* 0x05 = 00000101b */ X86_EFL_PF,
229 /* 0x06 = 00000110b */ X86_EFL_PF,
230 /* 0x07 = 00000111b */ 0,
231 /* 0x08 = 00001000b */ 0,
232 /* 0x09 = 00001001b */ X86_EFL_PF,
233 /* 0x0a = 00001010b */ X86_EFL_PF,
234 /* 0x0b = 00001011b */ 0,
235 /* 0x0c = 00001100b */ X86_EFL_PF,
236 /* 0x0d = 00001101b */ 0,
237 /* 0x0e = 00001110b */ 0,
238 /* 0x0f = 00001111b */ X86_EFL_PF,
239 /* 0x10 = 00010000b */ 0,
240 /* 0x11 = 00010001b */ X86_EFL_PF,
241 /* 0x12 = 00010010b */ X86_EFL_PF,
242 /* 0x13 = 00010011b */ 0,
243 /* 0x14 = 00010100b */ X86_EFL_PF,
244 /* 0x15 = 00010101b */ 0,
245 /* 0x16 = 00010110b */ 0,
246 /* 0x17 = 00010111b */ X86_EFL_PF,
247 /* 0x18 = 00011000b */ X86_EFL_PF,
248 /* 0x19 = 00011001b */ 0,
249 /* 0x1a = 00011010b */ 0,
250 /* 0x1b = 00011011b */ X86_EFL_PF,
251 /* 0x1c = 00011100b */ 0,
252 /* 0x1d = 00011101b */ X86_EFL_PF,
253 /* 0x1e = 00011110b */ X86_EFL_PF,
254 /* 0x1f = 00011111b */ 0,
255 /* 0x20 = 00100000b */ 0,
256 /* 0x21 = 00100001b */ X86_EFL_PF,
257 /* 0x22 = 00100010b */ X86_EFL_PF,
258 /* 0x23 = 00100011b */ 0,
259 /* 0x24 = 00100100b */ X86_EFL_PF,
260 /* 0x25 = 00100101b */ 0,
261 /* 0x26 = 00100110b */ 0,
262 /* 0x27 = 00100111b */ X86_EFL_PF,
263 /* 0x28 = 00101000b */ X86_EFL_PF,
264 /* 0x29 = 00101001b */ 0,
265 /* 0x2a = 00101010b */ 0,
266 /* 0x2b = 00101011b */ X86_EFL_PF,
267 /* 0x2c = 00101100b */ 0,
268 /* 0x2d = 00101101b */ X86_EFL_PF,
269 /* 0x2e = 00101110b */ X86_EFL_PF,
270 /* 0x2f = 00101111b */ 0,
271 /* 0x30 = 00110000b */ X86_EFL_PF,
272 /* 0x31 = 00110001b */ 0,
273 /* 0x32 = 00110010b */ 0,
274 /* 0x33 = 00110011b */ X86_EFL_PF,
275 /* 0x34 = 00110100b */ 0,
276 /* 0x35 = 00110101b */ X86_EFL_PF,
277 /* 0x36 = 00110110b */ X86_EFL_PF,
278 /* 0x37 = 00110111b */ 0,
279 /* 0x38 = 00111000b */ 0,
280 /* 0x39 = 00111001b */ X86_EFL_PF,
281 /* 0x3a = 00111010b */ X86_EFL_PF,
282 /* 0x3b = 00111011b */ 0,
283 /* 0x3c = 00111100b */ X86_EFL_PF,
284 /* 0x3d = 00111101b */ 0,
285 /* 0x3e = 00111110b */ 0,
286 /* 0x3f = 00111111b */ X86_EFL_PF,
287 /* 0x40 = 01000000b */ 0,
288 /* 0x41 = 01000001b */ X86_EFL_PF,
289 /* 0x42 = 01000010b */ X86_EFL_PF,
290 /* 0x43 = 01000011b */ 0,
291 /* 0x44 = 01000100b */ X86_EFL_PF,
292 /* 0x45 = 01000101b */ 0,
293 /* 0x46 = 01000110b */ 0,
294 /* 0x47 = 01000111b */ X86_EFL_PF,
295 /* 0x48 = 01001000b */ X86_EFL_PF,
296 /* 0x49 = 01001001b */ 0,
297 /* 0x4a = 01001010b */ 0,
298 /* 0x4b = 01001011b */ X86_EFL_PF,
299 /* 0x4c = 01001100b */ 0,
300 /* 0x4d = 01001101b */ X86_EFL_PF,
301 /* 0x4e = 01001110b */ X86_EFL_PF,
302 /* 0x4f = 01001111b */ 0,
303 /* 0x50 = 01010000b */ X86_EFL_PF,
304 /* 0x51 = 01010001b */ 0,
305 /* 0x52 = 01010010b */ 0,
306 /* 0x53 = 01010011b */ X86_EFL_PF,
307 /* 0x54 = 01010100b */ 0,
308 /* 0x55 = 01010101b */ X86_EFL_PF,
309 /* 0x56 = 01010110b */ X86_EFL_PF,
310 /* 0x57 = 01010111b */ 0,
311 /* 0x58 = 01011000b */ 0,
312 /* 0x59 = 01011001b */ X86_EFL_PF,
313 /* 0x5a = 01011010b */ X86_EFL_PF,
314 /* 0x5b = 01011011b */ 0,
315 /* 0x5c = 01011100b */ X86_EFL_PF,
316 /* 0x5d = 01011101b */ 0,
317 /* 0x5e = 01011110b */ 0,
318 /* 0x5f = 01011111b */ X86_EFL_PF,
319 /* 0x60 = 01100000b */ X86_EFL_PF,
320 /* 0x61 = 01100001b */ 0,
321 /* 0x62 = 01100010b */ 0,
322 /* 0x63 = 01100011b */ X86_EFL_PF,
323 /* 0x64 = 01100100b */ 0,
324 /* 0x65 = 01100101b */ X86_EFL_PF,
325 /* 0x66 = 01100110b */ X86_EFL_PF,
326 /* 0x67 = 01100111b */ 0,
327 /* 0x68 = 01101000b */ 0,
328 /* 0x69 = 01101001b */ X86_EFL_PF,
329 /* 0x6a = 01101010b */ X86_EFL_PF,
330 /* 0x6b = 01101011b */ 0,
331 /* 0x6c = 01101100b */ X86_EFL_PF,
332 /* 0x6d = 01101101b */ 0,
333 /* 0x6e = 01101110b */ 0,
334 /* 0x6f = 01101111b */ X86_EFL_PF,
335 /* 0x70 = 01110000b */ 0,
336 /* 0x71 = 01110001b */ X86_EFL_PF,
337 /* 0x72 = 01110010b */ X86_EFL_PF,
338 /* 0x73 = 01110011b */ 0,
339 /* 0x74 = 01110100b */ X86_EFL_PF,
340 /* 0x75 = 01110101b */ 0,
341 /* 0x76 = 01110110b */ 0,
342 /* 0x77 = 01110111b */ X86_EFL_PF,
343 /* 0x78 = 01111000b */ X86_EFL_PF,
344 /* 0x79 = 01111001b */ 0,
345 /* 0x7a = 01111010b */ 0,
346 /* 0x7b = 01111011b */ X86_EFL_PF,
347 /* 0x7c = 01111100b */ 0,
348 /* 0x7d = 01111101b */ X86_EFL_PF,
349 /* 0x7e = 01111110b */ X86_EFL_PF,
350 /* 0x7f = 01111111b */ 0,
351 /* 0x80 = 10000000b */ 0,
352 /* 0x81 = 10000001b */ X86_EFL_PF,
353 /* 0x82 = 10000010b */ X86_EFL_PF,
354 /* 0x83 = 10000011b */ 0,
355 /* 0x84 = 10000100b */ X86_EFL_PF,
356 /* 0x85 = 10000101b */ 0,
357 /* 0x86 = 10000110b */ 0,
358 /* 0x87 = 10000111b */ X86_EFL_PF,
359 /* 0x88 = 10001000b */ X86_EFL_PF,
360 /* 0x89 = 10001001b */ 0,
361 /* 0x8a = 10001010b */ 0,
362 /* 0x8b = 10001011b */ X86_EFL_PF,
363 /* 0x8c = 10001100b */ 0,
364 /* 0x8d = 10001101b */ X86_EFL_PF,
365 /* 0x8e = 10001110b */ X86_EFL_PF,
366 /* 0x8f = 10001111b */ 0,
367 /* 0x90 = 10010000b */ X86_EFL_PF,
368 /* 0x91 = 10010001b */ 0,
369 /* 0x92 = 10010010b */ 0,
370 /* 0x93 = 10010011b */ X86_EFL_PF,
371 /* 0x94 = 10010100b */ 0,
372 /* 0x95 = 10010101b */ X86_EFL_PF,
373 /* 0x96 = 10010110b */ X86_EFL_PF,
374 /* 0x97 = 10010111b */ 0,
375 /* 0x98 = 10011000b */ 0,
376 /* 0x99 = 10011001b */ X86_EFL_PF,
377 /* 0x9a = 10011010b */ X86_EFL_PF,
378 /* 0x9b = 10011011b */ 0,
379 /* 0x9c = 10011100b */ X86_EFL_PF,
380 /* 0x9d = 10011101b */ 0,
381 /* 0x9e = 10011110b */ 0,
382 /* 0x9f = 10011111b */ X86_EFL_PF,
383 /* 0xa0 = 10100000b */ X86_EFL_PF,
384 /* 0xa1 = 10100001b */ 0,
385 /* 0xa2 = 10100010b */ 0,
386 /* 0xa3 = 10100011b */ X86_EFL_PF,
387 /* 0xa4 = 10100100b */ 0,
388 /* 0xa5 = 10100101b */ X86_EFL_PF,
389 /* 0xa6 = 10100110b */ X86_EFL_PF,
390 /* 0xa7 = 10100111b */ 0,
391 /* 0xa8 = 10101000b */ 0,
392 /* 0xa9 = 10101001b */ X86_EFL_PF,
393 /* 0xaa = 10101010b */ X86_EFL_PF,
394 /* 0xab = 10101011b */ 0,
395 /* 0xac = 10101100b */ X86_EFL_PF,
396 /* 0xad = 10101101b */ 0,
397 /* 0xae = 10101110b */ 0,
398 /* 0xaf = 10101111b */ X86_EFL_PF,
399 /* 0xb0 = 10110000b */ 0,
400 /* 0xb1 = 10110001b */ X86_EFL_PF,
401 /* 0xb2 = 10110010b */ X86_EFL_PF,
402 /* 0xb3 = 10110011b */ 0,
403 /* 0xb4 = 10110100b */ X86_EFL_PF,
404 /* 0xb5 = 10110101b */ 0,
405 /* 0xb6 = 10110110b */ 0,
406 /* 0xb7 = 10110111b */ X86_EFL_PF,
407 /* 0xb8 = 10111000b */ X86_EFL_PF,
408 /* 0xb9 = 10111001b */ 0,
409 /* 0xba = 10111010b */ 0,
410 /* 0xbb = 10111011b */ X86_EFL_PF,
411 /* 0xbc = 10111100b */ 0,
412 /* 0xbd = 10111101b */ X86_EFL_PF,
413 /* 0xbe = 10111110b */ X86_EFL_PF,
414 /* 0xbf = 10111111b */ 0,
415 /* 0xc0 = 11000000b */ X86_EFL_PF,
416 /* 0xc1 = 11000001b */ 0,
417 /* 0xc2 = 11000010b */ 0,
418 /* 0xc3 = 11000011b */ X86_EFL_PF,
419 /* 0xc4 = 11000100b */ 0,
420 /* 0xc5 = 11000101b */ X86_EFL_PF,
421 /* 0xc6 = 11000110b */ X86_EFL_PF,
422 /* 0xc7 = 11000111b */ 0,
423 /* 0xc8 = 11001000b */ 0,
424 /* 0xc9 = 11001001b */ X86_EFL_PF,
425 /* 0xca = 11001010b */ X86_EFL_PF,
426 /* 0xcb = 11001011b */ 0,
427 /* 0xcc = 11001100b */ X86_EFL_PF,
428 /* 0xcd = 11001101b */ 0,
429 /* 0xce = 11001110b */ 0,
430 /* 0xcf = 11001111b */ X86_EFL_PF,
431 /* 0xd0 = 11010000b */ 0,
432 /* 0xd1 = 11010001b */ X86_EFL_PF,
433 /* 0xd2 = 11010010b */ X86_EFL_PF,
434 /* 0xd3 = 11010011b */ 0,
435 /* 0xd4 = 11010100b */ X86_EFL_PF,
436 /* 0xd5 = 11010101b */ 0,
437 /* 0xd6 = 11010110b */ 0,
438 /* 0xd7 = 11010111b */ X86_EFL_PF,
439 /* 0xd8 = 11011000b */ X86_EFL_PF,
440 /* 0xd9 = 11011001b */ 0,
441 /* 0xda = 11011010b */ 0,
442 /* 0xdb = 11011011b */ X86_EFL_PF,
443 /* 0xdc = 11011100b */ 0,
444 /* 0xdd = 11011101b */ X86_EFL_PF,
445 /* 0xde = 11011110b */ X86_EFL_PF,
446 /* 0xdf = 11011111b */ 0,
447 /* 0xe0 = 11100000b */ 0,
448 /* 0xe1 = 11100001b */ X86_EFL_PF,
449 /* 0xe2 = 11100010b */ X86_EFL_PF,
450 /* 0xe3 = 11100011b */ 0,
451 /* 0xe4 = 11100100b */ X86_EFL_PF,
452 /* 0xe5 = 11100101b */ 0,
453 /* 0xe6 = 11100110b */ 0,
454 /* 0xe7 = 11100111b */ X86_EFL_PF,
455 /* 0xe8 = 11101000b */ X86_EFL_PF,
456 /* 0xe9 = 11101001b */ 0,
457 /* 0xea = 11101010b */ 0,
458 /* 0xeb = 11101011b */ X86_EFL_PF,
459 /* 0xec = 11101100b */ 0,
460 /* 0xed = 11101101b */ X86_EFL_PF,
461 /* 0xee = 11101110b */ X86_EFL_PF,
462 /* 0xef = 11101111b */ 0,
463 /* 0xf0 = 11110000b */ X86_EFL_PF,
464 /* 0xf1 = 11110001b */ 0,
465 /* 0xf2 = 11110010b */ 0,
466 /* 0xf3 = 11110011b */ X86_EFL_PF,
467 /* 0xf4 = 11110100b */ 0,
468 /* 0xf5 = 11110101b */ X86_EFL_PF,
469 /* 0xf6 = 11110110b */ X86_EFL_PF,
470 /* 0xf7 = 11110111b */ 0,
471 /* 0xf8 = 11111000b */ 0,
472 /* 0xf9 = 11111001b */ X86_EFL_PF,
473 /* 0xfa = 11111010b */ X86_EFL_PF,
474 /* 0xfb = 11111011b */ 0,
475 /* 0xfc = 11111100b */ X86_EFL_PF,
476 /* 0xfd = 11111101b */ 0,
477 /* 0xfe = 11111110b */ 0,
478 /* 0xff = 11111111b */ X86_EFL_PF,
479};
480
481/* for clang: */
482extern const RTFLOAT32U g_ar32Zero[];
483extern const RTFLOAT64U g_ar64Zero[];
484extern const RTFLOAT80U g_ar80Zero[];
485extern const RTFLOAT32U g_ar32One[];
486extern const RTFLOAT80U g_ar80One[];
487extern const RTFLOAT80U g_r80Indefinite;
488extern const RTFLOAT32U g_ar32Infinity[];
489extern const RTFLOAT64U g_ar64Infinity[];
490extern const RTFLOAT80U g_ar80Infinity[];
491extern const RTFLOAT128U g_r128Ln2;
492extern const RTUINT128U g_u128Ln2Mantissa;
493extern const RTUINT128U g_u128Ln2MantissaIntel;
494extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
495extern const RTFLOAT32U g_ar32QNaN[];
496extern const RTFLOAT64U g_ar64QNaN[];
497
498/** Zero values (indexed by fSign). */
499RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
500RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
501RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
502
503/** One values (indexed by fSign). */
504RTFLOAT32U const g_ar32One[] =
505{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
506RTFLOAT80U const g_ar80One[] =
507{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
508
509/** Indefinite (negative). */
510RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
511
512/** Infinities (indexed by fSign). */
513RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
514RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
515RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
516
517/** Default QNaNs (indexed by fSign). */
518RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
519RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
520
521
522#if 0
523/** 128-bit floating point constant: 2.0 */
524const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
525#endif
526
527
528/* The next section is generated by tools/IEMGenFpuConstants: */
529
530/** The ln2 constant as 128-bit floating point value.
531 * base-10: 6.93147180559945309417232121458176575e-1
532 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
533 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
534//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
535const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
536/** High precision ln2 value.
537 * base-10: 6.931471805599453094172321214581765680747e-1
538 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
539 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
540const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
541/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
542 * base-10: 6.931471805599453094151379470289064954613e-1
543 * base-16: b.17217f7d1cf79abc0000000000000000@-1
544 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
545const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
546
547/** Horner constants for f2xm1 */
548const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
549{
550 /* a0
551 * base-10: 1.00000000000000000000000000000000000e0
552 * base-16: 1.0000000000000000000000000000@0
553 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
554 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
555 /* a1
556 * base-10: 5.00000000000000000000000000000000000e-1
557 * base-16: 8.0000000000000000000000000000@-1
558 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
559 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
560 /* a2
561 * base-10: 1.66666666666666666666666666666666658e-1
562 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
563 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
564 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
565 /* a3
566 * base-10: 4.16666666666666666666666666666666646e-2
567 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
568 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
569 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
570 /* a4
571 * base-10: 8.33333333333333333333333333333333323e-3
572 * base-16: 2.2222222222222222222222222222@-2
573 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
574 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
575 /* a5
576 * base-10: 1.38888888888888888888888888888888874e-3
577 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
578 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
579 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
580 /* a6
581 * base-10: 1.98412698412698412698412698412698412e-4
582 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
583 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
584 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
585 /* a7
586 * base-10: 2.48015873015873015873015873015873015e-5
587 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
588 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
589 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
590 /* a8
591 * base-10: 2.75573192239858906525573192239858902e-6
592 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
593 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
594 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
595 /* a9
596 * base-10: 2.75573192239858906525573192239858865e-7
597 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
598 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
599 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
600 /* a10
601 * base-10: 2.50521083854417187750521083854417184e-8
602 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
603 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
604 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
605 /* a11
606 * base-10: 2.08767569878680989792100903212014296e-9
607 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
608 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
609 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
610 /* a12
611 * base-10: 1.60590438368216145993923771701549472e-10
612 * base-16: b.092309d43684be51c198e91d7b40@-9
613 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
614 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
615 /* a13
616 * base-10: 1.14707455977297247138516979786821043e-11
617 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
618 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
619 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
620 /* a14
621 * base-10: 7.64716373181981647590113198578806964e-13
622 * base-16: d.73f9f399dc0f88ec32b587746578@-11
623 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
624 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
625 /* a15
626 * base-10: 4.77947733238738529743820749111754352e-14
627 * base-16: d.73f9f399dc0f88ec32b587746578@-12
628 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
629 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
630 /* a16
631 * base-10: 2.81145725434552076319894558301031970e-15
632 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
633 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
634 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
635 /* a17
636 * base-10: 1.56192069685862264622163643500573321e-16
637 * base-16: b.413c31dcbecbbdd8024435161550@-14
638 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
639 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
640 /* a18
641 * base-10: 8.22063524662432971695598123687227980e-18
642 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
643 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
644 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
645 /* a19
646 * base-10: 4.11031762331216485847799061843614006e-19
647 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
648 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
649 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
650 /* a20
651 * base-10: 1.95729410633912612308475743735054143e-20
652 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
653 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
654 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
655 /* a21
656 * base-10: 8.89679139245057328674889744250246106e-22
657 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
658 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
659 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
660};
661
662
663/*
664 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
665 * it all in C is probably safer atm., optimize what's necessary later, maybe.
666 */
667#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
668
669
670/*********************************************************************************************************************************
671* Binary Operations *
672*********************************************************************************************************************************/
673
674/*
675 * ADD
676 */
677
678IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
679{
680 uint64_t uDst = *puDst;
681 uint64_t uResult = uDst + uSrc;
682 *puDst = uResult;
683 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
684}
685
686# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
687
688IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
689{
690 uint32_t uDst = *puDst;
691 uint32_t uResult = uDst + uSrc;
692 *puDst = uResult;
693 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
694}
695
696
697IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
698{
699 uint16_t uDst = *puDst;
700 uint16_t uResult = uDst + uSrc;
701 *puDst = uResult;
702 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
703}
704
705
706IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
707{
708 uint8_t uDst = *puDst;
709 uint8_t uResult = uDst + uSrc;
710 *puDst = uResult;
711 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
712}
713
714# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
715
716/*
717 * ADC
718 */
719
720IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
721{
722 if (!(*pfEFlags & X86_EFL_CF))
723 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
724 else
725 {
726 uint64_t uDst = *puDst;
727 uint64_t uResult = uDst + uSrc + 1;
728 *puDst = uResult;
729 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
730 }
731}
732
733# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
734
735IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
736{
737 if (!(*pfEFlags & X86_EFL_CF))
738 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
739 else
740 {
741 uint32_t uDst = *puDst;
742 uint32_t uResult = uDst + uSrc + 1;
743 *puDst = uResult;
744 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
745 }
746}
747
748
749IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
750{
751 if (!(*pfEFlags & X86_EFL_CF))
752 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
753 else
754 {
755 uint16_t uDst = *puDst;
756 uint16_t uResult = uDst + uSrc + 1;
757 *puDst = uResult;
758 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
759 }
760}
761
762
763IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
764{
765 if (!(*pfEFlags & X86_EFL_CF))
766 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
767 else
768 {
769 uint8_t uDst = *puDst;
770 uint8_t uResult = uDst + uSrc + 1;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
773 }
774}
775
776# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
777
778/*
779 * SUB
780 */
781# if !defined(RT_ARCH_ARM64)
782
783IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
784{
785 uint64_t uDst = *puDst;
786 uint64_t uResult = uDst - uSrc;
787 *puDst = uResult;
788 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
789}
790
791# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
792
793IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
794{
795 uint32_t uDst = *puDst;
796 uint32_t uResult = uDst - uSrc;
797 *puDst = uResult;
798 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
799}
800
801
802IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
803{
804 uint16_t uDst = *puDst;
805 uint16_t uResult = uDst - uSrc;
806 *puDst = uResult;
807 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
808}
809
810
811IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
812{
813 uint8_t uDst = *puDst;
814 uint8_t uResult = uDst - uSrc;
815 *puDst = uResult;
816 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
817}
818
819# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
820# endif /* !RT_ARCH_ARM64 */
821
822/*
823 * SBB
824 */
825
826IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
827{
828 if (!(*pfEFlags & X86_EFL_CF))
829 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
830 else
831 {
832 uint64_t uDst = *puDst;
833 uint64_t uResult = uDst - uSrc - 1;
834 *puDst = uResult;
835 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
836 }
837}
838
839# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
840
841IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
842{
843 if (!(*pfEFlags & X86_EFL_CF))
844 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
845 else
846 {
847 uint32_t uDst = *puDst;
848 uint32_t uResult = uDst - uSrc - 1;
849 *puDst = uResult;
850 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
851 }
852}
853
854
855IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
856{
857 if (!(*pfEFlags & X86_EFL_CF))
858 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
859 else
860 {
861 uint16_t uDst = *puDst;
862 uint16_t uResult = uDst - uSrc - 1;
863 *puDst = uResult;
864 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
865 }
866}
867
868
869IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
870{
871 if (!(*pfEFlags & X86_EFL_CF))
872 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
873 else
874 {
875 uint8_t uDst = *puDst;
876 uint8_t uResult = uDst - uSrc - 1;
877 *puDst = uResult;
878 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
879 }
880}
881
882# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
883
884
885/*
886 * OR
887 */
888
889IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
890{
891 uint64_t uResult = *puDst | uSrc;
892 *puDst = uResult;
893 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
894}
895
896# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
897
898IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
899{
900 uint32_t uResult = *puDst | uSrc;
901 *puDst = uResult;
902 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
903}
904
905
906IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
907{
908 uint16_t uResult = *puDst | uSrc;
909 *puDst = uResult;
910 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
911}
912
913
914IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
915{
916 uint8_t uResult = *puDst | uSrc;
917 *puDst = uResult;
918 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
919}
920
921# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
922
923/*
924 * XOR
925 */
926
927IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
928{
929 uint64_t uResult = *puDst ^ uSrc;
930 *puDst = uResult;
931 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
932}
933
934# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
935
936IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
937{
938 uint32_t uResult = *puDst ^ uSrc;
939 *puDst = uResult;
940 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
941}
942
943
944IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
945{
946 uint16_t uResult = *puDst ^ uSrc;
947 *puDst = uResult;
948 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
949}
950
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
953{
954 uint8_t uResult = *puDst ^ uSrc;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
957}
958
959# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
960
961/*
962 * AND
963 */
964
965IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
966{
967 uint64_t const uResult = *puDst & uSrc;
968 *puDst = uResult;
969 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
970}
971
972# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
973
974IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
975{
976 uint32_t const uResult = *puDst & uSrc;
977 *puDst = uResult;
978 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
979}
980
981
982IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
983{
984 uint16_t const uResult = *puDst & uSrc;
985 *puDst = uResult;
986 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
987}
988
989
990IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
991{
992 uint8_t const uResult = *puDst & uSrc;
993 *puDst = uResult;
994 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
995}
996
997# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
998#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
999
1000/*
1001 * ANDN (BMI1 instruction)
1002 */
1003
1004IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1005{
1006 uint64_t const uResult = ~uSrc1 & uSrc2;
1007 *puDst = uResult;
1008 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
1009}
1010
1011
1012IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1013{
1014 uint32_t const uResult = ~uSrc1 & uSrc2;
1015 *puDst = uResult;
1016 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
1017}
1018
1019
1020#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/** 64-bit ANDN entry point for this build configuration; simply forwards to the C fallback. */
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
1025#endif
1026
1027
1028#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
/** 32-bit ANDN entry point for this build configuration; simply forwards to the C fallback. */
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
1033#endif
1034
1035#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1036
1037/*
1038 * CMP
1039 */
1040
1041IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1042{
1043 uint64_t uDstTmp = *puDst;
1044 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1045}
1046
1047# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1048
1049IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1050{
1051 uint32_t uDstTmp = *puDst;
1052 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1053}
1054
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint16_t uDstTmp = *puDst;
1059 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint8_t uDstTmp = *puDst;
1066 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1067}
1068
1069# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1070
1071/*
1072 * TEST
1073 */
1074
1075IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1076{
1077 uint64_t uResult = *puDst & uSrc;
1078 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
1079}
1080
1081# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1082
1083IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1084{
1085 uint32_t uResult = *puDst & uSrc;
1086 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
1087}
1088
1089
1090IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1091{
1092 uint16_t uResult = *puDst & uSrc;
1093 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
1094}
1095
1096
1097IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1098{
1099 uint8_t uResult = *puDst & uSrc;
1100 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
1101}
1102
1103# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1104
1105
1106/*
1107 * LOCK prefixed variants of the above
1108 */
1109
/**
 * Locked binary operand operation of any width, built as a compare-exchange
 * retry loop around the plain (unlocked) worker.
 *
 * Expects @c puDst, @c uSrc and @c pfEFlags to be in scope at the point of use.
 * The worker runs on private copies of the destination value and of EFLAGS;
 * EFLAGS is only committed once the compare-exchange has stored the result.
 *
 * @param   a_Mnemonic      Instruction mnemonic, selects iemAImpl_<mnemonic>_u<width>.
 * @param   a_cBitsWidth    Operand width in bits (pasted into type and function names).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        for (;;) \
        { \
            uint ## a_cBitsWidth ## _t uNew    = uExpected; \
            uint32_t                   fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            /* On failure uExpected is refreshed with the current memory value and we retry. */ \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
            { \
                *pfEFlags = fEflNew; \
                break; \
            } \
        } \
    } while (0)
1124
1125
/**
 * Emits the function iemAImpl_<mnemonic>_u<width>_locked, whose whole body is
 * DO_LOCKED_BIN_OP for the same mnemonic and width.
 *
 * @param   a_Mnemonic      Instruction mnemonic (pasted into the function name).
 * @param   a_cBitsWidth    Operand width in bits (pasted into names and types).
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint ## a_cBitsWidth ## _t uSrc, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1133
/* Locked ADD/ADC/SUB/SBB/OR/XOR/AND: the 64-bit variants are emitted wherever
   this section is compiled. */
EMIT_LOCKED_BIN_OP(add, 64)
EMIT_LOCKED_BIN_OP(adc, 64)
EMIT_LOCKED_BIN_OP(sub, 64)
EMIT_LOCKED_BIN_OP(sbb, 64)
EMIT_LOCKED_BIN_OP(or,  64)
EMIT_LOCKED_BIN_OP(xor, 64)
EMIT_LOCKED_BIN_OP(and, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/* The narrower widths are only emitted under the same condition as their unlocked workers above. */
EMIT_LOCKED_BIN_OP(add, 32)
EMIT_LOCKED_BIN_OP(adc, 32)
EMIT_LOCKED_BIN_OP(sub, 32)
EMIT_LOCKED_BIN_OP(sbb, 32)
EMIT_LOCKED_BIN_OP(or,  32)
EMIT_LOCKED_BIN_OP(xor, 32)
EMIT_LOCKED_BIN_OP(and, 32)

EMIT_LOCKED_BIN_OP(add, 16)
EMIT_LOCKED_BIN_OP(adc, 16)
EMIT_LOCKED_BIN_OP(sub, 16)
EMIT_LOCKED_BIN_OP(sbb, 16)
EMIT_LOCKED_BIN_OP(or,  16)
EMIT_LOCKED_BIN_OP(xor, 16)
EMIT_LOCKED_BIN_OP(and, 16)

EMIT_LOCKED_BIN_OP(add, 8)
EMIT_LOCKED_BIN_OP(adc, 8)
EMIT_LOCKED_BIN_OP(sub, 8)
EMIT_LOCKED_BIN_OP(sbb, 8)
EMIT_LOCKED_BIN_OP(or,  8)
EMIT_LOCKED_BIN_OP(xor, 8)
EMIT_LOCKED_BIN_OP(and, 8)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1166
1167
1168/*
1169 * Bit operations (same signature as above).
1170 */
1171
1172/*
1173 * BT
1174 */
1175
1176IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1177{
1178 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1179 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1180 Assert(uSrc < 64);
1181 uint64_t uDst = *puDst;
1182 if (uDst & RT_BIT_64(uSrc))
1183 *pfEFlags |= X86_EFL_CF;
1184 else
1185 *pfEFlags &= ~X86_EFL_CF;
1186}
1187
1188# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1189
1190IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1191{
1192 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1193 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1194 Assert(uSrc < 32);
1195 uint32_t uDst = *puDst;
1196 if (uDst & RT_BIT_32(uSrc))
1197 *pfEFlags |= X86_EFL_CF;
1198 else
1199 *pfEFlags &= ~X86_EFL_CF;
1200}
1201
1202IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1203{
1204 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1205 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1206 Assert(uSrc < 16);
1207 uint16_t uDst = *puDst;
1208 if (uDst & RT_BIT_32(uSrc))
1209 *pfEFlags |= X86_EFL_CF;
1210 else
1211 *pfEFlags &= ~X86_EFL_CF;
1212}
1213
1214# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1215
1216/*
1217 * BTC
1218 */
1219
1220IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1221{
1222 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1223 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1224 Assert(uSrc < 64);
1225 uint64_t fMask = RT_BIT_64(uSrc);
1226 uint64_t uDst = *puDst;
1227 if (uDst & fMask)
1228 {
1229 uDst &= ~fMask;
1230 *puDst = uDst;
1231 *pfEFlags |= X86_EFL_CF;
1232 }
1233 else
1234 {
1235 uDst |= fMask;
1236 *puDst = uDst;
1237 *pfEFlags &= ~X86_EFL_CF;
1238 }
1239}
1240
1241# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1242
1243IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1244{
1245 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1246 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1247 Assert(uSrc < 32);
1248 uint32_t fMask = RT_BIT_32(uSrc);
1249 uint32_t uDst = *puDst;
1250 if (uDst & fMask)
1251 {
1252 uDst &= ~fMask;
1253 *puDst = uDst;
1254 *pfEFlags |= X86_EFL_CF;
1255 }
1256 else
1257 {
1258 uDst |= fMask;
1259 *puDst = uDst;
1260 *pfEFlags &= ~X86_EFL_CF;
1261 }
1262}
1263
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1268 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1269 Assert(uSrc < 16);
1270 uint16_t fMask = RT_BIT_32(uSrc);
1271 uint16_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 {
1280 uDst |= fMask;
1281 *puDst = uDst;
1282 *pfEFlags &= ~X86_EFL_CF;
1283 }
1284}
1285
1286# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1287
1288/*
1289 * BTR
1290 */
1291
1292IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1293{
1294 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1295 logical operation (AND/OR/whatever). */
1296 Assert(uSrc < 64);
1297 uint64_t fMask = RT_BIT_64(uSrc);
1298 uint64_t uDst = *puDst;
1299 if (uDst & fMask)
1300 {
1301 uDst &= ~fMask;
1302 *puDst = uDst;
1303 *pfEFlags |= X86_EFL_CF;
1304 }
1305 else
1306 *pfEFlags &= ~X86_EFL_CF;
1307}
1308
1309# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1310
1311IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1312{
1313 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1314 logical operation (AND/OR/whatever). */
1315 Assert(uSrc < 32);
1316 uint32_t fMask = RT_BIT_32(uSrc);
1317 uint32_t uDst = *puDst;
1318 if (uDst & fMask)
1319 {
1320 uDst &= ~fMask;
1321 *puDst = uDst;
1322 *pfEFlags |= X86_EFL_CF;
1323 }
1324 else
1325 *pfEFlags &= ~X86_EFL_CF;
1326}
1327
1328
1329IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1330{
1331 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1332 logical operation (AND/OR/whatever). */
1333 Assert(uSrc < 16);
1334 uint16_t fMask = RT_BIT_32(uSrc);
1335 uint16_t uDst = *puDst;
1336 if (uDst & fMask)
1337 {
1338 uDst &= ~fMask;
1339 *puDst = uDst;
1340 *pfEFlags |= X86_EFL_CF;
1341 }
1342 else
1343 *pfEFlags &= ~X86_EFL_CF;
1344}
1345
1346# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1347
1348/*
1349 * BTS
1350 */
1351
1352IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1353{
1354 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1355 logical operation (AND/OR/whatever). */
1356 Assert(uSrc < 64);
1357 uint64_t fMask = RT_BIT_64(uSrc);
1358 uint64_t uDst = *puDst;
1359 if (uDst & fMask)
1360 *pfEFlags |= X86_EFL_CF;
1361 else
1362 {
1363 uDst |= fMask;
1364 *puDst = uDst;
1365 *pfEFlags &= ~X86_EFL_CF;
1366 }
1367}
1368
1369# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1370
1371IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1372{
1373 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1374 logical operation (AND/OR/whatever). */
1375 Assert(uSrc < 32);
1376 uint32_t fMask = RT_BIT_32(uSrc);
1377 uint32_t uDst = *puDst;
1378 if (uDst & fMask)
1379 *pfEFlags |= X86_EFL_CF;
1380 else
1381 {
1382 uDst |= fMask;
1383 *puDst = uDst;
1384 *pfEFlags &= ~X86_EFL_CF;
1385 }
1386}
1387
1388
1389IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1390{
1391 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1392 logical operation (AND/OR/whatever). */
1393 Assert(uSrc < 16);
1394 uint16_t fMask = RT_BIT_32(uSrc);
1395 uint32_t uDst = *puDst;
1396 if (uDst & fMask)
1397 *pfEFlags |= X86_EFL_CF;
1398 else
1399 {
1400 uDst |= fMask;
1401 *puDst = uDst;
1402 *pfEFlags &= ~X86_EFL_CF;
1403 }
1404}
1405
1406# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1407
1408
/* Locked BTC/BTR/BTS variants.  BT has no locked form here since it takes a
   const destination and never stores to it. */
EMIT_LOCKED_BIN_OP(btc, 64)
EMIT_LOCKED_BIN_OP(btr, 64)
EMIT_LOCKED_BIN_OP(bts, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/* The narrower widths are only emitted under the same condition as their unlocked workers above. */
EMIT_LOCKED_BIN_OP(btc, 32)
EMIT_LOCKED_BIN_OP(btr, 32)
EMIT_LOCKED_BIN_OP(bts, 32)

EMIT_LOCKED_BIN_OP(btc, 16)
EMIT_LOCKED_BIN_OP(btr, 16)
EMIT_LOCKED_BIN_OP(bts, 16)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1421
1422#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1423
1424/*
1425 * Helpers for BSR and BSF.
1426 *
1427 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other microarchitectures
 *       may produce different results (see https://www.sandpile.org/x86/flags.htm),
 *       but we restrict ourselves to emulating these recent microarchitectures.
1431 */
/**
 * Stores a BSF/BSR result and sets EFLAGS the Intel way.
 *
 * All six status flags are cleared first.  When a bit was found the zero-based
 * index is stored and PF is calculated from it; otherwise the destination is
 * left untouched and ZF + PF are set.
 *
 * @param   a_puDst     Where to store the zero-based bit index (only written when found).
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, zero if none (evaluated once).
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= IEM_EFL_CALC_PARITY(iBit); \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result and sets EFLAGS the AMD way.
 *
 * Only ZF is touched: cleared when a bit was found (and the zero-based index
 * stored), set when none was found (destination left untouched).
 *
 * @param   a_puDst     Where to store the zero-based bit index (only written when found).
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, zero if none (evaluated once).
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1454
1455/*
1456 * BSF - first (least significant) bit set
1457 */
1458#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1459IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1460{
1461 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1462}
1463#endif
1464
1465IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1466{
1467 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1468}
1469
1470IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1471{
1472 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1473}
1474
1475#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1476IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1477{
1478 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1479}
1480#endif
1481
1482IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1483{
1484 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1485}
1486
1487IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1488{
1489 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1490}
1491
1492
1493#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1494IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1495{
1496 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1497}
1498#endif
1499
1500IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1501{
1502 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1503}
1504
1505IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1506{
1507 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1508}
1509
1510
1511
1512/*
1513 * BSR - last (most significant) bit set
1514 */
1515#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1516IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1517{
1518 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1519}
1520#endif
1521
1522IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1523{
1524 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1525}
1526
1527IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1528{
1529 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1530}
1531
1532
1533#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1534IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1535{
1536 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1537}
1538#endif
1539
1540IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1541{
1542 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1543}
1544
1545IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1546{
1547 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1548}
1549
1550
1551#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1552IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1553{
1554 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1555}
1556#endif
1557
1558IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1559{
1560 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1561}
1562
1563IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1564{
1565 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1566}
1567
1568
1569/*
1570 * Helpers for LZCNT and TZCNT.
1571 */
/**
 * Stores an LZCNT/TZCNT result and sets EFLAGS the Intel way.
 *
 * All six status flags are cleared first.  PF is calculated from a non-zero
 * count, a zero count sets ZF + PF, and CF is set when the source was zero.
 *
 * @param   a_puDst     Where to store the count (always written).
 * @param   a_uSrc      The source operand; only tested for zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The bit count (evaluated once).
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and sets EFLAGS the AMD way.
 *
 * Only ZF and CF are recalculated: ZF is set for a zero count and CF when the
 * source was zero; all other flags are preserved.
 *
 * @param   a_puDst     Where to store the count (always written).
 * @param   a_uSrc      The source operand; only tested for zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The bit count (evaluated once).
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1594
1595
1596/*
1597 * LZCNT - count leading zero bits.
1598 */
1599#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
/** 64-bit LZCNT, default variant: forwards to the Intel-flavoured implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
}
1604#endif
1605
1606IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1607{
1608 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1609}
1610
1611IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1612{
1613 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1614}
1615
1616
1617#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
/** 32-bit LZCNT, default variant: forwards to the Intel-flavoured implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
}
1622#endif
1623
1624IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1625{
1626 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1627}
1628
1629IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1630{
1631 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1632}
1633
1634
1635#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
/** 16-bit LZCNT, default variant: forwards to the Intel-flavoured implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
}
1640#endif
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1650}
1651
1652
1653/*
 * TZCNT - count trailing zero bits.
1655 */
1656#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
/** 64-bit TZCNT, default variant: forwards to the Intel-flavoured implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
}
1661#endif
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1666}
1667
1668IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1669{
1670 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1671}
1672
1673
1674#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
/** 32-bit TZCNT, default variant: forwards to the Intel-flavoured implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
}
1679#endif
1680
1681IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1682{
1683 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1684}
1685
1686IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1687{
1688 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1689}
1690
1691
1692#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
/** 16-bit TZCNT, default variant: forwards to the Intel-flavoured implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
}
1697#endif
1698
1699IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1700{
1701 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1702}
1703
1704IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1705{
1706 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1707}
1708
1709
1710
1711/*
1712 * BEXTR (BMI1 instruction)
1713 */
1714#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1715IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1716 a_Type uSrc2, uint32_t *pfEFlags)) \
1717{ \
1718 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1719 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1720 a_Type uResult; \
1721 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1722 if (iFirstBit < a_cBits) \
1723 { \
1724 uResult = uSrc1 >> iFirstBit; \
1725 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1726 if (cBits < a_cBits) \
1727 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1728 *puDst = uResult; \
1729 if (!uResult) \
1730 fEfl |= X86_EFL_ZF; \
1731 } \
1732 else \
1733 { \
1734 *puDst = uResult = 0; \
1735 fEfl |= X86_EFL_ZF; \
1736 } \
1737 /** @todo complete flag calculations. */ \
1738 *pfEFlags = fEfl; \
1739}
1740
1741EMIT_BEXTR(64, uint64_t, _fallback)
1742EMIT_BEXTR(32, uint32_t, _fallback)
1743#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1744EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1745#endif
1746#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1747EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1748#endif
1749
1750/*
1751 * BLSR (BMI1 instruction)
1752 */
1753#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1754IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1755{ \
1756 uint32_t fEfl1 = *pfEFlags; \
1757 uint32_t fEfl2 = fEfl1; \
1758 *puDst = uSrc; \
1759 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1760 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1761 \
1762 /* AMD: The carry flag is from the SUB operation. */ \
1763 /* 10890xe: PF always cleared? */ \
1764 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1765 fEfl2 |= fEfl1 & X86_EFL_CF; \
1766 *pfEFlags = fEfl2; \
1767}
1768
1769EMIT_BLSR(64, uint64_t, _fallback)
1770EMIT_BLSR(32, uint32_t, _fallback)
1771#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1772EMIT_BLSR(64, uint64_t, RT_NOTHING)
1773#endif
1774#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1775EMIT_BLSR(32, uint32_t, RT_NOTHING)
1776#endif
1777
1778/*
1779 * BLSMSK (BMI1 instruction)
1780 */
1781#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1782IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1783{ \
1784 uint32_t fEfl1 = *pfEFlags; \
1785 uint32_t fEfl2 = fEfl1; \
1786 *puDst = uSrc; \
1787 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1788 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1789 \
1790 /* AMD: The carry flag is from the SUB operation. */ \
1791 /* 10890xe: PF always cleared? */ \
1792 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1793 fEfl2 |= fEfl1 & X86_EFL_CF; \
1794 *pfEFlags = fEfl2; \
1795}
1796
1797EMIT_BLSMSK(64, uint64_t, _fallback)
1798EMIT_BLSMSK(32, uint32_t, _fallback)
1799#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1800EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1801#endif
1802#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1803EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1804#endif
1805
1806/*
1807 * BLSI (BMI1 instruction)
1808 */
1809#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1810IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1811{ \
1812 uint32_t fEfl1 = *pfEFlags; \
1813 uint32_t fEfl2 = fEfl1; \
1814 *puDst = uSrc; \
1815 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1816 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1817 \
1818 /* AMD: The carry flag is from the SUB operation. */ \
1819 /* 10890xe: PF always cleared? */ \
1820 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1821 fEfl2 |= fEfl1 & X86_EFL_CF; \
1822 *pfEFlags = fEfl2; \
1823}
1824
1825EMIT_BLSI(64, uint64_t, _fallback)
1826EMIT_BLSI(32, uint32_t, _fallback)
1827#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1828EMIT_BLSI(64, uint64_t, RT_NOTHING)
1829#endif
1830#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1831EMIT_BLSI(32, uint32_t, RT_NOTHING)
1832#endif
1833
1834/*
1835 * BZHI (BMI2 instruction)
1836 */
1837#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1838IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1839 a_Type uSrc2, uint32_t *pfEFlags)) \
1840{ \
1841 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1842 a_Type uResult; \
1843 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1844 if (iFirstBit < a_cBits) \
1845 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1846 else \
1847 { \
1848 uResult = uSrc1; \
1849 fEfl |= X86_EFL_CF; \
1850 } \
1851 *puDst = uResult; \
1852 fEfl |= X86_EFL_CALC_ZF(uResult); \
1853 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1854 *pfEFlags = fEfl; \
1855}
1856
1857EMIT_BZHI(64, uint64_t, _fallback)
1858EMIT_BZHI(32, uint32_t, _fallback)
1859#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1860EMIT_BZHI(64, uint64_t, RT_NOTHING)
1861#endif
1862#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1863EMIT_BZHI(32, uint32_t, RT_NOTHING)
1864#endif
1865
1866/*
1867 * POPCNT
1868 */
/**
 * Bit count lookup table for 6-bit values: entry i holds the number of set
 * bits in i (0..63).  The iemPopCountU16/U32/U64 helpers index it with values
 * masked by 0x3f.
 */
RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
};
1876
1877/** @todo Use native popcount where possible and employ some more efficient
1878 * algorithm here (or in asm.h fallback)! */
1879
1880DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1881{
1882 return g_abBitCounts6[ u16 & 0x3f]
1883 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1884 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1885}
1886
1887DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1888{
1889 return g_abBitCounts6[ u32 & 0x3f]
1890 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1891 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1892 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1893 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1894 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1895}
1896
1897DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1898{
1899 return g_abBitCounts6[ u64 & 0x3f]
1900 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1901 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1902 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1903 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1904 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1905 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1906 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1907 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1908 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1909 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1910}
1911
1912#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1913IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1914{ \
1915 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1916 a_Type uResult; \
1917 if (uSrc) \
1918 uResult = iemPopCountU ## a_cBits(uSrc); \
1919 else \
1920 { \
1921 fEfl |= X86_EFL_ZF; \
1922 uResult = 0; \
1923 } \
1924 *puDst = uResult; \
1925 *pfEFlags = fEfl; \
1926}
1927
1928EMIT_POPCNT(64, uint64_t, _fallback)
1929EMIT_POPCNT(32, uint32_t, _fallback)
1930EMIT_POPCNT(16, uint16_t, _fallback)
1931#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1932EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1933#endif
1934#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1935EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1936EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1937#endif
1938
1939
1940#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1941
1942/*
1943 * XCHG
1944 */
1945
1946IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1947{
1948#if ARCH_BITS >= 64
1949 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1950#else
1951 uint64_t uOldMem = *puMem;
1952 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1953 ASMNopPause();
1954 *puReg = uOldMem;
1955#endif
1956}
1957
1958# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1959
1960IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1961{
1962 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1963}
1964
1965
1966IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1967{
1968 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1969}
1970
1971
1972IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1973{
1974 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1975}
1976
1977# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1978
1979
1980/* Unlocked variants for fDisregardLock mode: */
1981
1982IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1983{
1984 uint64_t const uOld = *puMem;
1985 *puMem = *puReg;
1986 *puReg = uOld;
1987}
1988
1989# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1990
1991IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1992{
1993 uint32_t const uOld = *puMem;
1994 *puMem = *puReg;
1995 *puReg = uOld;
1996}
1997
1998
1999IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
2000{
2001 uint16_t const uOld = *puMem;
2002 *puMem = *puReg;
2003 *puReg = uOld;
2004}
2005
2006
2007IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
2008{
2009 uint8_t const uOld = *puMem;
2010 *puMem = *puReg;
2011 *puReg = uOld;
2012}
2013
2014# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2015
2016
2017/*
2018 * XADD and LOCK XADD.
2019 */
2020#define EMIT_XADD(a_cBitsWidth, a_Type) \
2021IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2022{ \
2023 a_Type uDst = *puDst; \
2024 a_Type uResult = uDst; \
2025 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
2026 *puDst = uResult; \
2027 *puReg = uDst; \
2028} \
2029\
2030IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2031{ \
2032 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2033 a_Type uResult; \
2034 uint32_t fEflTmp; \
2035 do \
2036 { \
2037 uResult = uOld; \
2038 fEflTmp = *pfEFlags; \
2039 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2040 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2041 *puReg = uOld; \
2042 *pfEFlags = fEflTmp; \
2043}
2044EMIT_XADD(64, uint64_t)
2045# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2046EMIT_XADD(32, uint32_t)
2047EMIT_XADD(16, uint16_t)
2048EMIT_XADD(8, uint8_t)
2049# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2050
2051#endif
2052
2053/*
2054 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2055 *
2056 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2057 * instructions are emulated as locked.
2058 */
2059#if defined(IEM_WITHOUT_ASSEMBLY)
2060
2061IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2062{
2063 uint8_t uOld = *puAl;
2064 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2065 Assert(*puAl == uOld);
2066 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2067}
2068
2069
2070IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2071{
2072 uint16_t uOld = *puAx;
2073 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2074 Assert(*puAx == uOld);
2075 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2076}
2077
2078
2079IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2080{
2081 uint32_t uOld = *puEax;
2082 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2083 Assert(*puEax == uOld);
2084 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2085}
2086
2087
2088# if ARCH_BITS == 32
2089IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2090# else
2091IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2092# endif
2093{
2094# if ARCH_BITS == 32
2095 uint64_t const uSrcReg = *puSrcReg;
2096# endif
2097 uint64_t uOld = *puRax;
2098 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2099 Assert(*puRax == uOld);
2100 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2101}
2102
2103
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2105 uint32_t *pEFlags))
2106{
2107 uint64_t const uNew = pu64EbxEcx->u;
2108 uint64_t const uOld = pu64EaxEdx->u;
2109 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2110 {
2111 Assert(pu64EaxEdx->u == uOld);
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 *pEFlags &= ~X86_EFL_ZF;
2116}
2117
2118
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B (only built for AMD64 and ARM64 hosts).
 *
 * Performs a 128-bit atomic compare-exchange of *pu128Dst against the RDX:RAX
 * pair with the RCX:RBX pair as the replacement, using
 * ASMAtomicCmpXchgU128v2 on AMD64 and ASMAtomicCmpXchgU128 otherwise.  ZF is
 * set when the exchange took place and cleared otherwise.
 *
 * @param   pu128Dst        The destination operand.
 * @param   pu128RaxRdx     The comparand pair; also passed as the old-value output.
 * @param   pu128RbxRcx     The replacement pair.
 * @param   pEFlags         EFLAGS to update (ZF only).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx;  /* Only needed by the assertion below. */
#  endif
    bool fSwapped;
#  if defined(RT_ARCH_AMD64)
    fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                                      &pu128RaxRdx->u);
#  else
    fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fSwapped)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2140
2141#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2142
2143# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2144IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2145 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2146{
2147 RTUINT128U u128Tmp = *pu128Dst;
2148 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2149 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2150 {
2151 *pu128Dst = *pu128RbxRcx;
2152 *pEFlags |= X86_EFL_ZF;
2153 }
2154 else
2155 {
2156 *pu128RaxRdx = u128Tmp;
2157 *pEFlags &= ~X86_EFL_ZF;
2158 }
2159}
2160#endif /* !RT_ARCH_ARM64 */
2161
2162#if defined(IEM_WITHOUT_ASSEMBLY)
2163
2164/* Unlocked versions mapped to the locked ones: */
2165
2166IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2167{
2168 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2169}
2170
2171
2172IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2173{
2174# if 0
2175 /* If correctly aligned, used the locked variation. */
2176 if (!((uintptr_t)pu16Dst & 1))
2177 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2178 else
2179# endif
2180 {
2181 /* Otherwise emulate it as best as we can. */
2182 uint16_t const uOld = *puAx;
2183 uint16_t const uDst = *pu16Dst;
2184 if (uOld == uDst)
2185 {
2186 *pu16Dst = uSrcReg;
2187 iemAImpl_cmp_u16(&uOld, uOld, pEFlags);
2188 }
2189 else
2190 {
2191 *puAx = uDst;
2192 iemAImpl_cmp_u16(&uOld, uDst, pEFlags);
2193 }
2194 }
2195}
2196
2197
2198IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2199{
2200# if 0
2201 /* If correctly aligned, used the locked variation. */
2202 if (!((uintptr_t)pu32Dst & 3))
2203 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2204 else
2205# endif
2206 {
2207 /* Otherwise emulate it as best as we can. */
2208 uint32_t const uOld = *puEax;
2209 uint32_t const uDst = *pu32Dst;
2210 if (uOld == uDst)
2211 {
2212 *pu32Dst = uSrcReg;
2213 iemAImpl_cmp_u32(&uOld, uOld, pEFlags);
2214 }
2215 else
2216 {
2217 *puEax = uDst;
2218 iemAImpl_cmp_u32(&uOld, uDst, pEFlags);
2219 }
2220 }
2221}
2222
2223
2224# if ARCH_BITS == 32
2225IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2226{
2227# if 0
2228 /* If correctly aligned, used the locked variation. */
2229 if (!((uintptr_t)pu32Dst & 7))
2230 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2231 else
2232# endif
2233 {
2234 /* Otherwise emulate it as best as we can. */
2235 uint64_t const uOld = *puRax;
2236 uint64_t const uSrc = *puSrcReg;
2237 uint64_t const uDst = *pu64Dst;
2238 if (uOld == uDst)
2239 {
2240 *pu64Dst = uSrc;
2241 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2242 }
2243 else
2244 {
2245 *puRax = uDst;
2246 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2247 }
2248 }
2249}
2250# else /* ARCH_BITS != 32 */
2251IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2252{
2253# if 0
2254 /* If correctly aligned, used the locked variation. */
2255 if (!((uintptr_t)pu64Dst & 7))
2256 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2257 else
2258# endif
2259 {
2260 /* Otherwise emulate it as best as we can. */
2261 uint64_t const uOld = *puRax;
2262 uint64_t const uDst = *pu64Dst;
2263 if (uOld == uDst)
2264 {
2265 *pu64Dst = uSrcReg;
2266 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2267 }
2268 else
2269 {
2270 *puRax = uDst;
2271 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2272 }
2273 }
2274}
2275# endif /* ARCH_BITS != 32 */
2276
2277
2278IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2279{
2280# if 0
2281 /* If correctly aligned, used the locked variation. */
2282 if (!((uintptr_t)pu64Dst & 7))
2283 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2284 else
2285# endif
2286 {
2287 /* Otherwise emulate it as best as we can. */
2288 uint64_t const uNew = pu64EbxEcx->u;
2289 uint64_t const uOld = pu64EaxEdx->u;
2290 uint64_t const uDst = *pu64Dst;
2291 if (uDst == uOld)
2292 {
2293 *pu64Dst = uNew;
2294 *pEFlags |= X86_EFL_ZF;
2295 }
2296 else
2297 {
2298 pu64EaxEdx->u = uDst;
2299 *pEFlags &= ~X86_EFL_ZF;
2300 }
2301 }
2302}
2303
2304
2305IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2306 uint32_t *pEFlags))
2307{
2308# if 0
2309 /* If correctly aligned, used the locked variation. */
2310 if (!((uintptr_t)pu64Dst & 15))
2311 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2312 else
2313# endif
2314 {
2315 /* Otherwise emulate it as best as we can. */
2316# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2317 uint128_t const uNew = pu128RbxRcx->u;
2318 uint128_t const uOld = pu128RaxRdx->u;
2319 uint128_t const uDst = pu128Dst->u;
2320 if (uDst == uOld)
2321 {
2322 pu128Dst->u = uNew;
2323 *pEFlags |= X86_EFL_ZF;
2324 }
2325 else
2326 {
2327 pu128RaxRdx->u = uDst;
2328 *pEFlags &= ~X86_EFL_ZF;
2329 }
2330# else
2331 RTUINT128U const uNew = *pu128RbxRcx;
2332 RTUINT128U const uOld = *pu128RaxRdx;
2333 RTUINT128U const uDst = *pu128Dst;
2334 if ( uDst.s.Lo == uOld.s.Lo
2335 && uDst.s.Hi == uOld.s.Hi)
2336 {
2337 *pu128Dst = uNew;
2338 *pEFlags |= X86_EFL_ZF;
2339 }
2340 else
2341 {
2342 *pu128RaxRdx = uDst;
2343 *pEFlags &= ~X86_EFL_ZF;
2344 }
2345# endif
2346 }
2347}
2348
2349#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2350
2351#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2352 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2353
2354/*
2355 * MUL, IMUL, DIV and IDIV helpers.
2356 *
2357 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2358 * division step so we can select between using C operators and
2359 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2360 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all the input in AX too. This means we have to abstract some
 *   input loads and the result storing.
2364 */
2365
2366DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2367{
2368# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2369 pQuotient->s.Lo = 0;
2370 pQuotient->s.Hi = 0;
2371# endif
2372 RTUINT128U Divisor;
2373 Divisor.s.Lo = u64Divisor;
2374 Divisor.s.Hi = 0;
2375 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2376}
2377
/*
 * Building blocks plugged into the EMIT_MUL / EMIT_IMUL / EMIT_DIV / EMIT_IDIV
 * templates.  They rely on the parameter names of the emitted functions:
 * puA / puD for the generic forms and puAX for the 8-bit forms.
 *
 * Every macro expands to an expression without a trailing semicolon, so it
 * can be used as an ordinary statement in any context (including an unbraced
 * if/else), and all macro arguments are parenthesized.
 */

/* Load the double-width dividend from *puD:*puA, or from *puAX for 8-bit. */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Store quotient and remainder; the 8-bit form packs the quotient into the
   low byte and the remainder into the high byte of *puAX. */
# define DIV_STORE(a_Quotient, a_uRemainder)    *puA  = (a_Quotient), *puD = (a_uRemainder)
# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/* Load the implicit first factor: *puA, or the low byte of *puAX for 8-bit. */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Store the double-width product into *puD:*puA, or into *puAX for 8-bit. */
# define MUL_STORE(a_Result)                    *puA  = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)                 *puAX = (a_Result).u

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division of a double-width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2407
2408
2409/*
2410 * MUL
2411 */
2412# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2413IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2414{ \
2415 RTUINT ## a_cBitsWidth2x ## U Result; \
2416 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2417 a_fnStore(Result); \
2418 \
2419 /* Calc EFLAGS: */ \
2420 uint32_t fEfl = *pfEFlags; \
2421 if (a_fIntelFlags) \
2422 { /* Intel: 6700K and 10980XE behavior */ \
2423 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2424 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2425 fEfl |= X86_EFL_SF; \
2426 fEfl |= IEM_EFL_CALC_PARITY(Result.s.Lo); \
2427 if (Result.s.Hi != 0) \
2428 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2429 } \
2430 else \
2431 { /* AMD: 3990X */ \
2432 if (Result.s.Hi != 0) \
2433 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2434 else \
2435 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2436 } \
2437 *pfEFlags = fEfl; \
2438 return 0; \
2439} \
2440
2441# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2442 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2443 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2444 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2445
2446# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2447EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2448 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2449# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2450EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2451 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2452EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2453 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2454EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2455 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2456# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2457# endif /* !DOXYGEN_RUNNING */
2458
2459/*
2460 * MULX
2461 */
2462# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2463IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2464 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2465{ \
2466 RTUINT ## a_cBitsWidth2x ## U Result; \
2467 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2468 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2469 *puDst1 = Result.s.Hi; \
2470} \
2471
2472# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2473EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2474EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2475# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2476EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2477EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2478# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2479# endif /* !DOXYGEN_RUNNING */
2480
2481
2482/*
2483 * IMUL
2484 *
2485 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2486 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2487 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2488 */
2489# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2490 a_Suffix, a_fIntelFlags) \
2491IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2492{ \
2493 RTUINT ## a_cBitsWidth2x ## U Result; \
2494 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2495 \
2496 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2497 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2498 { \
2499 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2500 { \
2501 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2502 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2503 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2504 } \
2505 else \
2506 { \
2507 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2508 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2509 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2510 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2511 a_fnNeg(Result, a_cBitsWidth2x); \
2512 } \
2513 } \
2514 else \
2515 { \
2516 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2517 { \
2518 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2519 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2520 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2521 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2522 a_fnNeg(Result, a_cBitsWidth2x); \
2523 } \
2524 else \
2525 { \
2526 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2527 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2528 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2529 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2530 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2531 } \
2532 } \
2533 a_fnStore(Result); \
2534 \
2535 if (a_fIntelFlags) \
2536 { \
2537 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2538 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2539 fEfl |= X86_EFL_SF; \
2540 fEfl |= IEM_EFL_CALC_PARITY(Result.s.Lo & 0xff); \
2541 } \
2542 *pfEFlags = fEfl; \
2543 return 0; \
2544}
2545# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2546 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2547 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2548 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2549
2550# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2551EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2552 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2553# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2554EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2555 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2556EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2557 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2558EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2559 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2560# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2561# endif /* !DOXYGEN_RUNNING */
2562
2563
2564/*
 * IMUL with two operands is mapped onto the three operand variant, ignoring
 * the high part of the product.
2567 */
2568# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2569IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2570{ \
2571 a_uType uIgn; \
2572 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2573} \
2574\
2575IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2576{ \
2577 a_uType uIgn; \
2578 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2579} \
2580\
2581IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2582{ \
2583 a_uType uIgn; \
2584 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2585}
2586
2587EMIT_IMUL_TWO(64, uint64_t)
2588# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2589EMIT_IMUL_TWO(32, uint32_t)
2590EMIT_IMUL_TWO(16, uint16_t)
2591# endif
2592
2593
2594/*
2595 * DIV
2596 */
2597# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2598 a_Suffix, a_fIntelFlags) \
2599IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2600{ \
2601 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2602 a_fnLoad(Dividend); \
2603 if ( uDivisor != 0 \
2604 && Dividend.s.Hi < uDivisor) \
2605 { \
2606 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2607 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2608 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2609 \
2610 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2611 if (!a_fIntelFlags) \
2612 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2613 return 0; \
2614 } \
2615 /* #DE */ \
2616 return -1; \
2617}
2618# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2619 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2620 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2621 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2622
2623# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2624EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2625 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2626# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2627EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2628 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2629EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2630 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2631EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2632 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2633# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2634# endif /* !DOXYGEN_RUNNING */
2635
2636
2637/*
2638 * IDIV
2639 *
2640 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2641 * set AF and clear PF, ZF and SF just like it does for DIV.
2642 *
2643 */
/*
 * EMIT_IDIV_INNER emits one signed IDIV worker, iemAImpl_idiv_u<width><suffix>.
 *
 * The division is done on magnitudes: a negative dividend is negated
 * (a_fnNeg), a negative divisor is negated, and the unsigned quotient and
 * remainder are computed (a_fnDivRem).  The quotient is negative when the
 * operand signs differ, and the remainder takes the sign of the dividend.
 * The worker returns -1 (#DE) for a zero divisor or when the quotient
 * magnitude exceeds the signed single-width range (at most 2^(width-1) for a
 * negative quotient, INT<width>_MAX otherwise); else it stores the results
 * (a_fnStore) and returns 0.  With a_fIntelFlags EFLAGS are left alone;
 * without it AF is set and PF, ZF and SF are cleared.
 *
 * a_CallArgs is accepted but not used.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division, remembering the operand signs. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fNegDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fNegDividend) \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    \
    bool const fNegDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t uDivisorAbs = uDivisor; \
    if (fNegDivisor) \
        uDivisorAbs = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorAbs); \
    \
    /* \
     * Check for overflow: a negative quotient may have a magnitude of up to \
     * 2^(width-1), a positive one only up to the signed maximum. \
     */ \
    bool const     fNegQuotient = fNegDividend != fNegDivisor; \
    uint64_t const uQuotientMax = fNegQuotient \
                                ? RT_BIT_64(a_cBitsWidth - 1) \
                                : (uint64_t)INT ## a_cBitsWidth ## _MAX; \
    if (Quotient.s.Hi != 0 || Quotient.s.Lo > uQuotientMax) \
        return -1; /* #DE */ \
    \
    /* \
     * Apply the signs and store: the remainder follows the dividend's sign. \
     */ \
    uint ## a_cBitsWidth ## _t uQuotientOut = Quotient.s.Lo; \
    if (fNegQuotient) \
        uQuotientOut = UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo; \
    uint ## a_cBitsWidth ## _t uRemainderOut = Remainder.s.Lo; \
    if (fNegDividend) \
        uRemainderOut = UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo; \
    a_fnStore(uQuotientOut, uRemainderOut); \
    \
    /* AMD sets AF and clears PF, ZF and SF; Intel leaves EFLAGS as they are. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/** Emits the three flavours of an IDIV worker through EMIT_IDIV_INNER: the
 *  unsuffixed and the _intel one with a_fIntelFlags=1, and the _amd one with
 *  a_fIntelFlags=0. */
# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2732
# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* The 64-bit workers use the 128-bit negate and divide helpers; the narrower
   ones use the plain helpers and are skipped when RT_ARCH_X86 is defined,
   unless IEM_WITHOUT_ASSEMBLY is defined as well.  The 8-bit worker takes AX
   as a single 16-bit operand and therefore has its own load/store macros. */
EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
          DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2745
2746#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2747
2748
2749/*********************************************************************************************************************************
2750* Unary operations. *
2751*********************************************************************************************************************************/
2752#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2753
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * Expands to a statement, not an expression: the updated flags are written
 * back through a_pfEFlags.
 *
 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
 * @param a_uResult Unsigned result value.
 * @param a_uDst The original destination value (for AF calc).
 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        /* Clear the status bits, but keep CF by OR-ing it back into the mask. */ \
        fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
        fEflTmp |= IEM_EFL_CALC_PARITY(a_uResult); \
        /* AF: set when bit 4 differs between the original value and the result. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: for INC the expression has the sign bit set when the original was \
           non-negative and the result is negative; for DEC when the original \
           was negative and the result is non-negative. \
           NOTE(review): assumes X86_EFL_GET_OF_<width> picks the sign bit - confirm. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2779
2780/*
2781 * INC
2782 */
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint64_t uDst = *puDst;
2787 uint64_t uResult = uDst + 1;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2790}
2791
2792# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2793
2794IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2795{
2796 uint32_t uDst = *puDst;
2797 uint32_t uResult = uDst + 1;
2798 *puDst = uResult;
2799 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2800}
2801
2802
2803IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2804{
2805 uint16_t uDst = *puDst;
2806 uint16_t uResult = uDst + 1;
2807 *puDst = uResult;
2808 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2809}
2810
2811IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2812{
2813 uint8_t uDst = *puDst;
2814 uint8_t uResult = uDst + 1;
2815 *puDst = uResult;
2816 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2817}
2818
2819# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2820
2821
2822/*
2823 * DEC
2824 */
2825
2826IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2827{
2828 uint64_t uDst = *puDst;
2829 uint64_t uResult = uDst - 1;
2830 *puDst = uResult;
2831 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2832}
2833
2834# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2835
2836IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2837{
2838 uint32_t uDst = *puDst;
2839 uint32_t uResult = uDst - 1;
2840 *puDst = uResult;
2841 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2842}
2843
2844
2845IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2846{
2847 uint16_t uDst = *puDst;
2848 uint16_t uResult = uDst - 1;
2849 *puDst = uResult;
2850 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2851}
2852
2853
2854IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2855{
2856 uint8_t uDst = *puDst;
2857 uint8_t uResult = uDst - 1;
2858 *puDst = uResult;
2859 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2860}
2861
2862# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2863
2864
2865/*
2866 * NOT
2867 */
2868
2869IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2870{
2871 uint64_t uDst = *puDst;
2872 uint64_t uResult = ~uDst;
2873 *puDst = uResult;
2874 /* EFLAGS are not modified. */
2875 RT_NOREF_PV(pfEFlags);
2876}
2877
2878# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2879
2880IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2881{
2882 uint32_t uDst = *puDst;
2883 uint32_t uResult = ~uDst;
2884 *puDst = uResult;
2885 /* EFLAGS are not modified. */
2886 RT_NOREF_PV(pfEFlags);
2887}
2888
2889IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2890{
2891 uint16_t uDst = *puDst;
2892 uint16_t uResult = ~uDst;
2893 *puDst = uResult;
2894 /* EFLAGS are not modified. */
2895 RT_NOREF_PV(pfEFlags);
2896}
2897
2898IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2899{
2900 uint8_t uDst = *puDst;
2901 uint8_t uResult = ~uDst;
2902 *puDst = uResult;
2903 /* EFLAGS are not modified. */
2904 RT_NOREF_PV(pfEFlags);
2905}
2906
2907# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2908
2909
2910/*
2911 * NEG
2912 */
2913
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
 *
 * Expands to a statement, not an expression: the updated flags are written
 * back through a_pfEFlags.
 *
 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
 * @param a_uResult Unsigned result value.
 * @param a_uDst The original destination value (for AF calc).
 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
        /* CF is set for every non-zero operand. */ \
        fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflTmp |= IEM_EFL_CALC_PARITY(a_uResult); \
        /* AF: set when bit 4 differs between the operand and the result. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: operand and result share a set sign bit only for the most \
           negative value, which negates to itself. \
           NOTE(review): assumes X86_EFL_GET_OF_<width> picks the sign bit - confirm. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2935
2936IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2937{
2938 uint64_t uDst = *puDst;
2939 uint64_t uResult = (uint64_t)0 - uDst;
2940 *puDst = uResult;
2941 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2942}
2943
2944# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2945
2946IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2947{
2948 uint32_t uDst = *puDst;
2949 uint32_t uResult = (uint32_t)0 - uDst;
2950 *puDst = uResult;
2951 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2952}
2953
2954
2955IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2956{
2957 uint16_t uDst = *puDst;
2958 uint16_t uResult = (uint16_t)0 - uDst;
2959 *puDst = uResult;
2960 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2961}
2962
2963
2964IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2965{
2966 uint8_t uDst = *puDst;
2967 uint8_t uResult = (uint8_t)0 - uDst;
2968 *puDst = uResult;
2969 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2970}
2971
2972# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2973
2974/*
2975 * Locked variants.
2976 */
2977
/**
 * Emit a function for doing a locked unary operand operation.
 *
 * The emitted iemAImpl_<a_Mnemonic>_u<a_cBitsWidth>_locked function is a retry
 * loop around the non-locked worker: the worker runs on private copies of the
 * value and of EFLAGS, and the loop repeats until ASMAtomicCmpXchgExU<n>
 * reports that the new value was stored.  *pfEFlags is written once, after
 * the loop, with the flags from the iteration that succeeded.
 *
 * NOTE(review): relies on ASMAtomicCmpXchgExU<n> storing the value it found in
 * *puDst into uOld when the exchange fails - confirm against iprt/asm.h.
 *
 * @param   a_Mnemonic      Instruction mnemonic used in the function names.
 * @param   a_cBitsWidth    Operand width in bits.
 */
# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            /* Start every attempt from the latest value and the caller's flags. */ \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    }
2994
/* Locked INC/DEC/NOT/NEG workers.  The 64-bit ones are always emitted inside
   the enclosing conditional; the narrower ones are skipped when RT_ARCH_X86
   is defined, unless IEM_WITHOUT_ASSEMBLY is defined as well. */
EMIT_LOCKED_UNARY_OP(inc, 64)
EMIT_LOCKED_UNARY_OP(dec, 64)
EMIT_LOCKED_UNARY_OP(not, 64)
EMIT_LOCKED_UNARY_OP(neg, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_UNARY_OP(inc, 32)
EMIT_LOCKED_UNARY_OP(dec, 32)
EMIT_LOCKED_UNARY_OP(not, 32)
EMIT_LOCKED_UNARY_OP(neg, 32)

EMIT_LOCKED_UNARY_OP(inc, 16)
EMIT_LOCKED_UNARY_OP(dec, 16)
EMIT_LOCKED_UNARY_OP(not, 16)
EMIT_LOCKED_UNARY_OP(neg, 16)

EMIT_LOCKED_UNARY_OP(inc, 8)
EMIT_LOCKED_UNARY_OP(dec, 8)
EMIT_LOCKED_UNARY_OP(not, 8)
EMIT_LOCKED_UNARY_OP(neg, 8)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3015
3016#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
3017
3018
3019/*********************************************************************************************************************************
3020* Shifting and Rotating *
3021*********************************************************************************************************************************/
3022
/*
 * ROL
 */
/**
 * Emits a rotate-left worker named iemAImpl_rol_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise.
 * A masked count of zero leaves both *puDst and *pfEFlags untouched.  For 8
 * and 16-bit operands the count is then reduced to the operand width, so a
 * count equal to the width leaves the value unchanged but still updates CF
 * and OF.  Only CF and OF are modified.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero: OF is derived from the original value
 *                          (first sub-shift).  Zero: OF is derived from the
 *                          result (last sub-shift).
 * @param   a_fnHlp         Rotate helper invoked as a_fnHlp(value, count).
 */
#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is bit 0 of the result, i.e. the bit that was rotated around last. */ \
        uint32_t const fCarry = (uResult & X86_EFL_CF); \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
        else /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        *pfEFlags = fEfl; \
    } \
}
3052
/* 64-bit ROL workers.  The unsuffixed one is only emitted when RT_ARCH_AMD64
   is not defined or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
#endif
EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)

/* 32-bit ROL workers.  The unsuffixed one is only emitted when neither
   RT_ARCH_X86 nor RT_ARCH_AMD64 is defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
#endif
EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3064
3065DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3066{
3067 return (uValue << cShift) | (uValue >> (16 - cShift));
3068}
3069#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3071#endif
3072EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3073EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3074
3075DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3076{
3077 return (uValue << cShift) | (uValue >> (8 - cShift));
3078}
3079#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3080EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3081#endif
3082EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3083EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3084
3085
/*
 * ROR
 */
/**
 * Emits a rotate-right worker named iemAImpl_ror_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise.
 * A masked count of zero leaves both *puDst and *pfEFlags untouched.  For 8
 * and 16-bit operands the count is then reduced to the operand width, so a
 * count equal to the width leaves the value unchanged but still updates CF
 * and OF.  Only CF and OF are modified.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero: OF is derived from the original value
 *                          (first sub-shift).  Zero: OF is derived from the
 *                          result (last sub-shift).
 * @param   a_fnHlp         Rotate helper invoked as a_fnHlp(value, count).
 */
#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS: */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF is the top bit of the result, i.e. the bit that was rotated around last. */ \
        uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
        fEfl |= fCarry; \
        /* AMD: OF is the XOR of the two most significant result bits. */ \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
        else /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
        *pfEFlags = fEfl; \
    } \
}
3114
/* 64-bit ROR workers.  The unsuffixed one is only emitted when RT_ARCH_AMD64
   is not defined or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
#endif
EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)

/* 32-bit ROR workers.  The unsuffixed one is only emitted when neither
   RT_ARCH_X86 nor RT_ARCH_AMD64 is defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
#endif
EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3126
3127DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3128{
3129 return (uValue >> cShift) | (uValue << (16 - cShift));
3130}
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3133#endif
3134EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3135EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3136
3137DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3138{
3139 return (uValue >> cShift) | (uValue << (8 - cShift));
3140}
3141#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3142EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3143#endif
3144EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3145EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3146
3147
/*
 * RCL
 */
/**
 * Emits a rotate-through-carry-left worker named
 * iemAImpl_rcl_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise.
 * For 8 and 16-bit operands it is additionally reduced modulo width + 1 (the
 * operand plus CF form the rotated quantity).  The two flavours differ in
 * when that reduction happens:
 *  - Intel flavour: before the zero check, so a count that is a multiple of
 *    width + 1 leaves both the value and EFLAGS untouched.
 *  - AMD flavour: after the zero check, so such a count leaves the value and
 *    CF as they were but still recalculates OF.
 * Only CF and OF are modified.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero for the Intel flavour, zero for AMD.
 */
#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst << cShift; \
        if (cShift > 1) \
            uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        uint32_t fInCarry = fEfl & X86_EFL_CF; \
        /* The incoming carry lands just below the bits shifted up.  cShift can \
           be zero here in the AMD 8/16-bit flavour (see the modulo above); the \
           carry then stays in CF and must not be shifted by a negative count. */ \
        if (cShift) \
            uResult |= (a_uType)fInCarry << (cShift - 1); \
        \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
        fEfl |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
        else /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        *pfEFlags = fEfl; \
    } \
}
3185
/* RCL workers for all four operand widths.  The unsuffixed 64-bit one is only
   emitted when RT_ARCH_AMD64 is not defined or IEM_WITHOUT_ASSEMBLY is
   defined; the unsuffixed narrower ones only when neither RT_ARCH_X86 nor
   RT_ARCH_AMD64 is defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_RCL(64, uint64_t, _intel, 1)
EMIT_RCL(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_RCL(32, uint32_t, _intel, 1)
EMIT_RCL(32, uint32_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_RCL(16, uint16_t, _intel, 1)
EMIT_RCL(16, uint16_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
#endif
EMIT_RCL(8, uint8_t, _intel, 1)
EMIT_RCL(8, uint8_t, _amd, 0)
3209
3210
/*
 * RCR
 */
/**
 * Emits a rotate-through-carry-right worker named
 * iemAImpl_rcr_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise.
 * For 8 and 16-bit operands it is additionally reduced modulo width + 1; the
 * Intel flavour does this before the zero check and the AMD flavour after
 * it, so in the AMD flavour the count may be zero inside the block.  Only CF
 * and OF are modified.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero for the Intel flavour, zero for AMD.
 */
#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst >> cShift; \
        if (cShift > 1) \
            uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        uint32_t fInCarry = fEfl & X86_EFL_CF; \
        /* The incoming carry lands just above the bits shifted down. */ \
        uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* A zero count (AMD 8/16-bit flavour only) keeps the incoming carry. */ \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
        fEfl |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
        else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
            fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
        *pfEFlags = fEfl; \
    } \
}
3248
/* RCR workers for all four operand widths.  The unsuffixed 64-bit one is only
   emitted when RT_ARCH_AMD64 is not defined or IEM_WITHOUT_ASSEMBLY is
   defined; the unsuffixed narrower ones only when neither RT_ARCH_X86 nor
   RT_ARCH_AMD64 is defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_RCR(64, uint64_t, _intel, 1)
EMIT_RCR(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_RCR(32, uint32_t, _intel, 1)
EMIT_RCR(32, uint32_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_RCR(16, uint16_t, _intel, 1)
EMIT_RCR(16, uint16_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
#endif
EMIT_RCR(8, uint8_t, _intel, 1)
EMIT_RCR(8, uint8_t, _amd, 0)
3272
3273
/*
 * SHL
 */
/**
 * Emits a shift-left worker named iemAImpl_shl_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise,
 * so for 8 and 16-bit operands it may exceed the operand width.  A masked
 * count of zero leaves both *puDst and *pfEFlags untouched.  All status bits
 * are recalculated otherwise.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero: OF is derived from the first sub-shift
 *                          and AF is cleared.  Zero: OF is derived from the
 *                          last sub-shift and AF is set.
 */
#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        /* Shift as an unsigned 64-bit value so that 8 and 16-bit operands, \
           which would otherwise be promoted to int, cannot overflow a signed \
           type for counts up to 31. */ \
        a_uType uResult = (a_uType)((uint64_t)uDst << cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out.  When the count exceeds the operand \
           width (8 and 16-bit operands only) that bit is a shifted-in zero; \
           do not compute it with a negative shift count. */ \
        uint32_t fCarry = cShift <= a_cBitsWidth \
                        ? (uint32_t)(uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : 0; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
        else \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3304
/* SHL workers for all four operand widths; none are emitted here when
   RT_ARCH_ARM64 is defined.  The unsuffixed 64-bit one is only emitted when
   RT_ARCH_AMD64 is not defined or IEM_WITHOUT_ASSEMBLY is defined; the
   unsuffixed narrower ones only when neither RT_ARCH_X86 nor RT_ARCH_AMD64 is
   defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_ARM64)

# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
# endif
EMIT_SHL(64, uint64_t, _intel, 1)
EMIT_SHL(64, uint64_t, _amd, 0)

# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
# endif
EMIT_SHL(32, uint32_t, _intel, 1)
EMIT_SHL(32, uint32_t, _amd, 0)

# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
# endif
EMIT_SHL(16, uint16_t, _intel, 1)
EMIT_SHL(16, uint16_t, _amd, 0)

# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
# endif
EMIT_SHL(8, uint8_t, _intel, 1)
EMIT_SHL(8, uint8_t, _amd, 0)

#endif /* !RT_ARCH_ARM64 */
3332
3333
/*
 * SHR
 */
/**
 * Emits a logical shift-right worker named
 * iemAImpl_shr_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise.
 * A masked count of zero leaves both *puDst and *pfEFlags untouched.  All
 * status bits are recalculated otherwise.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero: OF is the top bit of the original value
 *                          for every count and AF is cleared.  Zero: OF is
 *                          only set that way for a count of one (cleared
 *                          otherwise) and AF is set.
 */
#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
            fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3361
/* SHR workers for all four operand widths; none are emitted here when
   RT_ARCH_ARM64 is defined.  The unsuffixed 64-bit one is only emitted when
   RT_ARCH_AMD64 is not defined or IEM_WITHOUT_ASSEMBLY is defined; the
   unsuffixed narrower ones only when neither RT_ARCH_X86 nor RT_ARCH_AMD64 is
   defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_ARM64)

# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
# endif
EMIT_SHR(64, uint64_t, _intel, 1)
EMIT_SHR(64, uint64_t, _amd, 0)

# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
# endif
EMIT_SHR(32, uint32_t, _intel, 1)
EMIT_SHR(32, uint32_t, _amd, 0)

# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
# endif
EMIT_SHR(16, uint16_t, _intel, 1)
EMIT_SHR(16, uint16_t, _amd, 0)

# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
# endif
EMIT_SHR(8, uint8_t, _intel, 1)
EMIT_SHR(8, uint8_t, _amd, 0)

#endif /* !RT_ARCH_ARM64 */
3389
3390
/*
 * SAR
 */
/**
 * Emits an arithmetic shift-right worker named
 * iemAImpl_sar_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to 6 bits for 64-bit operands and to 5 bits otherwise.
 * A masked count of zero leaves both *puDst and *pfEFlags untouched.  All
 * status bits are recalculated otherwise; OF always ends up clear.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_iType         Signed type of the same width, used for the shift.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero: AF is cleared.  Zero: AF is set.
 */
#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_iType const iDst = (a_iType)*puDst; \
        a_uType uResult = iDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. \
           Note! The OF flag is always zero because the sign of the result never \
                 differs from the sign of the input. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out. */ \
        fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3417
3418#if !defined(RT_ARCH_ARM64)
3419
3420# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3421EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3422# endif
3423EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3424EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3425
3426# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3427EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3428# endif
3429EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3430EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3431
3432# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3433EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3434# endif
3435EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3436EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3437
3438# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3439EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3440# endif
3441EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3442EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3443
3444#endif /* !RT_ARCH_ARM64 */
3445
3446
/*
 * SHLD
 *
 * - CF is the last bit shifted out of puDst.
 * - AF is always cleared by Intel 10980XE.
 * - AF is always set by AMD 3990X.
 * - OF is set according to the first shift on Intel 10980XE, it seems.
 * - OF is set according to the last sub-shift on AMD 3990X.
 * - ZF, SF and PF are calculated according to the result by both vendors.
 *
 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
 * pick either the source register or the destination register for input bits
 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * intel has changed behaviour here several times. We implement what current
 * Skylake-based CPUs do for now; we can extend this later as needed.
 */
/**
 * Emits a double-precision shift-left worker named
 * iemAImpl_shld_u<a_cBitsWidth><a_Suffix>.
 *
 * The count is masked to a_cBitsWidth - 1; a masked count of zero leaves both
 * *puDst and *pfEFlags untouched.  Otherwise the destination is shifted left
 * and the vacated low bits are filled from the top of uSrc, and all status
 * bits are recalculated.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned operand type.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero: OF from the first sub-shift, AF cleared.
 *                          Zero: OF from the last sub-shift, AF set.
 */
#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
                                                                           uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst << cShift; \
        uResult |= uSrc >> (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        else \
        { /* AMD 3990X: Set according to last shift. AF always set. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
        fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}
3493
/* 64-bit SHLD workers.  The unsuffixed one is only emitted when RT_ARCH_AMD64
   is not defined or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(64, uint64_t, _intel, 1)
EMIT_SHLD(64, uint64_t, _amd, 0)

/* 32-bit SHLD workers.  The unsuffixed one is only emitted when neither
   RT_ARCH_X86 nor RT_ARCH_AMD64 is defined, or IEM_WITHOUT_ASSEMBLY is defined. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(32, uint32_t, _intel, 1)
EMIT_SHLD(32, uint32_t, _amd, 0)
3505
/**
 * Emits the 16-bit double-precision shift-left worker,
 * iemAImpl_shld_u16<a_Suffix>.
 *
 * Unlike the wider variants the count is masked to 5 bits, so it can exceed
 * the operand width.  To cover that, the operands are combined into a 48-bit
 * value with the destination in bits 47:32 and the source in bits 31:16; the
 * low 16 bits repeat the destination in the Intel flavour and the source in
 * the AMD flavour.  The result is bits 47:32 of that value after shifting.
 * A masked count of zero leaves both *puDst and *pfEFlags untouched.
 *
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Non-zero for the Intel flavour, zero for AMD.
 */
#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst = *puDst; \
        uint64_t const uTmp = a_fIntelFlags \
                            ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
                            : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
        uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
        } \
        else \
        { \
            /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
            if (cShift < 16) \
            { \
                fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
            } \
            else \
            { \
                /* Counts above 16 leave CF clear; a count of exactly 16 yields bit 0 of uDst. */ \
                if (cShift == 16) \
                    fEfl |= uDst & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
            } \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}
3550
3551#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3552EMIT_SHLD_16(RT_NOTHING, 1)
3553#endif
3554EMIT_SHLD_16(_intel, 1)
3555EMIT_SHLD_16(_amd, 0)
3556
3557
3558/*
3559 * SHRD
3560 *
3561 * EFLAGS behaviour seems to be the same as with SHLD:
3562 * - CF is the last bit shifted out of puDst.
3563 * - AF is always cleared by Intel 10980XE.
3564 * - AF is always set by AMD 3990X.
3565 * - OF is set according to the first shift on Intel 10980XE, it seems.
3566 * - OF is set according to the last sub-shift on AMD 3990X.
3567 * - ZF, SF and PF are calculated according to the result by both vendors.
3568 *
3569 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3570 * pick either the source register or the destination register for input bits
3571 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3572 * intel has changed behaviour here several times. We implement what current
3573 * skylake based does for now, we can extend this later as needed.
3574 */
3575#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3576IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3577{ \
3578 cShift &= a_cBitsWidth - 1; \
3579 if (cShift) \
3580 { \
3581 a_uType const uDst = *puDst; \
3582 a_uType uResult = uDst >> cShift; \
3583 uResult |= uSrc << (a_cBitsWidth - cShift); \
3584 *puDst = uResult; \
3585 \
3586 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3587 AssertCompile(X86_EFL_CF_BIT == 0); \
3588 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3589 if (a_fIntelFlags) \
3590 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3591 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3592 else \
3593 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3594 if (cShift > 1) /* Set according to last shift. */ \
3595 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3596 else \
3597 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3598 fEfl |= X86_EFL_AF; \
3599 } \
3600 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3601 fEfl |= X86_EFL_CALC_ZF(uResult); \
3602 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3603 *pfEFlags = fEfl; \
3604 } \
3605}
3606
3607#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3608EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3609#endif
3610EMIT_SHRD(64, uint64_t, _intel, 1)
3611EMIT_SHRD(64, uint64_t, _amd, 0)
3612
3613#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3614EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3615#endif
3616EMIT_SHRD(32, uint32_t, _intel, 1)
3617EMIT_SHRD(32, uint32_t, _amd, 0)
3618
3619#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3620IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3621{ \
3622 cShift &= 31; \
3623 if (cShift) \
3624 { \
3625 uint16_t const uDst = *puDst; \
3626 uint64_t const uTmp = a_fIntelFlags \
3627 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3628 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3629 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3630 *puDst = uResult; \
3631 \
3632 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3633 AssertCompile(X86_EFL_CF_BIT == 0); \
3634 if (a_fIntelFlags) \
3635 { \
3636 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3637 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3638 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3639 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3640 } \
3641 else \
3642 { \
3643 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3644 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3645 /* AMD 3990X: Set according to last shift. AF always set. */ \
3646 if (cShift > 1) /* Set according to last shift. */ \
3647 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3648 else \
3649 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3650 fEfl |= X86_EFL_AF; \
3651 } \
3652 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3653 fEfl |= X86_EFL_CALC_ZF(uResult); \
3654 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3655 *pfEFlags = fEfl; \
3656 } \
3657}
3658
3659#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3660EMIT_SHRD_16(RT_NOTHING, 1)
3661#endif
3662EMIT_SHRD_16(_intel, 1)
3663EMIT_SHRD_16(_amd, 0)
3664
3665
3666/*
3667 * RORX (BMI2)
3668 */
3669#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3670IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3671{ \
3672 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3673}
3674
3675#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3676EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3677#endif
3678#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3679EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3680#endif
3681
3682
3683/*
3684 * SHLX (BMI2)
3685 */
3686#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3687IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3688{ \
3689 cShift &= a_cBitsWidth - 1; \
3690 *puDst = uSrc << cShift; \
3691}
3692
3693#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3694EMIT_SHLX(64, uint64_t, RT_NOTHING)
3695EMIT_SHLX(64, uint64_t, _fallback)
3696#endif
3697#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3698EMIT_SHLX(32, uint32_t, RT_NOTHING)
3699EMIT_SHLX(32, uint32_t, _fallback)
3700#endif
3701
3702
3703/*
3704 * SHRX (BMI2)
3705 */
3706#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3707IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3708{ \
3709 cShift &= a_cBitsWidth - 1; \
3710 *puDst = uSrc >> cShift; \
3711}
3712
3713#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3714EMIT_SHRX(64, uint64_t, RT_NOTHING)
3715EMIT_SHRX(64, uint64_t, _fallback)
3716#endif
3717#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3718EMIT_SHRX(32, uint32_t, RT_NOTHING)
3719EMIT_SHRX(32, uint32_t, _fallback)
3720#endif
3721
3722
3723/*
3724 * SARX (BMI2)
3725 */
3726#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3727IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3728{ \
3729 cShift &= a_cBitsWidth - 1; \
3730 *puDst = (a_iType)uSrc >> cShift; \
3731}
3732
3733#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3734EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3735EMIT_SARX(64, uint64_t, int64_t, _fallback)
3736#endif
3737#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3738EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3739EMIT_SARX(32, uint32_t, int32_t, _fallback)
3740#endif
3741
3742
3743/*
3744 * PDEP (BMI2)
3745 */
3746#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3747IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3748{ \
3749 a_uType uResult = 0; \
3750 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3751 if (fMask & ((a_uType)1 << iMaskBit)) \
3752 { \
3753 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3754 iBit++; \
3755 } \
3756 *puDst = uResult; \
3757}
3758
3759#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3760EMIT_PDEP(64, uint64_t, RT_NOTHING)
3761#endif
3762EMIT_PDEP(64, uint64_t, _fallback)
3763#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3764EMIT_PDEP(32, uint32_t, RT_NOTHING)
3765#endif
3766EMIT_PDEP(32, uint32_t, _fallback)
3767
3768/*
3769 * PEXT (BMI2)
3770 */
3771#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3772IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3773{ \
3774 a_uType uResult = 0; \
3775 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3776 if (fMask & ((a_uType)1 << iMaskBit)) \
3777 { \
3778 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3779 iBit++; \
3780 } \
3781 *puDst = uResult; \
3782}
3783
3784#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3785EMIT_PEXT(64, uint64_t, RT_NOTHING)
3786#endif
3787EMIT_PEXT(64, uint64_t, _fallback)
3788#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3789EMIT_PEXT(32, uint32_t, RT_NOTHING)
3790#endif
3791EMIT_PEXT(32, uint32_t, _fallback)
3792
3793
3794#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3795
3796# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3797/*
3798 * BSWAP
3799 */
3800
3801IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3802{
3803 *puDst = ASMByteSwapU64(*puDst);
3804}
3805
3806
3807IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3808{
3809 *puDst = ASMByteSwapU32(*puDst);
3810}
3811
3812
3813/* Note! undocument, so 32-bit arg */
3814IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3815{
3816#if 0
3817 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3818#else
3819 /* This is the behaviour AMD 3990x (64-bit mode): */
3820 *(uint16_t *)puDst = 0;
3821#endif
3822}
3823
3824# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3825
3826
3827
3828# if defined(IEM_WITHOUT_ASSEMBLY)
3829
3830/*
3831 * LFENCE, SFENCE & MFENCE.
3832 */
3833
/** LFENCE worker for the portable C build: issues ASMReadFence(). */
3834IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3835{
3836 ASMReadFence();
3837}
3838
3839
/** SFENCE worker for the portable C build: issues ASMWriteFence(). */
3840IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3841{
3842 ASMWriteFence();
3843}
3844
3845
/** MFENCE worker for the portable C build: issues ASMMemoryFence(). */
3846IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3847{
3848 ASMMemoryFence();
3849}
3850
3851
/*
 * Alternative memory fence worker; same ASMMemoryFence() call as
 * iemAImpl_mfence.  Not compiled when RT_ARCH_ARM64 is defined.
 */
3852# ifndef RT_ARCH_ARM64
3853IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3854{
3855 ASMMemoryFence();
3856}
3857# endif
3858
3859# endif
3860
3861#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3862
3863
3864IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3865{
3866 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3867 {
3868 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3869 *pu16Dst |= u16Src & X86_SEL_RPL;
3870
3871 *pfEFlags |= X86_EFL_ZF;
3872 }
3873 else
3874 *pfEFlags &= ~X86_EFL_ZF;
3875}
3876
3877
3878#if defined(IEM_WITHOUT_ASSEMBLY)
3879
3880/*********************************************************************************************************************************
3881* x87 FPU Loads *
3882*********************************************************************************************************************************/
3883
3884IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3885{
3886 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3887 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3888 {
3889 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3890 pFpuRes->r80Result.sj64.fInteger = 1;
3891 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3892 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3893 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3894 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3895 }
3896 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3897 {
3898 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3899 pFpuRes->r80Result.s.uExponent = 0;
3900 pFpuRes->r80Result.s.uMantissa = 0;
3901 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3902 }
3903 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3904 {
3905 /* Subnormal values gets normalized. */
3906 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3907 pFpuRes->r80Result.sj64.fInteger = 1;
3908 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3909 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3910 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3911 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3912 pFpuRes->FSW |= X86_FSW_DE;
3913 if (!(pFpuState->FCW & X86_FCW_DM))
3914 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3915 }
3916 else if (RTFLOAT32U_IS_INF(pr32Val))
3917 {
3918 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3919 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3920 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3921 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3922 }
3923 else
3924 {
3925 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3926 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3927 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3928 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3929 pFpuRes->r80Result.sj64.fInteger = 1;
3930 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3931 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3932 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3933 {
3934 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3935 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3936 pFpuRes->FSW |= X86_FSW_IE;
3937
3938 if (!(pFpuState->FCW & X86_FCW_IM))
3939 {
3940 /* The value is not pushed. */
3941 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3942 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3943 pFpuRes->r80Result.au64[0] = 0;
3944 pFpuRes->r80Result.au16[4] = 0;
3945 }
3946 }
3947 else
3948 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3949 }
3950}
3951
3952
3953IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3954{
3955 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3956 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3957 {
3958 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3959 pFpuRes->r80Result.sj64.fInteger = 1;
3960 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3961 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3962 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3963 }
3964 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3965 {
3966 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3967 pFpuRes->r80Result.s.uExponent = 0;
3968 pFpuRes->r80Result.s.uMantissa = 0;
3969 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3970 }
3971 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3972 {
3973 /* Subnormal values gets normalized. */
3974 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3975 pFpuRes->r80Result.sj64.fInteger = 1;
3976 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3977 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3978 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3979 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3980 pFpuRes->FSW |= X86_FSW_DE;
3981 if (!(pFpuState->FCW & X86_FCW_DM))
3982 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3983 }
3984 else if (RTFLOAT64U_IS_INF(pr64Val))
3985 {
3986 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3987 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3988 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3989 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3990 }
3991 else
3992 {
3993 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3994 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3995 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3996 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3997 pFpuRes->r80Result.sj64.fInteger = 1;
3998 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3999 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
4000 {
4001 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
4002 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4003 pFpuRes->FSW |= X86_FSW_IE;
4004
4005 if (!(pFpuState->FCW & X86_FCW_IM))
4006 {
4007 /* The value is not pushed. */
4008 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
4009 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
4010 pFpuRes->r80Result.au64[0] = 0;
4011 pFpuRes->r80Result.au16[4] = 0;
4012 }
4013 }
4014 else
4015 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4016 }
4017}
4018
4019
4020IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
4021{
4022 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
4023 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
4024 /* Raises no exceptions. */
4025 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4026}
4027
4028
4029IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4030{
4031 pFpuRes->r80Result.sj64.fSign = 0;
4032 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4033 pFpuRes->r80Result.sj64.fInteger = 1;
4034 pFpuRes->r80Result.sj64.uFraction = 0;
4035
4036 /*
4037 * FPU status word:
4038 * - TOP is irrelevant, but we must match x86 assembly version.
4039 * - C1 is always cleared as we don't have any stack overflows.
4040 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4041 */
4042 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4043}
4044
4045
4046IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4047{
4048 pFpuRes->r80Result.sj64.fSign = 0;
4049 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4050 pFpuRes->r80Result.sj64.fInteger = 1;
4051 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4052 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4053 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4054 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4055}
4056
4057
4058IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4059{
4060 pFpuRes->r80Result.sj64.fSign = 0;
4061 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4062 pFpuRes->r80Result.sj64.fInteger = 1;
4063 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4064 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4065 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4066}
4067
4068
4069IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4070{
4071 pFpuRes->r80Result.sj64.fSign = 0;
4072 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4073 pFpuRes->r80Result.sj64.fInteger = 1;
4074 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4075 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4076 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4077 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4078}
4079
4080
4081IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4082{
4083 pFpuRes->r80Result.sj64.fSign = 0;
4084 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4085 pFpuRes->r80Result.sj64.fInteger = 1;
4086 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4087 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4088 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4089 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4090}
4091
4092
4093IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4094{
4095 pFpuRes->r80Result.sj64.fSign = 0;
4096 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4097 pFpuRes->r80Result.sj64.fInteger = 1;
4098 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4099 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4100 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4101 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4102}
4103
4104
4105IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4106{
4107 pFpuRes->r80Result.s.fSign = 0;
4108 pFpuRes->r80Result.s.uExponent = 0;
4109 pFpuRes->r80Result.s.uMantissa = 0;
4110 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4111}
4112
4113#define EMIT_FILD(a_cBits) \
4114IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4115 int ## a_cBits ## _t const *piVal)) \
4116{ \
4117 int ## a_cBits ## _t iVal = *piVal; \
4118 if (iVal == 0) \
4119 { \
4120 pFpuRes->r80Result.s.fSign = 0; \
4121 pFpuRes->r80Result.s.uExponent = 0; \
4122 pFpuRes->r80Result.s.uMantissa = 0; \
4123 } \
4124 else \
4125 { \
4126 if (iVal > 0) \
4127 pFpuRes->r80Result.s.fSign = 0; \
4128 else \
4129 { \
4130 pFpuRes->r80Result.s.fSign = 1; \
4131 iVal = -iVal; \
4132 } \
4133 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4134 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4135 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4136 } \
4137 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4138}
4139EMIT_FILD(16)
4140EMIT_FILD(32)
4141EMIT_FILD(64)
4142
4143
4144IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4145{
4146 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4147 if ( pd80Val->s.abPairs[0] == 0
4148 && pd80Val->s.abPairs[1] == 0
4149 && pd80Val->s.abPairs[2] == 0
4150 && pd80Val->s.abPairs[3] == 0
4151 && pd80Val->s.abPairs[4] == 0
4152 && pd80Val->s.abPairs[5] == 0
4153 && pd80Val->s.abPairs[6] == 0
4154 && pd80Val->s.abPairs[7] == 0
4155 && pd80Val->s.abPairs[8] == 0)
4156 {
4157 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4158 pFpuRes->r80Result.s.uExponent = 0;
4159 pFpuRes->r80Result.s.uMantissa = 0;
4160 }
4161 else
4162 {
4163 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4164
4165 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4166 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4167 cPairs--;
4168
4169 uint64_t uVal = 0;
4170 uint64_t uFactor = 1;
4171 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4172 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4173 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4174
4175 unsigned const cBits = ASMBitLastSetU64(uVal);
4176 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4177 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4178 }
4179}
4180
4181
4182/*********************************************************************************************************************************
4183* x87 FPU Stores *
4184*********************************************************************************************************************************/
4185
4186/**
4187 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4188 *
4189 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4190 *
4191 * @returns Updated FPU status word value.
4192 * @param fSignIn Incoming sign indicator.
4193 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4194 * @param iExponentIn Unbiased exponent.
4195 * @param fFcw The FPU control word.
4196 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4197 * @param pr32Dst Where to return the output value, if one should be
4198 * returned.
4199 *
4200 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4201 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4202 */
4203static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4204 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4205{
4206 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4207 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4208 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4209 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4210 ? fRoundingOffMask
4211 : 0;
4212 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4213
4214 /*
4215 * Deal with potential overflows/underflows first, optimizing for none.
4216 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4217 */
4218 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4219 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4220 { /* likely? */ }
4221 /*
4222 * Underflow if the exponent zero or negative. This is attempted mapped
4223 * to a subnormal number when possible, with some additional trickery ofc.
4224 */
4225 else if (iExponentOut <= 0)
4226 {
4227 bool const fIsTiny = iExponentOut < 0
4228 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4229 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4230 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4231 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4232
4233 if (iExponentOut <= 0)
4234 {
4235 uMantissaIn = iExponentOut <= -63
4236 ? uMantissaIn != 0
4237 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4238 fRoundedOff = uMantissaIn & fRoundingOffMask;
4239 if (fRoundedOff && fIsTiny)
4240 fFsw |= X86_FSW_UE;
4241 iExponentOut = 0;
4242 }
4243 }
4244 /*
4245 * Overflow if at or above max exponent value or if we will reach max
4246 * when rounding. Will return +/-zero or +/-max value depending on
4247 * whether we're rounding or not.
4248 */
4249 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4250 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4251 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4252 {
4253 fFsw |= X86_FSW_OE;
4254 if (!(fFcw & X86_FCW_OM))
4255 return fFsw | X86_FSW_ES | X86_FSW_B;
4256 fFsw |= X86_FSW_PE;
4257 if (uRoundingAdd)
4258 fFsw |= X86_FSW_C1;
4259 if (!(fFcw & X86_FCW_PM))
4260 fFsw |= X86_FSW_ES | X86_FSW_B;
4261
4262 pr32Dst->s.fSign = fSignIn;
4263 if (uRoundingAdd)
4264 { /* Zero */
4265 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4266 pr32Dst->s.uFraction = 0;
4267 }
4268 else
4269 { /* Max */
4270 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4271 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4272 }
4273 return fFsw;
4274 }
4275
4276 /*
4277 * Normal or subnormal number.
4278 */
4279 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4280 uint64_t uMantissaOut = uMantissaIn;
4281 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4282 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4283 || fRoundedOff != uRoundingAdd)
4284 {
4285 uMantissaOut = uMantissaIn + uRoundingAdd;
4286 if (uMantissaOut >= uMantissaIn)
4287 { /* likely */ }
4288 else
4289 {
4290 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4291 iExponentOut++;
4292 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4293 fFsw |= X86_FSW_C1;
4294 }
4295 }
4296 else
4297 uMantissaOut = uMantissaIn;
4298
4299 /* Truncate the mantissa and set the return value. */
4300 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4301
4302 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4303 pr32Dst->s.uExponent = iExponentOut;
4304 pr32Dst->s.fSign = fSignIn;
4305
4306 /* Set status flags realted to rounding. */
4307 if (fRoundedOff)
4308 {
4309 fFsw |= X86_FSW_PE;
4310 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4311 fFsw |= X86_FSW_C1;
4312 if (!(fFcw & X86_FCW_PM))
4313 fFsw |= X86_FSW_ES | X86_FSW_B;
4314 }
4315
4316 return fFsw;
4317}
4318
4319
4320/**
4321 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4322 */
4323IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4324 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4325{
4326 uint16_t const fFcw = pFpuState->FCW;
4327 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4328 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4329 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4330 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4331 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4332 {
4333 pr32Dst->s.fSign = pr80Src->s.fSign;
4334 pr32Dst->s.uExponent = 0;
4335 pr32Dst->s.uFraction = 0;
4336 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4337 }
4338 else if (RTFLOAT80U_IS_INF(pr80Src))
4339 {
4340 pr32Dst->s.fSign = pr80Src->s.fSign;
4341 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4342 pr32Dst->s.uFraction = 0;
4343 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4344 }
4345 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4346 {
4347 /* Mapped to +/-QNaN */
4348 pr32Dst->s.fSign = pr80Src->s.fSign;
4349 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4350 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4351 }
4352 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4353 {
4354 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4355 if (fFcw & X86_FCW_IM)
4356 {
4357 pr32Dst->s.fSign = 1;
4358 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4359 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4360 fFsw |= X86_FSW_IE;
4361 }
4362 else
4363 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4364 }
4365 else if (RTFLOAT80U_IS_NAN(pr80Src))
4366 {
4367 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4368 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4369 {
4370 pr32Dst->s.fSign = pr80Src->s.fSign;
4371 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4372 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4373 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4374 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4375 fFsw |= X86_FSW_IE;
4376 }
4377 else
4378 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4379 }
4380 else
4381 {
4382 /* Denormal values causes both an underflow and precision exception. */
4383 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4384 if (fFcw & X86_FCW_UM)
4385 {
4386 pr32Dst->s.fSign = pr80Src->s.fSign;
4387 pr32Dst->s.uExponent = 0;
4388 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4389 {
4390 pr32Dst->s.uFraction = 1;
4391 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4392 if (!(fFcw & X86_FCW_PM))
4393 fFsw |= X86_FSW_ES | X86_FSW_B;
4394 }
4395 else
4396 {
4397 pr32Dst->s.uFraction = 0;
4398 fFsw |= X86_FSW_UE | X86_FSW_PE;
4399 if (!(fFcw & X86_FCW_PM))
4400 fFsw |= X86_FSW_ES | X86_FSW_B;
4401 }
4402 }
4403 else
4404 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4405 }
4406 *pu16FSW = fFsw;
4407}
4408
4409
4410/**
4411 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4412 *
4413 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4414 *
4415 * @returns Updated FPU status word value.
4416 * @param fSignIn Incoming sign indicator.
4417 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4418 * @param iExponentIn Unbiased exponent.
4419 * @param fFcw The FPU control word.
4420 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4421 * @param pr64Dst Where to return the output value, if one should be
4422 * returned.
4423 *
4424 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4425 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4426 */
4427static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4428 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4429{
4430 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4431 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4432 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4433 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4434 ? fRoundingOffMask
4435 : 0;
4436 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4437
4438 /*
4439 * Deal with potential overflows/underflows first, optimizing for none.
4440 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4441 */
4442 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4443 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4444 { /* likely? */ }
4445 /*
4446 * Underflow if the exponent zero or negative. This is attempted mapped
4447 * to a subnormal number when possible, with some additional trickery ofc.
4448 */
4449 else if (iExponentOut <= 0)
4450 {
4451 bool const fIsTiny = iExponentOut < 0
4452 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4453 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4454 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4455 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4456
4457 if (iExponentOut <= 0)
4458 {
4459 uMantissaIn = iExponentOut <= -63
4460 ? uMantissaIn != 0
4461 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4462 fRoundedOff = uMantissaIn & fRoundingOffMask;
4463 if (fRoundedOff && fIsTiny)
4464 fFsw |= X86_FSW_UE;
4465 iExponentOut = 0;
4466 }
4467 }
4468 /*
4469 * Overflow if at or above max exponent value or if we will reach max
4470 * when rounding. Will return +/-zero or +/-max value depending on
4471 * whether we're rounding or not.
4472 */
4473 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4474 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4475 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4476 {
4477 fFsw |= X86_FSW_OE;
4478 if (!(fFcw & X86_FCW_OM))
4479 return fFsw | X86_FSW_ES | X86_FSW_B;
4480 fFsw |= X86_FSW_PE;
4481 if (uRoundingAdd)
4482 fFsw |= X86_FSW_C1;
4483 if (!(fFcw & X86_FCW_PM))
4484 fFsw |= X86_FSW_ES | X86_FSW_B;
4485
4486 pr64Dst->s64.fSign = fSignIn;
4487 if (uRoundingAdd)
4488 { /* Zero */
4489 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4490 pr64Dst->s64.uFraction = 0;
4491 }
4492 else
4493 { /* Max */
4494 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4495 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4496 }
4497 return fFsw;
4498 }
4499
4500 /*
4501 * Normal or subnormal number.
4502 */
4503 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4504 uint64_t uMantissaOut = uMantissaIn;
4505 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4506 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4507 || fRoundedOff != uRoundingAdd)
4508 {
4509 uMantissaOut = uMantissaIn + uRoundingAdd;
4510 if (uMantissaOut >= uMantissaIn)
4511 { /* likely */ }
4512 else
4513 {
4514 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4515 iExponentOut++;
4516 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4517 fFsw |= X86_FSW_C1;
4518 }
4519 }
4520 else
4521 uMantissaOut = uMantissaIn;
4522
4523 /* Truncate the mantissa and set the return value. */
4524 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4525
4526 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4527 pr64Dst->s64.uExponent = iExponentOut;
4528 pr64Dst->s64.fSign = fSignIn;
4529
4530 /* Set status flags realted to rounding. */
4531 if (fRoundedOff)
4532 {
4533 fFsw |= X86_FSW_PE;
4534 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4535 fFsw |= X86_FSW_C1;
4536 if (!(fFcw & X86_FCW_PM))
4537 fFsw |= X86_FSW_ES | X86_FSW_B;
4538 }
4539
4540 return fFsw;
4541}
4542
4543
4544/**
4545 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4546 */
4547IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4548 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4549{
4550 uint16_t const fFcw = pFpuState->FCW;
4551 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4552 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4553 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4554 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4555 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4556 {
4557 pr64Dst->s64.fSign = pr80Src->s.fSign;
4558 pr64Dst->s64.uExponent = 0;
4559 pr64Dst->s64.uFraction = 0;
4560 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4561 }
4562 else if (RTFLOAT80U_IS_INF(pr80Src))
4563 {
4564 pr64Dst->s64.fSign = pr80Src->s.fSign;
4565 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4566 pr64Dst->s64.uFraction = 0;
4567 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4568 }
4569 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4570 {
4571 /* Mapped to +/-QNaN */
4572 pr64Dst->s64.fSign = pr80Src->s.fSign;
4573 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4574 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4575 }
4576 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4577 {
4578 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4579 if (fFcw & X86_FCW_IM)
4580 {
4581 pr64Dst->s64.fSign = 1;
4582 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4583 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4584 fFsw |= X86_FSW_IE;
4585 }
4586 else
4587 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4588 }
4589 else if (RTFLOAT80U_IS_NAN(pr80Src))
4590 {
4591 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4592 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4593 {
4594 pr64Dst->s64.fSign = pr80Src->s.fSign;
4595 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4596 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4597 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4598 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4599 fFsw |= X86_FSW_IE;
4600 }
4601 else
4602 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4603 }
4604 else
4605 {
4606 /* Denormal values causes both an underflow and precision exception. */
4607 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4608 if (fFcw & X86_FCW_UM)
4609 {
4610 pr64Dst->s64.fSign = pr80Src->s.fSign;
4611 pr64Dst->s64.uExponent = 0;
4612 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4613 {
4614 pr64Dst->s64.uFraction = 1;
4615 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4616 if (!(fFcw & X86_FCW_PM))
4617 fFsw |= X86_FSW_ES | X86_FSW_B;
4618 }
4619 else
4620 {
4621 pr64Dst->s64.uFraction = 0;
4622 fFsw |= X86_FSW_UE | X86_FSW_PE;
4623 if (!(fFcw & X86_FCW_PM))
4624 fFsw |= X86_FSW_ES | X86_FSW_B;
4625 }
4626 }
4627 else
4628 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4629 }
4630 *pu16FSW = fFsw;
4631}
4632
4633
4634IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4635 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4636{
4637 /*
4638 * FPU status word:
4639 * - TOP is irrelevant, but we must match x86 assembly version (0).
4640 * - C1 is always cleared as we don't have any stack overflows.
4641 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4642 */
4643 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4644 *pr80Dst = *pr80Src;
4645}
4646
4647
/*
 *
 * Mantissa:
 *       63        56        48        40        32        24        16         8         0
 *       v          v         v         v         v         v         v         v         v
 *       1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 *            \    \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
 * Exp:       0    4    8   12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 */
/**
 * Emits iemAImpl_fist_r80_to_i<a_cBits>, which converts an 80-bit floating
 * point value to a signed integer using the rounding control in FCW.
 *
 * The destination is not written when the invalid operation exception is
 * raised while unmasked.
 *
 * @param   a_cBits             Width of the destination integer in bits.
 * @param   a_iType             The destination integer type.
 * @param   a_iTypeMin          The smallest value of @a a_iType.
 * @param   a_iTypeIndefinite   The value stored for out of range and invalid
 *                              inputs when the invalid operation exception is
 *                              masked.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                           a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* The unsigned compare also sends negative exponents to the branches below. */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            /* Number of mantissa bits below the integer part. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            uMantissa >>= cShiftOff; \
            /* 0 or 1: the carry out of the dropped bits once the rounding bias is added. */ \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                           pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    /* Positive value rounded past the type maximum: invalid operation. */ \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny sub-zero numbers (magnitude below one): the result is 0, 1 or -1 \
         * depending on sign, rounding control and whether the magnitude is at \
         * least one half (exponent -1). \
         */ \
        else if (iExponent < 0) \
        { \
            if (!fSignIn) \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case: negative input with the exponent of the type \
         * minimum, stored as the minimum value; PE is raised when any of the \
         * dropped low mantissa bits are set. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (   a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4834
4835#endif /*IEM_WITHOUT_ASSEMBLY */
4836
4837
/*
 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
 *
 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
 * thus the @a a_cBitsIn.
 */
/**
 * Emits iemAImpl_fistt_r80_to_i<a_cBits><a_Suffix>, which converts an 80-bit
 * floating point value to a signed integer by truncation, i.e. the rounding
 * control in FCW is not consulted.
 *
 * @param   a_cBits             Width of the destination integer in bits.
 * @param   a_cBitsIn           Width used for the exponent range check of
 *                              normal numbers.
 * @param   a_iType             The destination integer type.
 * @param   a_iTypeMin          The smallest value of @a a_iType.
 * @param   a_iTypeMax          Not referenced by the macro body.
 * @param   a_iTypeIndefinite   The value stored for out of range and invalid
 *                              inputs when the invalid operation exception is
 *                              masked.
 * @param   a_Suffix            Suffix appended to the function name.
 * @param   a_fIntelVersion     Not referenced by the macro body.
 */
#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                                              a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* The unsigned compare also sends negative exponents to the branches below. */ \
        if ((uint32_t)iExponent <= a_cBitsIn - 2) \
        { \
            /* Number of mantissa bits below the integer part; they are simply dropped. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const fRoundedOff      = uMantissa & fRoundingOffMask; \
            uMantissa >>= cShiftOff; \
            /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
            if (!fSignIn) \
                *piDst = (a_iType)uMantissa; \
            else \
                *piDst = -(a_iType)uMantissa; \
            \
            if (fRoundedOff) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Tiny sub-zero numbers (magnitude below one) are truncated to zero. \
         */ \
        else if (iExponent < 0) \
        { \
            *piDst = 0; \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case: negative input with the exponent of the type \
         * minimum, stored as the minimum value; PE is raised when any of the \
         * dropped low mantissa bits are set. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (a_cBits < 64 \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63)) ) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Figure this weirdness. \
         * Note: the leading '0 &&' makes this branch unreachable. \
         */ \
        else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
        { \
            *piDst = 0; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        *piDst = 0; \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* The unsuffixed variants are only emitted for builds without the assembly
   implementation; the 16-bit _intel and _amd variants are always emitted. */
#if defined(IEM_WITHOUT_ASSEMBLY)
EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
#endif
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd,   0)
4967
4968
4969#if defined(IEM_WITHOUT_ASSEMBLY)
4970
/**
 * Converts an 80-bit floating point value to an 80-bit packed BCD integer,
 * rounding according to the rounding control in FCW.
 *
 * The destination is not written when the invalid operation exception is
 * raised while unmasked.
 *
 * @param   pFpuState   The FPU state, FCW and FSW are read from it.
 * @param   pu16FSW     Where to return the resulting FPU status word.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    /* Constant results, each table is indexed by the sign of the input. */
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* Pre-filter on magnitude; the exact range check is done after rounding
           below.  The unsigned compare also excludes negative exponents. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            /* Number of mantissa bits below the integer part. */
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            uMantissa >>= cShiftOff;
            /* 0 or 1: the carry out of the dropped bits once the rounding bias is added. */
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                /* Emit two decimal digits per pair, least significant pair first. */
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny sub-zero numbers (magnitude below one): the result is zero or
         * one with the sign of the input, depending on rounding control and
         * whether the magnitude is at least one half (exponent -1).
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map +0 and -0 to the packed BCD zero constant selected by the input sign.
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
5120
5121
5122/*********************************************************************************************************************************
5123* FPU Helpers *
5124*********************************************************************************************************************************/
/* Compile time checks of the sizes of the floating point unions used by the helpers. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
5129
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * @param   a_pr80Val           Pointer variable to the value to check; it is
 *                              assigned to, so it must be a modifiable lvalue.
 * @param   a_r80ValNormalized  Name of the local RTFLOAT80U variable to declare.
 *
 * @note    This must be applied before calling SoftFloat with a value that could
 *          be a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *          correctly.
 * @note    Expands to a declaration followed by an if statement, so it can
 *          only be used where a declaration is allowed.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
5153
5154#ifdef IEM_WITH_FLOAT128_FOR_FPU
5155
5156DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5157{
5158 int fNew;
5159 switch (fFcw & X86_FCW_RC_MASK)
5160 {
5161 default:
5162 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5163 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5164 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5165 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5166 }
5167 int fOld = fegetround();
5168 fesetround(fNew);
5169 return fOld;
5170}
5171
5172
5173DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5174{
5175 fesetround(fOld);
5176}
5177
5178DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5179{
5180 RT_NOREF(fFcw);
5181 RTFLOAT128U Tmp;
5182 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5183 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5184 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5185 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5186 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5187 {
5188 Assert(Tmp.s.uExponent == 0);
5189 Tmp.s2.uSignAndExponent++;
5190 }
5191 return *(_Float128 *)&Tmp;
5192}
5193
5194
5195DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5196{
5197 RT_NOREF(fFcw);
5198 RTFLOAT128U Tmp;
5199 *(_Float128 *)&Tmp = rd128ValSrc;
5200 ASMCompilerBarrier();
5201 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5202 {
5203 pr80Dst->s.fSign = Tmp.s64.fSign;
5204 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5205 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5206 | Tmp.s64.uFractionLo >> (64 - 15);
5207
5208 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5209 unsigned const cShiftOff = 64 - 15;
5210 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5211 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5212 if (uRoundedOff)
5213 {
5214 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5215 ? RT_BIT_64(cShiftOff - 1)
5216 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5217 ? fRoundingOffMask
5218 : 0;
5219 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5220 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5221 || uRoundedOff != uRoundingAdd)
5222 {
5223 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5224 {
5225 uFraction += 1;
5226 if (!(uFraction & RT_BIT_64(63)))
5227 { /* likely */ }
5228 else
5229 {
5230 uFraction >>= 1;
5231 pr80Dst->s.uExponent++;
5232 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5233 return fFsw;
5234 }
5235 fFsw |= X86_FSW_C1;
5236 }
5237 }
5238 fFsw |= X86_FSW_PE;
5239 if (!(fFcw & X86_FCW_PM))
5240 fFsw |= X86_FSW_ES | X86_FSW_B;
5241 }
5242 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5243 }
5244 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5245 {
5246 pr80Dst->s.fSign = Tmp.s64.fSign;
5247 pr80Dst->s.uExponent = 0;
5248 pr80Dst->s.uMantissa = 0;
5249 }
5250 else if (RTFLOAT128U_IS_INF(&Tmp))
5251 {
5252 pr80Dst->s.fSign = Tmp.s64.fSign;
5253 pr80Dst->s.uExponent = 0;
5254 pr80Dst->s.uMantissa = 0;
5255 }
5256 return fFsw;
5257}
5258
5259
5260#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5261
/**
 * Initializer for the SoftFloat state structure.
 *
 * Produces a brace-enclosed list of five positional initializers derived from
 * the FPU control word:
 *      -# softfloat_tininess_afterRounding,
 *      -# the SoftFloat rounding mode matching the FCW rounding control,
 *      -# zero,
 *      -# the FCW exception mask bits,
 *      -# 64 for 53-bit precision control, 32 for 24-bit, and 80 otherwise.
 *
 * NOTE(review): the values presumably map onto the softfloat_state_t members
 * in declaration order - confirm against softfloat.h.
 *
 * @param   a_fFcw      The FPU control word.  Evaluated several times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5275
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following OR'ed in: the SoftFloat C1 flag
 * shifted left by two bits, the exception flags covered by X86_FSW_XCPT_MASK,
 * and ES+B when at least one of those exception flags has its mask bit clear
 * in @a a_fFcw.
 *
 * @param   a_fFsw          The FPU status word to update.
 * @param   a_pSoftState    Pointer to the SoftFloat state.  Evaluated several times.
 * @param   a_fFcw          The FPU control word.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5283
5284
5285DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5286{
5287 RT_NOREF(fFcw);
5288 Assert(cBits > 64);
5289# if 0 /* rounding does not seem to help */
5290 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5291 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5292 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5293 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5294 {
5295 uint64_t uOld = r128.v[0];
5296 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5297 if (r128.v[0] < uOld)
5298 r128.v[1] += 1;
5299 }
5300# else
5301 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5302# endif
5303 return r128;
5304}
5305
5306
5307DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5308{
5309 RT_NOREF(fFcw);
5310 Assert(cBits > 64);
5311# if 0 /* rounding does not seem to help, not even on constants */
5312 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5313 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5314 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5315 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5316 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5317 {
5318 uint64_t uOld = r128.v[0];
5319 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5320 if (r128.v[0] < uOld)
5321 r128.v[1] += 1;
5322 }
5323 return r128;
5324# else
5325 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5326 return r128;
5327# endif
5328}
5329
5330
# if 0 /* unused */
/**
 * Copies the two 64-bit words of an IPRT 128-bit floating point value into a
 * SoftFloat float128_t, without any other conversion.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5338
5339
5340/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5341DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5342{
5343 extFloat80_t Tmp;
5344 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5345 Tmp.signif = pr80Val->s2.uMantissa;
5346 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5347 return extF80_to_f128(Tmp, &Ignored);
5348}
5349
5350
5351/**
5352 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5353 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5354 *
5355 * This is only a structure format conversion, nothing else.
5356 */
5357DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5358{
5359 extFloat80_t Tmp;
5360 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5361 Tmp.signif = pr80Val->s2.uMantissa;
5362 return Tmp;
5363}
5364
5365
5366/**
5367 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5368 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5369 *
5370 * This is only a structure format conversion, nothing else.
5371 */
5372DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5373{
5374 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5375 pr80Dst->s2.uMantissa = r80XSrc.signif;
5376 return pr80Dst;
5377}
5378
5379
5380DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5381{
5382 RT_NOREF(fFcw);
5383 RTFLOAT128U Tmp;
5384 *(float128_t *)&Tmp = r128Src;
5385 ASMCompilerBarrier();
5386
5387 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5388 {
5389 pr80Dst->s.fSign = Tmp.s64.fSign;
5390 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5391 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5392 | Tmp.s64.uFractionLo >> (64 - 15);
5393
5394 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5395 unsigned const cShiftOff = 64 - 15;
5396 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5397 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5398 if (uRoundedOff)
5399 {
5400 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5401 ? RT_BIT_64(cShiftOff - 1)
5402 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5403 ? fRoundingOffMask
5404 : 0;
5405 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5406 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5407 || uRoundedOff != uRoundingAdd)
5408 {
5409 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5410 {
5411 uFraction += 1;
5412 if (!(uFraction & RT_BIT_64(63)))
5413 { /* likely */ }
5414 else
5415 {
5416 uFraction >>= 1;
5417 pr80Dst->s.uExponent++;
5418 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5419 return fFsw;
5420 }
5421 fFsw |= X86_FSW_C1;
5422 }
5423 }
5424 fFsw |= X86_FSW_PE;
5425 if (!(fFcw & X86_FCW_PM))
5426 fFsw |= X86_FSW_ES | X86_FSW_B;
5427 }
5428
5429 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5430 }
5431 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5432 {
5433 pr80Dst->s.fSign = Tmp.s64.fSign;
5434 pr80Dst->s.uExponent = 0;
5435 pr80Dst->s.uMantissa = 0;
5436 }
5437 else if (RTFLOAT128U_IS_INF(&Tmp))
5438 {
5439 pr80Dst->s.fSign = Tmp.s64.fSign;
5440 pr80Dst->s.uExponent = 0x7fff;
5441 pr80Dst->s.uMantissa = 0;
5442 }
5443 return fFsw;
5444}
5445
5446
5447/**
5448 * Helper for transfering exception and C1 to FSW and setting the result value
5449 * accordingly.
5450 *
5451 * @returns Updated FSW.
5452 * @param pSoftState The SoftFloat state following the operation.
5453 * @param r80XResult The result of the SoftFloat operation.
5454 * @param pr80Result Where to store the result for IEM.
5455 * @param fFcw The FPU control word.
5456 * @param fFsw The FSW before the operation, with necessary bits
5457 * cleared and such.
5458 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5459 * raised.
5460 */
5461DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5462 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5463 PCRTFLOAT80U pr80XcptResult)
5464{
5465 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5466 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5467 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5468 fFsw |= X86_FSW_ES | X86_FSW_B;
5469
5470 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5471 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5472 else
5473 {
5474 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5475 *pr80Result = *pr80XcptResult;
5476 }
5477 return fFsw;
5478}
5479
5480
5481/**
5482 * Helper doing polynomial evaluation using Horner's method.
5483 *
5484 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5485 */
5486float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5487 unsigned cPrecision, softfloat_state_t *pSoftState)
5488{
5489 Assert(cHornerConsts > 1);
5490 size_t i = cHornerConsts - 1;
5491 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5492 while (i-- > 0)
5493 {
5494 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5495 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5496 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5497 }
5498 return r128Result;
5499}
5500
5501#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5502
5503
5504/**
5505 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5506 * mantissa, exponent and sign.
5507 *
5508 * @returns Updated FSW.
5509 * @param pr80Dst Where to return the composed value.
5510 * @param fSign The sign.
5511 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5512 * ignored and should be zero. This will probably be
5513 * modified during normalization and rounding.
5514 * @param iExponent Unbiased exponent.
5515 * @param fFcw The FPU control word.
5516 * @param fFsw The FPU status word.
5517 */
5518static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5519 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5520{
5521 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5522
5523 iExponent += RTFLOAT80U_EXP_BIAS;
5524
5525 /* Do normalization if necessary and possible. */
5526 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5527 {
5528 int cShift = 192 - RTUInt256BitCount(puMantissa);
5529 if (iExponent > cShift)
5530 iExponent -= cShift;
5531 else
5532 {
5533 if (fFcw & X86_FCW_UM)
5534 {
5535 if (iExponent > 0)
5536 cShift = --iExponent;
5537 else
5538 cShift = 0;
5539 }
5540 iExponent -= cShift;
5541 }
5542 RTUInt256AssignShiftLeft(puMantissa, cShift);
5543 }
5544
5545 /* Do rounding. */
5546 uint64_t uMantissa = puMantissa->QWords.qw2;
5547 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5548 {
5549 bool fAdd;
5550 switch (fFcw & X86_FCW_RC_MASK)
5551 {
5552 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5553 case X86_FCW_RC_NEAREST:
5554 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5555 {
5556 if ( (uMantissa & 1)
5557 || puMantissa->QWords.qw0 != 0
5558 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5559 {
5560 fAdd = true;
5561 break;
5562 }
5563 uMantissa &= ~(uint64_t)1;
5564 }
5565 fAdd = false;
5566 break;
5567 case X86_FCW_RC_ZERO:
5568 fAdd = false;
5569 break;
5570 case X86_FCW_RC_UP:
5571 fAdd = !fSign;
5572 break;
5573 case X86_FCW_RC_DOWN:
5574 fAdd = fSign;
5575 break;
5576 }
5577 if (fAdd)
5578 {
5579 uint64_t const uTmp = uMantissa;
5580 uMantissa = uTmp + 1;
5581 if (uMantissa < uTmp)
5582 {
5583 uMantissa >>= 1;
5584 uMantissa |= RT_BIT_64(63);
5585 iExponent++;
5586 }
5587 fFsw |= X86_FSW_C1;
5588 }
5589 fFsw |= X86_FSW_PE;
5590 if (!(fFcw & X86_FCW_PM))
5591 fFsw |= X86_FSW_ES | X86_FSW_B;
5592 }
5593
5594 /* Check for underflow (denormals). */
5595 if (iExponent <= 0)
5596 {
5597 if (fFcw & X86_FCW_UM)
5598 {
5599 if (uMantissa & RT_BIT_64(63))
5600 uMantissa >>= 1;
5601 iExponent = 0;
5602 }
5603 else
5604 {
5605 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5606 fFsw |= X86_FSW_ES | X86_FSW_B;
5607 }
5608 fFsw |= X86_FSW_UE;
5609 }
5610 /* Check for overflow */
5611 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5612 {
5613 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5614 }
5615
5616 /* Compose the result. */
5617 pr80Dst->s.uMantissa = uMantissa;
5618 pr80Dst->s.uExponent = iExponent;
5619 pr80Dst->s.fSign = fSign;
5620 return fFsw;
5621}
5622
5623
5624/**
5625 * See also iemAImpl_fld_r80_from_r32
5626 */
5627static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5628{
5629 uint16_t fFsw = 0;
5630 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5631 {
5632 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5633 pr80Dst->sj64.fInteger = 1;
5634 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5635 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5636 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5637 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5638 }
5639 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5640 {
5641 pr80Dst->s.fSign = pr32Val->s.fSign;
5642 pr80Dst->s.uExponent = 0;
5643 pr80Dst->s.uMantissa = 0;
5644 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5645 }
5646 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5647 {
5648 /* Subnormal -> normalized + X86_FSW_DE return. */
5649 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5650 pr80Dst->sj64.fInteger = 1;
5651 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5652 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5653 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5654 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5655 fFsw = X86_FSW_DE;
5656 }
5657 else if (RTFLOAT32U_IS_INF(pr32Val))
5658 {
5659 pr80Dst->s.fSign = pr32Val->s.fSign;
5660 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5661 pr80Dst->s.uMantissa = RT_BIT_64(63);
5662 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5663 }
5664 else
5665 {
5666 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5667 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5668 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5669 pr80Dst->sj64.fInteger = 1;
5670 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5671 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5672 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5673 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5674 }
5675 return fFsw;
5676}
5677
5678
5679/**
5680 * See also iemAImpl_fld_r80_from_r64
5681 */
5682static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5683{
5684 uint16_t fFsw = 0;
5685 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5686 {
5687 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5688 pr80Dst->sj64.fInteger = 1;
5689 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5690 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5691 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5692 }
5693 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5694 {
5695 pr80Dst->s.fSign = pr64Val->s.fSign;
5696 pr80Dst->s.uExponent = 0;
5697 pr80Dst->s.uMantissa = 0;
5698 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5699 }
5700 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5701 {
5702 /* Subnormal values gets normalized. */
5703 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5704 pr80Dst->sj64.fInteger = 1;
5705 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5706 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5707 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5708 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5709 fFsw = X86_FSW_DE;
5710 }
5711 else if (RTFLOAT64U_IS_INF(pr64Val))
5712 {
5713 pr80Dst->s.fSign = pr64Val->s.fSign;
5714 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5715 pr80Dst->s.uMantissa = RT_BIT_64(63);
5716 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5717 }
5718 else
5719 {
5720 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5721 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5722 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5723 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5724 pr80Dst->sj64.fInteger = 1;
5725 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5726 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5727 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5728 }
5729 return fFsw;
5730}
5731
5732
5733/**
5734 * See also EMIT_FILD.
5735 */
5736#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5737static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5738{ \
5739 if (iVal == 0) \
5740 { \
5741 pr80Dst->s.fSign = 0; \
5742 pr80Dst->s.uExponent = 0; \
5743 pr80Dst->s.uMantissa = 0; \
5744 } \
5745 else \
5746 { \
5747 if (iVal > 0) \
5748 pr80Dst->s.fSign = 0; \
5749 else \
5750 { \
5751 pr80Dst->s.fSign = 1; \
5752 iVal = -iVal; \
5753 } \
5754 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5755 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5756 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5757 } \
5758 return pr80Dst; \
5759}
5760EMIT_CONVERT_IXX_TO_R80(16)
5761EMIT_CONVERT_IXX_TO_R80(32)
5762//EMIT_CONVERT_IXX_TO_R80(64)
5763
/**
 * Emits an r80-by-r64 FPU implementation (iemAImpl_fmul_r80_by_r64 and such)
 * on top of the given r80-by-r80 implementation.
 *
 * The 64-bit operand is first converted to 80-bit.  A denormal indication
 * from the conversion is dropped when the first operand is a 387-invalid
 * value or a NaN, or when @a a_DenormalException is true.  If it remains and
 * FCW.DM is clear, the first operand is returned as the result together with
 * DE, ES and B and the r80-by-r80 worker is not called.  The TOP field of the
 * resulting FSW is always set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Expression (may use pr80Val1) which, when
 *                                  true, suppresses the denormal indication.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswDe = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(fFswDe == 0 || fFswDe == X86_FSW_DE); \
    \
    if (   fFswDe \
        && (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException))) \
        fFswDe = 0; \
    \
    if (fFswDe && !(pFpuState->FCW & X86_FCW_DM)) \
    { \
        pFpuRes->r80Result = *pr80Val1; \
        pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                           | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
        return; \
    } \
    \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFswDe; \
}
5786
/**
 * Emits an r80-by-r32 FPU implementation (iemAImpl_fmul_r80_by_r32 and such)
 * on top of the given r80-by-r80 implementation.
 *
 * The 32-bit operand is first converted to 80-bit.  A denormal indication
 * from the conversion is dropped when the first operand is a 387-invalid
 * value or a NaN, or when @a a_DenormalException is true.  If it remains and
 * FCW.DM is clear, the first operand is returned as the result together with
 * DE, ES and B and the r80-by-r80 worker is not called.  The TOP field of the
 * resulting FSW is always set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Expression (may use pr80Val1) which, when
 *                                  true, suppresses the denormal indication.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswDe = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(fFswDe == 0 || fFswDe == X86_FSW_DE); \
    \
    if (   fFswDe \
        && (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException))) \
        fFswDe = 0; \
    \
    if (fFswDe && !(pFpuState->FCW & X86_FCW_DM)) \
    { \
        pFpuRes->r80Result = *pr80Val1; \
        pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                           | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
        return; \
    } \
    \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFswDe; \
}
5809
/**
 * Emits an r80-by-i32 FPU implementation (iemAImpl_fimul_r80_by_i32 and such):
 * the 32-bit integer operand is converted to 80-bit floating point, the given
 * r80-by-r80 implementation is called, and the TOP field of the resulting FSW
 * is set to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5818
/**
 * Emits an r80-by-i16 FPU implementation (iemAImpl_fimul_r80_by_i16 and such):
 * the 16-bit integer operand is converted to 80-bit floating point, the given
 * r80-by-r80 implementation is called, and the TOP field of the resulting FSW
 * is set to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5827
5828
5829
5830/*********************************************************************************************************************************
5831* x86 FPU Division Operations *
5832*********************************************************************************************************************************/
5833
5834/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5835static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5836 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5837{
5838 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5839 {
5840 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5841 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5842 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5843 }
5844 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5845 { /* Div by zero. */
5846 if (fFcw & X86_FCW_ZM)
5847 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5848 else
5849 {
5850 *pr80Result = *pr80Val1Org;
5851 fFsw |= X86_FSW_ES | X86_FSW_B;
5852 }
5853 fFsw |= X86_FSW_ZE;
5854 }
5855 else
5856 { /* Invalid operand */
5857 if (fFcw & X86_FCW_IM)
5858 *pr80Result = g_r80Indefinite;
5859 else
5860 {
5861 *pr80Result = *pr80Val1Org;
5862 fFsw |= X86_FSW_ES | X86_FSW_B;
5863 }
5864 fFsw |= X86_FSW_IE;
5865 }
5866 return fFsw;
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 uint16_t const fFcw = pFpuState->FCW;
5874 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5875
5876 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5877 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5878 {
5879 if (fFcw & X86_FCW_IM)
5880 pFpuRes->r80Result = g_r80Indefinite;
5881 else
5882 {
5883 pFpuRes->r80Result = *pr80Val1;
5884 fFsw |= X86_FSW_ES | X86_FSW_B;
5885 }
5886 fFsw |= X86_FSW_IE;
5887 }
5888 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5889 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5890 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5891 {
5892 if (fFcw & X86_FCW_DM)
5893 {
5894 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5895 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5896 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5897 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5898 }
5899 else
5900 {
5901 pFpuRes->r80Result = *pr80Val1;
5902 fFsw |= X86_FSW_ES | X86_FSW_B;
5903 }
5904 fFsw |= X86_FSW_DE;
5905 }
5906 /* SoftFloat can handle the rest: */
5907 else
5908 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5909
5910 pFpuRes->FSW = fFsw;
5911}
5912
5913
5914EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5915EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5916EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5917EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5918
5919
5920IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5921 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5922{
5923 uint16_t const fFcw = pFpuState->FCW;
5924 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5925
5926 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5927 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5928 {
5929 if (fFcw & X86_FCW_IM)
5930 pFpuRes->r80Result = g_r80Indefinite;
5931 else
5932 {
5933 pFpuRes->r80Result = *pr80Val1;
5934 fFsw |= X86_FSW_ES | X86_FSW_B;
5935 }
5936 fFsw |= X86_FSW_IE;
5937 }
5938 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5939 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5940 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5941 {
5942 if (fFcw & X86_FCW_DM)
5943 {
5944 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5945 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5946 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5947 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5948 }
5949 else
5950 {
5951 pFpuRes->r80Result = *pr80Val1;
5952 fFsw |= X86_FSW_ES | X86_FSW_B;
5953 }
5954 fFsw |= X86_FSW_DE;
5955 }
5956 /* SoftFloat can handle the rest: */
5957 else
5958 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5959
5960 pFpuRes->FSW = fFsw;
5961}
5962
5963
5964EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5965EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5966EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5967EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5968
5969
5970/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5971static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5972 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5973{
5974 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5975 {
5976 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5977 uint16_t fCxFlags = 0;
5978 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5979 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5980 &fCxFlags, &SoftState);
5981 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5982 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5983 if ( !(fFsw & X86_FSW_IE)
5984 && !RTFLOAT80U_IS_NAN(pr80Result)
5985 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5986 {
5987 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5988 fFsw |= fCxFlags & X86_FSW_C_MASK;
5989 }
5990 return fFsw;
5991 }
5992
5993 /* Invalid operand */
5994 if (fFcw & X86_FCW_IM)
5995 *pr80Result = g_r80Indefinite;
5996 else
5997 {
5998 *pr80Result = *pr80Val1Org;
5999 fFsw |= X86_FSW_ES | X86_FSW_B;
6000 }
6001 return fFsw | X86_FSW_IE;
6002}
6003
6004
6005static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6006 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
6007{
6008 uint16_t const fFcw = pFpuState->FCW;
6009 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6010
6011 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
6012 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
6013 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
6014 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
6015 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
6016 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
6017 {
6018 if (fFcw & X86_FCW_IM)
6019 pFpuRes->r80Result = g_r80Indefinite;
6020 else
6021 {
6022 pFpuRes->r80Result = *pr80Val1;
6023 fFsw |= X86_FSW_ES | X86_FSW_B;
6024 }
6025 fFsw |= X86_FSW_IE;
6026 }
6027 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
6028 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
6029 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
6030 {
6031 if (fFcw & X86_FCW_DM)
6032 {
6033 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6034 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6035 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6036 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6037 pr80Val1Org, fLegacyInstr);
6038 }
6039 else
6040 {
6041 pFpuRes->r80Result = *pr80Val1;
6042 fFsw |= X86_FSW_ES | X86_FSW_B;
6043 }
6044 fFsw |= X86_FSW_DE;
6045 }
6046 /* SoftFloat can handle the rest: */
6047 else
6048 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6049 pr80Val1, fLegacyInstr);
6050
6051 pFpuRes->FSW = fFsw;
6052}
6053
6054
6055IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6056 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6057{
6058 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6059}
6060
6061
6062IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6063 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6064{
6065 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6066}
6067
6068
6069/*********************************************************************************************************************************
6070* x87 FPU Multiplication Operations *
6071*********************************************************************************************************************************/
6072
6073/** Worker for iemAImpl_fmul_r80_by_r80. */
6074static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6075 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6076{
6077 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6078 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6079 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6080}
6081
6082
6083IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6084 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6085{
6086 uint16_t const fFcw = pFpuState->FCW;
6087 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6088
6089 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6090 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6091 {
6092 if (fFcw & X86_FCW_IM)
6093 pFpuRes->r80Result = g_r80Indefinite;
6094 else
6095 {
6096 pFpuRes->r80Result = *pr80Val1;
6097 fFsw |= X86_FSW_ES | X86_FSW_B;
6098 }
6099 fFsw |= X86_FSW_IE;
6100 }
6101 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6102 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6103 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6104 {
6105 if (fFcw & X86_FCW_DM)
6106 {
6107 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6108 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6109 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6110 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6111 }
6112 else
6113 {
6114 pFpuRes->r80Result = *pr80Val1;
6115 fFsw |= X86_FSW_ES | X86_FSW_B;
6116 }
6117 fFsw |= X86_FSW_DE;
6118 }
6119 /* SoftFloat can handle the rest: */
6120 else
6121 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6122
6123 pFpuRes->FSW = fFsw;
6124}
6125
6126
6127EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6128EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6129EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6130EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6131
6132
6133/*********************************************************************************************************************************
6134* x87 FPU Addition *
6135*********************************************************************************************************************************/
6136
6137/** Worker for iemAImpl_fadd_r80_by_r80. */
6138static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6139 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6140{
6141 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6142 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6143 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6144}
6145
6146
6147IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6148 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6149{
6150 uint16_t const fFcw = pFpuState->FCW;
6151 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6152
6153 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6154 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6155 {
6156 if (fFcw & X86_FCW_IM)
6157 pFpuRes->r80Result = g_r80Indefinite;
6158 else
6159 {
6160 pFpuRes->r80Result = *pr80Val1;
6161 fFsw |= X86_FSW_ES | X86_FSW_B;
6162 }
6163 fFsw |= X86_FSW_IE;
6164 }
6165 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6166 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6167 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6168 {
6169 if (fFcw & X86_FCW_DM)
6170 {
6171 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6172 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6173 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6174 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6175 }
6176 else
6177 {
6178 pFpuRes->r80Result = *pr80Val1;
6179 fFsw |= X86_FSW_ES | X86_FSW_B;
6180 }
6181 fFsw |= X86_FSW_DE;
6182 }
6183 /* SoftFloat can handle the rest: */
6184 else
6185 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6186
6187 pFpuRes->FSW = fFsw;
6188}
6189
6190
/*
 * Additional add entry points (r64, r32, i32 and i16 second operand),
 * generated by the EMIT_R80_BY_* macros on top of iemAImpl_fadd_r80_by_r80.
 * NOTE(review): the macros are defined outside this section; presumably each
 * one converts the second operand to 80-bit before calling the named
 * r80-by-r80 function - confirm against the macro definitions.
 */
EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6195
6196
6197/*********************************************************************************************************************************
6198* x87 FPU Subtraction *
6199*********************************************************************************************************************************/
6200
6201/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6202static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6203 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6204{
6205 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6206 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6207 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6208}
6209
6210
6211IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6212 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6213{
6214 uint16_t const fFcw = pFpuState->FCW;
6215 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6216
6217 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6218 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6219 {
6220 if (fFcw & X86_FCW_IM)
6221 pFpuRes->r80Result = g_r80Indefinite;
6222 else
6223 {
6224 pFpuRes->r80Result = *pr80Val1;
6225 fFsw |= X86_FSW_ES | X86_FSW_B;
6226 }
6227 fFsw |= X86_FSW_IE;
6228 }
6229 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6230 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6231 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6232 {
6233 if (fFcw & X86_FCW_DM)
6234 {
6235 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6236 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6237 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6238 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6239 }
6240 else
6241 {
6242 pFpuRes->r80Result = *pr80Val1;
6243 fFsw |= X86_FSW_ES | X86_FSW_B;
6244 }
6245 fFsw |= X86_FSW_DE;
6246 }
6247 /* SoftFloat can handle the rest: */
6248 else
6249 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6250
6251 pFpuRes->FSW = fFsw;
6252}
6253
6254
/*
 * Additional subtract entry points (r64, r32, i32 and i16 second operand),
 * generated by the EMIT_R80_BY_* macros on top of iemAImpl_fsub_r80_by_r80.
 * NOTE(review): the macros are defined outside this section; presumably each
 * one converts the second operand to 80-bit before calling the named
 * r80-by-r80 function - confirm against the macro definitions.
 */
EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6259
6260
6261/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6262IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6263 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6264{
6265 uint16_t const fFcw = pFpuState->FCW;
6266 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6267
6268 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6269 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6270 {
6271 if (fFcw & X86_FCW_IM)
6272 pFpuRes->r80Result = g_r80Indefinite;
6273 else
6274 {
6275 pFpuRes->r80Result = *pr80Val1;
6276 fFsw |= X86_FSW_ES | X86_FSW_B;
6277 }
6278 fFsw |= X86_FSW_IE;
6279 }
6280 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6281 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6282 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6283 {
6284 if (fFcw & X86_FCW_DM)
6285 {
6286 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6287 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6288 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6289 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6290 }
6291 else
6292 {
6293 pFpuRes->r80Result = *pr80Val1;
6294 fFsw |= X86_FSW_ES | X86_FSW_B;
6295 }
6296 fFsw |= X86_FSW_DE;
6297 }
6298 /* SoftFloat can handle the rest: */
6299 else
6300 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6301
6302 pFpuRes->FSW = fFsw;
6303}
6304
6305
/*
 * Additional reverse subtract entry points (r64, r32, i32 and i16 second
 * operand), generated by the EMIT_R80_BY_* macros on top of
 * iemAImpl_fsubr_r80_by_r80.
 * NOTE(review): the macros are defined outside this section; presumably each
 * one converts the second operand to 80-bit before calling the named
 * r80-by-r80 function - confirm against the macro definitions.
 */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6310
6311
6312/*********************************************************************************************************************************
*   x87 FPU Trigonometric Operations                                                                                             *
6314*********************************************************************************************************************************/
6315static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6316{
6317 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6318 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6319 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6320 extFloat80_t v;
6321 (void)fFcw;
6322
6323 v = extF80_atan2(y, x, &SoftState);
6324
6325 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6326 return fFsw;
6327}
6328
6329IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6330 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6331{
6332 uint16_t const fFcw = pFpuState->FCW;
6333 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6334
6335 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6336 {
6337 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6338
6339 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6340 if (!(fFcw & X86_FCW_PM))
6341 fFsw |= X86_FSW_ES | X86_FSW_B;
6342 }
6343 else
6344 {
6345 fFsw |= X86_FSW_IE;
6346 if (!(fFcw & X86_FCW_IM))
6347 {
6348 pFpuRes->r80Result = *pr80Val2;
6349 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6350 }
6351 else
6352 {
6353 pFpuRes->r80Result = g_r80Indefinite;
6354 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6355 }
6356 }
6357
6358 pFpuRes->FSW = fFsw;
6359}
6360#endif /* IEM_WITHOUT_ASSEMBLY */
6361
/**
 * The "_intel" flavour of iemAImpl_fpatan_r80_by_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6367
/**
 * The "_amd" flavour of iemAImpl_fpatan_r80_by_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fpatan_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6373
6374
6375#if defined(IEM_WITHOUT_ASSEMBLY)
6376static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6377{
6378 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6379 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6380 extFloat80_t v;
6381 (void)fFcw;
6382
6383 v = extF80_tan(x, &SoftState);
6384
6385 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6386 return fFsw;
6387}
6388
6389IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6390{
6391 uint16_t const fFcw = pFpuState->FCW;
6392 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6393
6394 if (RTFLOAT80U_IS_ZERO(pr80Val))
6395 {
6396 pFpuResTwo->r80Result1 = *pr80Val;
6397 pFpuResTwo->r80Result2 = g_ar80One[0];
6398 }
6399 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6400 {
6401 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6402 {
6403 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6404 pFpuResTwo->r80Result1 = *pr80Val;
6405 }
6406 else
6407 {
6408 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6409 {
6410 pFpuResTwo->r80Result1 = *pr80Val;
6411 }
6412 else
6413 {
6414 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6415 }
6416
6417 pFpuResTwo->r80Result2 = g_ar80One[0];
6418
6419 fFsw |= X86_FSW_PE;
6420 if (!(fFcw & X86_FCW_PM))
6421 fFsw |= X86_FSW_ES | X86_FSW_B;
6422 }
6423 }
6424 else
6425 {
6426 fFsw |= X86_FSW_IE;
6427 if (!(fFcw & X86_FCW_IM))
6428 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6429 }
6430
6431 pFpuResTwo->FSW = fFsw;
6432}
6433#endif /* IEM_WITHOUT_ASSEMBLY */
6434
/**
 * The "_amd" flavour of iemAImpl_fptan_r80_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6439
/**
 * The "_intel" flavour of iemAImpl_fptan_r80_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fptan_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6444
6445#ifdef IEM_WITHOUT_ASSEMBLY
6446
6447static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6448{
6449 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6450 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6451 extFloat80_t v;
6452 (void)fFcw;
6453
6454 v = extF80_sin(x, &SoftState);
6455
6456 iemFpuSoftF80ToIprt(pr80Result, v);
6457
6458 return fFsw;
6459}
6460
6461IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6462{
6463 uint16_t const fFcw = pFpuState->FCW;
6464 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6465
6466 if (RTFLOAT80U_IS_ZERO(pr80Val))
6467 {
6468 pFpuRes->r80Result = *pr80Val;
6469 }
6470 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6471 {
6472 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6473 {
6474 fFsw |= X86_FSW_C2;
6475 pFpuRes->r80Result = *pr80Val;
6476 }
6477 else
6478 {
6479 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6480 {
6481 pFpuRes->r80Result = *pr80Val;
6482 }
6483 else
6484 {
6485 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6486 }
6487 fFsw |= X86_FSW_PE;
6488 if (!(fFcw & X86_FCW_PM))
6489 fFsw |= X86_FSW_ES | X86_FSW_B;
6490 }
6491 }
6492 else if (RTFLOAT80U_IS_INF(pr80Val))
6493 {
6494 fFsw |= X86_FSW_IE;
6495 if (!(fFcw & X86_FCW_IM))
6496 {
6497 fFsw |= X86_FSW_ES | X86_FSW_B;
6498 pFpuRes->r80Result = *pr80Val;
6499 }
6500 else
6501 {
6502 pFpuRes->r80Result = g_r80Indefinite;
6503 }
6504 }
6505 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6506 {
6507 fFsw |= X86_FSW_DE;
6508
6509 if (fFcw & X86_FCW_DM)
6510 {
6511 if (fFcw & X86_FCW_UM)
6512 {
6513 pFpuRes->r80Result = *pr80Val;
6514 }
6515 else
6516 {
6517 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6518 uint64_t uMantissa = pr80Val->s.uMantissa;
6519 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6520
6521 uExponent = 64 - uExponent;
6522 uMantissa <<= uExponent;
6523 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6524
6525 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6526 pFpuRes->r80Result.s.uMantissa = uMantissa;
6527 pFpuRes->r80Result.s.uExponent = uExponent;
6528 }
6529
6530 fFsw |= X86_FSW_UE | X86_FSW_PE;
6531
6532 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6533 {
6534 /* All the exceptions are masked. */
6535 }
6536 else
6537 {
6538 fFsw |= X86_FSW_ES | X86_FSW_B;
6539 }
6540 }
6541 else
6542 {
6543 pFpuRes->r80Result = *pr80Val;
6544
6545 fFsw |= X86_FSW_ES | X86_FSW_B;
6546 }
6547 }
6548 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6549 {
6550 pFpuRes->r80Result = *pr80Val;
6551 fFsw |= X86_FSW_DE;
6552
6553 if (fFcw & X86_FCW_DM)
6554 {
6555 if (fFcw & X86_FCW_PM)
6556 {
6557 fFsw |= X86_FSW_PE;
6558 }
6559 else
6560 {
6561 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6562 }
6563
6564 pFpuRes->r80Result.sj64.uExponent = 1;
6565 }
6566 else
6567 {
6568 fFsw |= X86_FSW_ES | X86_FSW_B;
6569 }
6570 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6571 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6572 {
6573 pFpuRes->r80Result = *pr80Val;
6574 } else {
6575 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6576 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6577 && (fFcw & X86_FCW_IM))
6578 pFpuRes->r80Result = g_r80Indefinite;
6579 else
6580 {
6581 pFpuRes->r80Result = *pr80Val;
6582 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6583 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6584 }
6585
6586 fFsw |= X86_FSW_IE;
6587 if (!(fFcw & X86_FCW_IM))
6588 fFsw |= X86_FSW_ES | X86_FSW_B;
6589 }
6590
6591 pFpuRes->FSW = fFsw;
6592}
6593#endif /* IEM_WITHOUT_ASSEMBLY */
6594
/**
 * The "_amd" flavour of iemAImpl_fsin_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6599
/**
 * The "_intel" flavour of iemAImpl_fsin_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fsin_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6604
6605#ifdef IEM_WITHOUT_ASSEMBLY
6606
6607static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6608{
6609 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6610 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6611 extFloat80_t v;
6612 (void)fFcw;
6613
6614 v = extF80_cos(x, &SoftState);
6615
6616 iemFpuSoftF80ToIprt(pr80Result, v);
6617
6618 return fFsw;
6619}
6620
6621IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6622{
6623 uint16_t const fFcw = pFpuState->FCW;
6624 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6625
6626 if (RTFLOAT80U_IS_ZERO(pr80Val))
6627 {
6628 pFpuRes->r80Result = g_ar80One[0];
6629 }
6630 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6631 {
6632 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6633 {
6634 fFsw |= X86_FSW_C2;
6635 pFpuRes->r80Result = *pr80Val;
6636 }
6637 else
6638 {
6639 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6640 {
6641 pFpuRes->r80Result = g_ar80One[0];
6642
6643 }
6644 else
6645 {
6646 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6647 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6648 }
6649 fFsw |= X86_FSW_PE;
6650 if (!(fFcw & X86_FCW_PM))
6651 fFsw |= X86_FSW_ES | X86_FSW_B;
6652 }
6653 }
6654 else if (RTFLOAT80U_IS_INF(pr80Val))
6655 {
6656 fFsw |= X86_FSW_IE;
6657 if (!(fFcw & X86_FCW_IM))
6658 {
6659 fFsw |= X86_FSW_ES | X86_FSW_B;
6660 pFpuRes->r80Result = *pr80Val;
6661 }
6662 else
6663 {
6664 pFpuRes->r80Result = g_r80Indefinite;
6665 }
6666 }
6667 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6668 {
6669 fFsw |= X86_FSW_DE;
6670
6671 if (fFcw & X86_FCW_DM)
6672 {
6673 pFpuRes->r80Result = g_ar80One[0];
6674
6675 if (fFcw & X86_FCW_PM)
6676 {
6677 fFsw |= X86_FSW_PE;
6678 }
6679 else
6680 {
6681 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6682 }
6683 }
6684 else
6685 {
6686 pFpuRes->r80Result = *pr80Val;
6687 fFsw |= X86_FSW_ES | X86_FSW_B;
6688 }
6689 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6690 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6691 {
6692 pFpuRes->r80Result = *pr80Val;
6693 } else {
6694 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6695 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6696 && (fFcw & X86_FCW_IM))
6697 pFpuRes->r80Result = g_r80Indefinite;
6698 else
6699 {
6700 pFpuRes->r80Result = *pr80Val;
6701 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6702 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6703 }
6704
6705 fFsw |= X86_FSW_IE;
6706 if (!(fFcw & X86_FCW_IM))
6707 fFsw |= X86_FSW_ES | X86_FSW_B;
6708 }
6709
6710 pFpuRes->FSW = fFsw;
6711}
6712#endif /* IEM_WITHOUT_ASSEMBLY */
6713
/**
 * The "_amd" flavour of iemAImpl_fcos_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6718
/**
 * The "_intel" flavour of iemAImpl_fcos_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fcos_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6723
6724#ifdef IEM_WITHOUT_ASSEMBLY
6725
6726static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6727{
6728 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6729 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6730 extFloat80_t r80Sin, r80Cos;
6731 (void)fFcw;
6732
6733 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6734
6735 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6736 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6737
6738 return fFsw;
6739}
6740
6741IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6742{
6743 uint16_t const fFcw = pFpuState->FCW;
6744 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6745
6746 if (RTFLOAT80U_IS_ZERO(pr80Val))
6747 {
6748 pFpuResTwo->r80Result1 = *pr80Val;
6749 pFpuResTwo->r80Result2 = g_ar80One[0];
6750 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6751 }
6752 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6753 {
6754 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6755 {
6756 fFsw |= X86_FSW_C2;
6757
6758 if (fFcw & X86_FCW_IM)
6759 {
6760 pFpuResTwo->r80Result1 = g_r80Indefinite;
6761 }
6762 else
6763 {
6764 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6765 }
6766
6767 pFpuResTwo->r80Result2 = *pr80Val;
6768 }
6769 else
6770 {
6771 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6772
6773 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6774 {
6775 pFpuResTwo->r80Result1 = *pr80Val;
6776 pFpuResTwo->r80Result2 = g_ar80One[0];
6777 }
6778 else
6779 {
6780 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6781 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6782 }
6783 fFsw |= X86_FSW_PE;
6784 if (!(fFcw & X86_FCW_PM))
6785 fFsw |= X86_FSW_ES | X86_FSW_B;
6786 }
6787 }
6788 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6789 {
6790 fFsw |= X86_FSW_DE;
6791
6792 if (fFcw & X86_FCW_DM)
6793 {
6794 pFpuResTwo->r80Result1 = *pr80Val;
6795 pFpuResTwo->r80Result2 = g_ar80One[0];
6796 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6797
6798 if (fFcw & X86_FCW_PM)
6799 {
6800 fFsw |= X86_FSW_PE;
6801 }
6802 else
6803 {
6804 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6805 }
6806
6807 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6808 }
6809 else
6810 {
6811 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6812 pFpuResTwo->r80Result2 = *pr80Val;
6813 fFsw |= X86_FSW_ES | X86_FSW_B;
6814 }
6815 }
6816 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6817 {
6818 fFsw |= X86_FSW_DE;
6819
6820 if (fFcw & X86_FCW_DM)
6821 {
6822 pFpuResTwo->r80Result2 = g_ar80One[0];
6823
6824 if (fFcw & X86_FCW_UM)
6825 {
6826 pFpuResTwo->r80Result1 = *pr80Val;
6827 }
6828 else
6829 {
6830 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6831 uint64_t uMantissa = pr80Val->s.uMantissa;
6832 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6833
6834 uExponent = 64 - uExponent;
6835 uMantissa <<= uExponent;
6836 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6837
6838 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6839 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6840 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6841 }
6842
6843 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6844 fFsw |= X86_FSW_UE | X86_FSW_PE;
6845
6846 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6847 {
6848 /* All the exceptions are masked. */
6849 }
6850 else
6851 {
6852 fFsw |= X86_FSW_ES | X86_FSW_B;
6853 }
6854 }
6855 else
6856 {
6857 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6858 pFpuResTwo->r80Result2 = *pr80Val;
6859 fFsw |= X86_FSW_ES | X86_FSW_B;
6860 }
6861 }
6862 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6863 {
6864 pFpuResTwo->r80Result1 = *pr80Val;
6865 pFpuResTwo->r80Result2 = *pr80Val;
6866 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6867 }
6868 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6869 {
6870 if (fFcw & X86_FCW_IM)
6871 {
6872 pFpuResTwo->r80Result1 = g_r80Indefinite;
6873 pFpuResTwo->r80Result2 = g_r80Indefinite;
6874 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6875 }
6876 else
6877 {
6878 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6879 pFpuResTwo->r80Result2 = *pr80Val;
6880 }
6881
6882 fFsw |= X86_FSW_IE;
6883 if (!(fFcw & X86_FCW_IM))
6884 fFsw |= X86_FSW_ES | X86_FSW_B;
6885 }
6886 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6887 {
6888 pFpuResTwo->r80Result1 = *pr80Val;
6889 pFpuResTwo->r80Result2 = *pr80Val;
6890
6891 if (fFcw & X86_FCW_IM)
6892 {
6893 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6894 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6895 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6896 }
6897 else
6898 {
6899 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6900 pFpuResTwo->r80Result2 = *pr80Val;
6901 }
6902
6903 fFsw |= X86_FSW_IE;
6904 if (!(fFcw & X86_FCW_IM))
6905 fFsw |= X86_FSW_ES | X86_FSW_B;
6906 }
6907 else if (RTFLOAT80U_IS_INF(pr80Val))
6908 {
6909 if (fFcw & X86_FCW_IM)
6910 {
6911 pFpuResTwo->r80Result1 = g_r80Indefinite;
6912 pFpuResTwo->r80Result2 = g_r80Indefinite;
6913 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6914 }
6915 else
6916 {
6917 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6918 pFpuResTwo->r80Result2 = *pr80Val;
6919 }
6920
6921 fFsw |= X86_FSW_IE;
6922 if (!(fFcw & X86_FCW_IM))
6923 fFsw |= X86_FSW_ES | X86_FSW_B;
6924 }
6925
6926 pFpuResTwo->FSW = fFsw;
6927}
6928#endif /* IEM_WITHOUT_ASSEMBLY */
6929
/**
 * The "_amd" flavour of iemAImpl_fsincos_r80_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6934
/**
 * The "_intel" flavour of iemAImpl_fsincos_r80_r80.
 *
 * Currently forwards all arguments unchanged to iemAImpl_fsincos_r80_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6939
6940#ifdef IEM_WITHOUT_ASSEMBLY
6941
6942
6943/*********************************************************************************************************************************
6944* x87 FPU Compare and Testing Operations *
6945*********************************************************************************************************************************/
6946
6947IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6948{
6949 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6950
6951 if (RTFLOAT80U_IS_ZERO(pr80Val))
6952 fFsw |= X86_FSW_C3;
6953 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6954 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6955 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6956 {
6957 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6958 if (!(pFpuState->FCW & X86_FCW_DM))
6959 fFsw |= X86_FSW_ES | X86_FSW_B;
6960 }
6961 else
6962 {
6963 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6964 if (!(pFpuState->FCW & X86_FCW_IM))
6965 fFsw |= X86_FSW_ES | X86_FSW_B;
6966 }
6967
6968 *pu16Fsw = fFsw;
6969}
6970
6971
6972IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6973{
6974 RT_NOREF(pFpuState);
6975 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6976
6977 /* C1 = sign bit (always, even if empty Intel says). */
6978 if (pr80Val->s.fSign)
6979 fFsw |= X86_FSW_C1;
6980
6981 /* Classify the value in C0, C2, C3. */
6982 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6983 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6984 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6985 fFsw |= X86_FSW_C2;
6986 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6987 fFsw |= X86_FSW_C3;
6988 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6989 fFsw |= X86_FSW_C0;
6990 else if (RTFLOAT80U_IS_INF(pr80Val))
6991 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6992 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6993 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6994 /* whatever else: 0 */
6995
6996 *pu16Fsw = fFsw;
6997}
6998
6999
7000/**
7001 * Worker for fcom, fucom, and friends.
7002 */
7003static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7004 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
7005{
7006 /*
7007 * Unpack the values.
7008 */
7009 bool const fSign1 = pr80Val1->s.fSign;
7010 int32_t iExponent1 = pr80Val1->s.uExponent;
7011 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
7012
7013 bool const fSign2 = pr80Val2->s.fSign;
7014 int32_t iExponent2 = pr80Val2->s.uExponent;
7015 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
7016
7017 /*
7018 * Check for invalid inputs.
7019 */
7020 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
7021 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
7022 {
7023 if (!(fFcw & X86_FCW_IM))
7024 fFsw |= X86_FSW_ES | X86_FSW_B;
7025 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7026 }
7027
7028 /*
7029 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
7030 */
7031 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7032 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7033 {
7034 if ( fIeOnAllNaNs
7035 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7036 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7037 {
7038 fFsw |= X86_FSW_IE;
7039 if (!(fFcw & X86_FCW_IM))
7040 fFsw |= X86_FSW_ES | X86_FSW_B;
7041 }
7042 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
7043 }
7044
7045 /*
7046 * Normalize the values.
7047 */
7048 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7049 {
7050 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7051 iExponent1 = 1;
7052 else
7053 {
7054 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7055 uMantissa1 <<= iExponent1;
7056 iExponent1 = 1 - iExponent1;
7057 }
7058 fFsw |= X86_FSW_DE;
7059 if (!(fFcw & X86_FCW_DM))
7060 fFsw |= X86_FSW_ES | X86_FSW_B;
7061 }
7062
7063 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7064 {
7065 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7066 iExponent2 = 1;
7067 else
7068 {
7069 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7070 uMantissa2 <<= iExponent2;
7071 iExponent2 = 1 - iExponent2;
7072 }
7073 fFsw |= X86_FSW_DE;
7074 if (!(fFcw & X86_FCW_DM))
7075 fFsw |= X86_FSW_ES | X86_FSW_B;
7076 }
7077
7078 /*
7079 * Test if equal (val1 == val2):
7080 */
7081 if ( uMantissa1 == uMantissa2
7082 && iExponent1 == iExponent2
7083 && ( fSign1 == fSign2
7084 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7085 fFsw |= X86_FSW_C3;
7086 /*
7087 * Test if less than (val1 < val2):
7088 */
7089 else if (fSign1 && !fSign2)
7090 fFsw |= X86_FSW_C0;
7091 else if (fSign1 == fSign2)
7092 {
7093 /* Zeros are problematic, however at the most one can be zero here. */
7094 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7095 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7096 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7097 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7098
7099 if ( fSign1
7100 ^ ( iExponent1 < iExponent2
7101 || ( iExponent1 == iExponent2
7102 && uMantissa1 < uMantissa2 ) ) )
7103 fFsw |= X86_FSW_C0;
7104 }
7105 /* else: No flags set if greater. */
7106
7107 return fFsw;
7108}
7109
7110
/**
 * Compares two 80-bit values, raising \#IE on any kind of NaN operand.
 *
 * Thin wrapper around iemAImpl_fcom_r80_by_r80_worker, called with an initial
 * FSW of TOP=6 and fIeOnAllNaNs set.
 *
 * @param   pFpuState   The FPU state, only FCW is read.
 * @param   pfFsw       Where to return the FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
}
7116
7117
7118
7119
/**
 * Unordered compare of two 80-bit values; per the worker, \#IE is only raised
 * for signalling NaNs (and invalid encodings).
 *
 * Thin wrapper around iemAImpl_fcom_r80_by_r80_worker, called with an initial
 * FSW of TOP=6 and fIeOnAllNaNs clear.
 *
 * @param   pFpuState   The FPU state, only FCW is read.
 * @param   pfFsw       Where to return the FSW.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
}
7125
7126
7127IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7128 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7129{
7130 RTFLOAT80U r80Val2;
7131 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7132 Assert(!fFsw || fFsw == X86_FSW_DE);
7133 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7134 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7135 {
7136 if (!(pFpuState->FCW & X86_FCW_DM))
7137 fFsw |= X86_FSW_ES | X86_FSW_B;
7138 *pfFsw |= fFsw;
7139 }
7140}
7141
7142
7143IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7144 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7145{
7146 RTFLOAT80U r80Val2;
7147 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7148 Assert(!fFsw || fFsw == X86_FSW_DE);
7149 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7150 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7151 {
7152 if (!(pFpuState->FCW & X86_FCW_DM))
7153 fFsw |= X86_FSW_ES | X86_FSW_B;
7154 *pfFsw |= fFsw;
7155 }
7156}
7157
7158
7159IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7160 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7161{
7162 RTFLOAT80U r80Val2;
7163 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7164 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7165}
7166
7167
7168IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7169 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7170{
7171 RTFLOAT80U r80Val2;
7172 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7173 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7174}
7175
7176
7177/**
7178 * Worker for fcomi & fucomi.
7179 */
7180static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7181 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7182{
7183 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7184 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7185 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7186 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7187
7188 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7189 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7190 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7191}
7192
7193
7194IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7195 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7196{
7197 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7198}
7199
7200
7201IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7202 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7203{
7204 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7205}
7206
7207
7208/*********************************************************************************************************************************
7209* x87 FPU Other Operations *
7210*********************************************************************************************************************************/
7211
7212/**
7213 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7214 */
7215static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7216{
7217 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7218 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7219 true /*exact / generate #PE */, &SoftState));
7220 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7221}
7222
7223
7224IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7225{
7226 uint16_t const fFcw = pFpuState->FCW;
7227 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7228
7229 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7230 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7231 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7232 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7233 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7234 || RTFLOAT80U_IS_INF(pr80Val))
7235 pFpuRes->r80Result = *pr80Val;
7236 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7237 {
7238 fFsw |= X86_FSW_DE;
7239 if (fFcw & X86_FCW_DM)
7240 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7241 else
7242 {
7243 pFpuRes->r80Result = *pr80Val;
7244 fFsw |= X86_FSW_ES | X86_FSW_B;
7245 }
7246 }
7247 else
7248 {
7249 if (fFcw & X86_FCW_IM)
7250 {
7251 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7252 pFpuRes->r80Result = g_r80Indefinite;
7253 else
7254 {
7255 pFpuRes->r80Result = *pr80Val;
7256 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7257 }
7258 }
7259 else
7260 {
7261 pFpuRes->r80Result = *pr80Val;
7262 fFsw |= X86_FSW_ES | X86_FSW_B;
7263 }
7264 fFsw |= X86_FSW_IE;
7265 }
7266 pFpuRes->FSW = fFsw;
7267}
7268
7269
7270IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7271 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7272{
7273 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7274 it does everything we need it to do. */
7275 uint16_t const fFcw = pFpuState->FCW;
7276 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7277 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7278 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7279 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7280}
7281
7282
7283/**
7284 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7285 */
7286static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7287{
7288 Assert(!pr80Val->s.fSign);
7289 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7290 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7291 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7292}
7293
7294
7295IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7296{
7297 uint16_t const fFcw = pFpuState->FCW;
7298 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7299
7300 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7301 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7302 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7303 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7304 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7305 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7306 pFpuRes->r80Result = *pr80Val;
7307 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7308 {
7309 fFsw |= X86_FSW_DE;
7310 if (fFcw & X86_FCW_DM)
7311 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7312 else
7313 {
7314 pFpuRes->r80Result = *pr80Val;
7315 fFsw |= X86_FSW_ES | X86_FSW_B;
7316 }
7317 }
7318 else
7319 {
7320 if (fFcw & X86_FCW_IM)
7321 {
7322 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7323 pFpuRes->r80Result = g_r80Indefinite;
7324 else
7325 {
7326 pFpuRes->r80Result = *pr80Val;
7327 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7328 }
7329 }
7330 else
7331 {
7332 pFpuRes->r80Result = *pr80Val;
7333 fFsw |= X86_FSW_ES | X86_FSW_B;
7334 }
7335 fFsw |= X86_FSW_IE;
7336 }
7337 pFpuRes->FSW = fFsw;
7338}
7339
7340
7341/**
7342 * @code{.unparsed}
7343 * x x * ln2
7344 * f(x) = 2 - 1 = e - 1
7345 *
7346 * @endcode
7347 *
7348 * We can approximate e^x by a Taylor/Maclaurin series (see
7349 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7350 * @code{.unparsed}
7351 * n 0 1 2 3 4
7352 * inf x x x x x x
7353 * SUM ----- = --- + --- + --- + --- + --- + ...
7354 * n=0 n! 0! 1! 2! 3! 4!
7355 *
7356 * 2 3 4
7357 * x x x
7358 * = 1 + x + --- + --- + --- + ...
7359 * 2! 3! 4!
7360 * @endcode
7361 *
7362 * Given z = x * ln2, we get:
7363 * @code{.unparsed}
7364 * 2 3 4 n
7365 * z z z z z
7366 * e - 1 = z + --- + --- + --- + ... + ---
7367 * 2! 3! 4! n!
7368 * @endcode
7369 *
7370 * Wanting to use Horner's method, we move one z outside and get:
7371 * @code{.unparsed}
7372 * 2 3 (n-1)
7373 * z z z z
7374 * = z ( 1 + --- + --- + --- + ... + ------- )
7375 * 2! 3! 4! n!
7376 * @endcode
7377 *
7378 * The constants we need for using Horner's methods are 1 and 1 / n!.
7379 *
7380 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7381 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7382 * and can approximate it to be 1.0. For a visual demonstration of this
7383 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7384 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7385 *
7386 *
7387 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7388 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7389 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7390 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7391 * blocks). (The one bit difference is probably an implicit one missing from
7392 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7393 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7394 * exponent.
7395 *
7396 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7397 * successfully reproduced the exact results from an Intel 10980XE, there is
7398 * always a portition of rounding differences. Not going to spend too much time
7399 * on getting this 100% the same, at least not now.
7400 *
7401 * P.S. If someone are really curious about 8087 and its contstants:
7402 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7403 *
7404 *
7405 * @param pr80Val The exponent value (x), less than 1.0, greater than
7406 * -1.0 and not zero. This can be a normal, denormal
7407 * or pseudo-denormal value.
7408 * @param pr80Result Where to return the result.
7409 * @param fFcw FPU control word.
7410 * @param fFsw FPU status word.
7411 */
7412static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7413{
7414 /* As mentioned above, we can skip the expensive polynomial calculation
7415 as it will be close enough to 1.0 that it makes no difference.
7416
7417 The cutoff point for intel 10980XE is exponents >= -69. Intel
7418 also seems to be using a 67-bit or 68-bit constant value, and we get
7419 a smattering of rounding differences if we go for higher precision. */
7420 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7421 {
7422 RTUINT256U u256;
7423 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7424 u256.QWords.qw0 |= 1; /* force #PE */
7425 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7426 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7427 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7428 : 1 - RTFLOAT80U_EXP_BIAS,
7429 fFcw, fFsw);
7430 }
7431 else
7432 {
7433#ifdef IEM_WITH_FLOAT128_FOR_FPU
7434 /* This approach is not good enough for small values, we end up with zero. */
7435 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7436 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7437 _Float128 rd128Result = powf128(2.0L, rd128Val);
7438 rd128Result -= 1.0L;
7439 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7440 iemFpuF128RestoreRounding(fOldRounding);
7441
7442# else
7443 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7444 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7445
7446 /* As mentioned above, enforce 68-bit internal mantissa width to better
7447 match the Intel 10980XE results. */
7448 unsigned const cPrecision = 68;
7449
7450 /* first calculate z = x * ln2 */
7451 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7452 cPrecision);
7453
7454 /* Then do the polynomial evaluation. */
7455 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7456 cPrecision, &SoftState);
7457 r = f128_mul(z, r, &SoftState);
7458
7459 /* Output the result. */
7460 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7461# endif
7462 }
7463 return fFsw;
7464}
7465
7466
7467IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7468{
7469 uint16_t const fFcw = pFpuState->FCW;
7470 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7471
7472 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7473 {
7474 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7475 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7476 else
7477 {
7478 /* Special case:
7479 2^+1.0 - 1.0 = 1.0
7480 2^-1.0 - 1.0 = -0.5 */
7481 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7482 && pr80Val->s.uMantissa == RT_BIT_64(63))
7483 {
7484 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7485 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7486 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7487 }
7488 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7489 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7490 else
7491 pFpuRes->r80Result = *pr80Val;
7492 fFsw |= X86_FSW_PE;
7493 if (!(fFcw & X86_FCW_PM))
7494 fFsw |= X86_FSW_ES | X86_FSW_B;
7495 }
7496 }
7497 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7498 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7499 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7500 pFpuRes->r80Result = *pr80Val;
7501 else if (RTFLOAT80U_IS_INF(pr80Val))
7502 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7503 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7504 {
7505 fFsw |= X86_FSW_DE;
7506 if (fFcw & X86_FCW_DM)
7507 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7508 else
7509 {
7510 pFpuRes->r80Result = *pr80Val;
7511 fFsw |= X86_FSW_ES | X86_FSW_B;
7512 }
7513 }
7514 else
7515 {
7516 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7517 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7518 && (fFcw & X86_FCW_IM))
7519 pFpuRes->r80Result = g_r80Indefinite;
7520 else
7521 {
7522 pFpuRes->r80Result = *pr80Val;
7523 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7524 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7525 }
7526 fFsw |= X86_FSW_IE;
7527 if (!(fFcw & X86_FCW_IM))
7528 fFsw |= X86_FSW_ES | X86_FSW_B;
7529 }
7530 pFpuRes->FSW = fFsw;
7531}
7532
7533#endif /* IEM_WITHOUT_ASSEMBLY */
7534
7535IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7536{
7537 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7538}
7539
7540IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7541{
7542 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7543}
7544
7545#ifdef IEM_WITHOUT_ASSEMBLY
7546
7547IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7548{
7549 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7550 pFpuRes->r80Result = *pr80Val;
7551 pFpuRes->r80Result.s.fSign = 0;
7552}
7553
7554
7555IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7556{
7557 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7558 pFpuRes->r80Result = *pr80Val;
7559 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7560}
7561
7562
7563IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7564{
7565 uint16_t const fFcw = pFpuState->FCW;
7566 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7567
7568 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7569 {
7570 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7571 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7572
7573 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7574 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7575 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7576 }
7577 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7578 {
7579 fFsw |= X86_FSW_ZE;
7580 if (fFcw & X86_FCW_ZM)
7581 {
7582 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7583 pFpuResTwo->r80Result2 = *pr80Val;
7584 }
7585 else
7586 {
7587 pFpuResTwo->r80Result2 = *pr80Val;
7588 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7589 }
7590 }
7591 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7592 {
7593 fFsw |= X86_FSW_DE;
7594 if (fFcw & X86_FCW_DM)
7595 {
7596 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7597 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7598 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7599 int32_t iExponent = -16382;
7600 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7601 {
7602 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7603 iExponent--;
7604 }
7605
7606 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7607 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7608 }
7609 else
7610 {
7611 pFpuResTwo->r80Result2 = *pr80Val;
7612 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7613 }
7614 }
7615 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7616 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7617 {
7618 pFpuResTwo->r80Result1 = *pr80Val;
7619 pFpuResTwo->r80Result2 = *pr80Val;
7620 }
7621 else if (RTFLOAT80U_IS_INF(pr80Val))
7622 {
7623 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7624 pFpuResTwo->r80Result2 = *pr80Val;
7625 }
7626 else
7627 {
7628 if (fFcw & X86_FCW_IM)
7629 {
7630 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7631 pFpuResTwo->r80Result1 = g_r80Indefinite;
7632 else
7633 {
7634 pFpuResTwo->r80Result1 = *pr80Val;
7635 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7636 }
7637 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7638 }
7639 else
7640 {
7641 pFpuResTwo->r80Result2 = *pr80Val;
7642 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7643 }
7644 fFsw |= X86_FSW_IE;
7645 }
7646 pFpuResTwo->FSW = fFsw;
7647}
7648#endif /* IEM_WITHOUT_ASSEMBLY */
7649
7650#if defined(IEM_WITHOUT_ASSEMBLY)
7651
7652static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7653{
7654 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7655 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7656 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7657 extFloat80_t v;
7658 (void)fFcw;
7659
7660 v = extF80_ylog2x(y, x, &SoftState);
7661 iemFpuSoftF80ToIprt(pr80Result, v);
7662
7663 return fFsw;
7664}
7665
7666IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7667 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7668{
7669 uint16_t const fFcw = pFpuState->FCW;
7670 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7671
7672 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7673 {
7674 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7675
7676 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7677 if (!(fFcw & X86_FCW_PM))
7678 fFsw |= X86_FSW_ES | X86_FSW_B;
7679 }
7680 else
7681 {
7682 fFsw |= X86_FSW_IE;
7683
7684 if (!(fFcw & X86_FCW_IM))
7685 {
7686 pFpuRes->r80Result = *pr80Val2;
7687 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7688 }
7689 else
7690 {
7691 pFpuRes->r80Result = g_r80Indefinite;
7692 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7693 }
7694 }
7695
7696 pFpuRes->FSW = fFsw;
7697}
7698#endif /* IEM_WITHOUT_ASSEMBLY */
7699
7700IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7701 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7702{
7703 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7704}
7705
7706IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7707 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7708{
7709 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7710}
7711
7712#if defined(IEM_WITHOUT_ASSEMBLY)
7713
7714static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7715{
7716 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7717 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7718 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7719 extFloat80_t v;
7720 (void)fFcw;
7721
7722 v = extF80_ylog2xp1(y, x, &SoftState);
7723 iemFpuSoftF80ToIprt(pr80Result, v);
7724
7725 return fFsw;
7726}
7727
7728IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7730{
7731 uint16_t const fFcw = pFpuState->FCW;
7732 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7733
7734 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7735 {
7736 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7737
7738 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7739 if (!(fFcw & X86_FCW_PM))
7740 fFsw |= X86_FSW_ES | X86_FSW_B;
7741 }
7742 else
7743 {
7744 fFsw |= X86_FSW_IE;
7745
7746 if (!(fFcw & X86_FCW_IM))
7747 {
7748 pFpuRes->r80Result = *pr80Val2;
7749 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7750 }
7751 else
7752 {
7753 pFpuRes->r80Result = g_r80Indefinite;
7754 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7755 }
7756 }
7757
7758 pFpuRes->FSW = fFsw;
7759}
7760
7761#endif /* IEM_WITHOUT_ASSEMBLY */
7762
7763IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7764 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7765{
7766 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7767}
7768
7769IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7770 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7771{
7772 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7773}
7774
7775
7776/*********************************************************************************************************************************
7777* MMX, SSE & AVX *
7778*********************************************************************************************************************************/
7779
/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
 */
7783#ifdef IEM_WITHOUT_ASSEMBLY
7784
7785IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(uint64_t *puDst, uint64_t const *puSrc))
7786{
7787 *puDst &= *puSrc;
7788}
7789
7790
7791IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7792{
7793 puDst->au64[0] &= puSrc->au64[0];
7794 puDst->au64[1] &= puSrc->au64[1];
7795}
7796
7797#endif
7798
7799IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7800{
7801 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7802 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7803}
7804
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7807{
7808 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7809 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7810 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7811 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7812}
7813
7814
/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
 */
7818#ifdef IEM_WITHOUT_ASSEMBLY
7819
7820IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(uint64_t *puDst, uint64_t const *puSrc))
7821{
7822 *puDst = ~*puDst & *puSrc;
7823}
7824
7825
7826IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7827{
7828 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7829 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7830}
7831
7832#endif
7833
7834IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7835{
7836 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7837 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7838}
7839
7840
7841IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7842{
7843 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7844 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7845 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7846 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7847}
7848
7849
/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
 */
7853#ifdef IEM_WITHOUT_ASSEMBLY
7854
7855IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(uint64_t *puDst, uint64_t const *puSrc))
7856{
7857 *puDst |= *puSrc;
7858}
7859
7860
7861IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7862{
7863 puDst->au64[0] |= puSrc->au64[0];
7864 puDst->au64[1] |= puSrc->au64[1];
7865}
7866
7867#endif
7868
7869IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7870{
7871 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7872 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7873}
7874
7875
7876IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7877{
7878 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7879 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7880 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7881 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7882}
7883
7884
/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
 */
7888#ifdef IEM_WITHOUT_ASSEMBLY
7889
7890IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(uint64_t *puDst, uint64_t const *puSrc))
7891{
7892 *puDst ^= *puSrc;
7893}
7894
7895
7896IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7897{
7898 puDst->au64[0] ^= puSrc->au64[0];
7899 puDst->au64[1] ^= puSrc->au64[1];
7900}
7901
7902#endif
7903
7904IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7905{
7906 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7907 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7908}
7909
7910
7911IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7912{
7913 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7914 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7915 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7916 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7917}
7918
7919
7920/*
7921 * PCMPEQB / VPCMPEQB
7922 */
7923#ifdef IEM_WITHOUT_ASSEMBLY
7924
7925IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(uint64_t *puDst, uint64_t const *puSrc))
7926{
7927 RTUINT64U uSrc1 = { *puDst };
7928 RTUINT64U uSrc2 = { *puSrc };
7929 RTUINT64U uDst;
7930 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7931 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7932 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7933 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7934 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7935 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7936 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7937 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7938 *puDst = uDst.u;
7939}
7940
7941
7942IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7943{
7944 RTUINT128U uSrc1 = *puDst;
7945 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7946 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7947 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7948 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7949 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7950 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7951 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7952 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7953 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7954 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7955 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7956 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7957 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7958 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7959 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7960 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7961}
7962
7963#endif
7964
7965IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7966{
7967 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7968 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7969 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7970 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7971 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7972 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7973 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7974 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7975 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7976 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7977 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7978 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7979 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7980 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7981 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7982 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7983}
7984
7985IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7986{
7987 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7988 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7989 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7990 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7991 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7992 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7993 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7994 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7995 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7996 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7997 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7998 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7999 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8000 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8001 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8002 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8003 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8004 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8005 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8006 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8007 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8008 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8009 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8010 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8011 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8012 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8013 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8014 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8015 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8016 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8017 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8018 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8019}
8020
8021
8022/*
8023 * PCMPEQW / VPCMPEQW
8024 */
8025#ifdef IEM_WITHOUT_ASSEMBLY
8026
8027IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8028{
8029 RTUINT64U uSrc1 = { *puDst };
8030 RTUINT64U uSrc2 = { *puSrc };
8031 RTUINT64U uDst;
8032 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8033 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8034 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8035 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8036 *puDst = uDst.u;
8037}
8038
8039
8040IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8041{
8042 RTUINT128U uSrc1 = *puDst;
8043 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8044 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8045 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8046 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8047 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8048 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8049 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8050 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8051}
8052
8053#endif
8054
8055IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8056{
8057 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8058 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8059 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8060 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8061 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8062 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8063 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8064 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8065}
8066
8067IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8068{
8069 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8070 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8071 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8072 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8073 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8074 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8075 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8076 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8077 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8078 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8079 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8080 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8081 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8082 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8083 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8084 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8085}
8086
8087
8088/*
8089 * PCMPEQD / VPCMPEQD.
8090 */
8091#ifdef IEM_WITHOUT_ASSEMBLY
8092
8093IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8094{
8095 RTUINT64U uSrc1 = { *puDst };
8096 RTUINT64U uSrc2 = { *puSrc };
8097 RTUINT64U uDst;
8098 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8099 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8100 *puDst = uDst.u;
8101}
8102
8103
8104IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8105{
8106 RTUINT128U uSrc1 = *puDst;
8107 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8108 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8109 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8110 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8111}
8112
8113#endif /* IEM_WITHOUT_ASSEMBLY */
8114
8115IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8116{
8117 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8118 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8119 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8120 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8121}
8122
8123IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8124{
8125 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8126 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8127 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8128 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8129 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8130 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8131 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8132 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8133}
8134
8135
8136/*
8137 * PCMPEQQ / VPCMPEQQ.
8138 */
8139IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8140{
8141 RTUINT128U uSrc1 = *puDst;
8142 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8143 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8144}
8145
8146IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8147{
8148 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8149 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8150}
8151
8152IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8153{
8154 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8155 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8156 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8157 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8158}
8159
8160
8161/*
8162 * PCMPGTB / VPCMPGTB
8163 */
8164#ifdef IEM_WITHOUT_ASSEMBLY
8165
8166IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8167{
8168 RTUINT64U uSrc1 = { *puDst };
8169 RTUINT64U uSrc2 = { *puSrc };
8170 RTUINT64U uDst;
8171 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8172 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8173 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8174 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8175 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8176 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8177 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8178 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8179 *puDst = uDst.u;
8180}
8181
8182
8183IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8184{
8185 RTUINT128U uSrc1 = *puDst;
8186 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8187 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8188 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8189 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8190 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8191 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8192 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8193 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8194 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8195 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8196 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8197 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8198 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8199 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8200 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8201 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8202}
8203
8204#endif
8205
8206IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8207{
8208 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8209 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8210 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8211 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8212 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8213 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8214 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8215 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8216 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8217 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8218 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8219 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8220 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8221 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8222 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8223 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8224}
8225
8226IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8227{
8228 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8229 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8230 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8231 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8232 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8233 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8234 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8235 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8236 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8237 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8238 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8239 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8240 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8241 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8242 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8243 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8244 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8245 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8246 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8247 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8248 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8249 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8250 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8251 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8252 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8253 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8254 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8255 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8256 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8257 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8258 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8259 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8260}
8261
8262
8263/*
8264 * PCMPGTW / VPCMPGTW
8265 */
8266#ifdef IEM_WITHOUT_ASSEMBLY
8267
8268IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8269{
8270 RTUINT64U uSrc1 = { *puDst };
8271 RTUINT64U uSrc2 = { *puSrc };
8272 RTUINT64U uDst;
8273 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8274 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8275 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8276 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8277 *puDst = uDst.u;
8278}
8279
8280
8281IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8282{
8283 RTUINT128U uSrc1 = *puDst;
8284 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8285 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8286 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8287 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8288 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8289 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8290 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8291 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8292}
8293
8294#endif
8295
8296IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8297{
8298 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8299 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8300 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8301 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8302 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8303 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8304 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8305 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8306}
8307
8308IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8309{
8310 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8311 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8312 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8313 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8314 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8315 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8316 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8317 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8318 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8319 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8320 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8321 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8322 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8323 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8324 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8325 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8326}
8327
8328
8329/*
8330 * PCMPGTD / VPCMPGTD.
8331 */
8332#ifdef IEM_WITHOUT_ASSEMBLY
8333
8334IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8335{
8336 RTUINT64U uSrc1 = { *puDst };
8337 RTUINT64U uSrc2 = { *puSrc };
8338 RTUINT64U uDst;
8339 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8340 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8341 *puDst = uDst.u;
8342}
8343
8344
8345IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8346{
8347 RTUINT128U uSrc1 = *puDst;
8348 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8349 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8350 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8351 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8352}
8353
8354#endif /* IEM_WITHOUT_ASSEMBLY */
8355
8356IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8357{
8358 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8359 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8360 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8361 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8362}
8363
8364IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8365{
8366 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8367 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8368 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8369 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8370 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8371 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8372 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8373 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8374}
8375
8376
8377/*
8378 * PCMPGTQ / VPCMPGTQ.
8379 */
8380IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8381{
8382 RTUINT128U uSrc1 = *puDst;
8383 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8384 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8385}
8386
8387IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8388{
8389 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8390 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8391}
8392
8393IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8394{
8395 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8396 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8397 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8398 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8399}
8400
8401
8402/*
8403 * PADDB / VPADDB
8404 */
8405#ifdef IEM_WITHOUT_ASSEMBLY
8406
8407IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8408{
8409 RTUINT64U uSrc1 = { *puDst };
8410 RTUINT64U uSrc2 = { *puSrc };
8411 RTUINT64U uDst;
8412 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8413 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8414 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8415 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8416 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8417 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8418 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8419 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8420 *puDst = uDst.u;
8421}
8422
8423
8424IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8425{
8426 RTUINT128U uSrc1 = *puDst;
8427 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8428 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8429 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8430 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8431 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8432 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8433 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8434 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8435 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8436 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8437 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8438 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8439 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8440 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8441 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8442 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8443}
8444
8445#endif
8446
8447
8448IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8449{
8450 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8451 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8452 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8453 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8454 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8455 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8456 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8457 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8458 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8459 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8460 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8461 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8462 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8463 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8464 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8465 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8466}
8467
8468IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8469{
8470 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8471 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8472 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8473 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8474 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8475 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8476 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8477 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8478 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8479 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8480 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8481 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8482 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8483 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8484 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8485 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8486 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8487 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8488 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8489 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8490 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8491 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8492 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8493 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8494 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8495 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8496 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8497 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8498 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8499 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8500 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8501 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8502}
8503
8504
8505/*
8506 * PADDSB / VPADDSB
8507 */
/**
 * Clamps a signed 16-bit quantity to the int8_t range and yields the raw byte.
 *
 * Adding 0x80 maps the representable range -128..127 onto 0..255, so a single
 * unsigned 16-bit compare detects both overflow directions.  Out-of-range
 * values yield 0x7f (INT8_MAX) or, when bit 15 of the input is set, 0x80
 * (INT8_MIN).
 *
 * @note    The argument is expanded more than once; pass side-effect free
 *          expressions only.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* bit 15 = sign: 0x7f + 1 = 0x80 for negative input */ \
      : (uint8_t)(a_iWord) )
8512
8513#ifdef IEM_WITHOUT_ASSEMBLY
8514
8515IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8516{
8517 RTUINT64U uSrc1 = { *puDst };
8518 RTUINT64U uSrc2 = { *puSrc };
8519 RTUINT64U uDst;
8520 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8521 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8522 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8523 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8524 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8525 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8526 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8527 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8528 *puDst = uDst.u;
8529}
8530
8531
8532IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8533{
8534 RTUINT128U uSrc1 = *puDst;
8535 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8536 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8537 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8538 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8539 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8540 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8541 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8542 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8543 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8544 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8545 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8546 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8547 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8548 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8549 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8550 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8551}
8552
8553#endif
8554
8555IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8556 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8557{
8558 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8559 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8560 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8561 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8562 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8563 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8564 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8565 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8566 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8567 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8568 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8569 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8570 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8571 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8572 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8573 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8574}
8575
8576IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8577 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8578{
8579 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8580 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8581 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8582 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8583 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8584 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8585 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8586 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8587 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8588 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8589 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8590 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8591 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8592 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8593 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8594 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8595 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8596 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8597 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8598 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8599 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8600 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8601 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8602 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8603 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8604 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8605 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8606 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8607 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8608 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8609 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8610 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8611}
8612
8613
8614/*
8615 * PADDUSB / VPADDUSB
8616 */
/**
 * Clamps an unsigned 16-bit quantity to the uint8_t range.
 *
 * Values above 0xff (UINT8_MAX) yield 0xff; anything else is passed through
 * as a byte.
 *
 * @note    The argument is expanded more than once; pass side-effect free
 *          expressions only.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8621
8622#ifdef IEM_WITHOUT_ASSEMBLY
8623
8624IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8625{
8626 RTUINT64U uSrc1 = { *puDst };
8627 RTUINT64U uSrc2 = { *puSrc };
8628 RTUINT64U uDst;
8629 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8630 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8631 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8632 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8633 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8634 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8635 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8636 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8637 *puDst = uDst.u;
8638}
8639
8640
8641IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8642{
8643 RTUINT128U uSrc1 = *puDst;
8644 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8645 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8646 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8647 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8648 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8649 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8650 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8651 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8652 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8653 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8654 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8655 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8656 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8657 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8658 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8659 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8660}
8661
8662#endif
8663
8664IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8665 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8666{
8667 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8668 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8669 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8670 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8671 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8672 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8673 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8674 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8675 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8676 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8677 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8678 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8679 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8680 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8681 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8682 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8683}
8684
8685IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8686 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8687{
8688 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8689 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8690 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8691 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8692 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8693 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8694 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8695 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8696 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8697 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8698 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8699 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8700 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8701 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8702 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8703 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8704 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8705 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8706 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8707 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8708 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8709 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8710 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8711 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8712 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8713 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8714 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8715 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8716 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8717 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8718 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8719 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8720}
8721
8722
8723/*
8724 * PADDW / VPADDW
8725 */
8726#ifdef IEM_WITHOUT_ASSEMBLY
8727
8728IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8729{
8730 RTUINT64U uSrc1 = { *puDst };
8731 RTUINT64U uSrc2 = { *puSrc };
8732 RTUINT64U uDst;
8733 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8734 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8735 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8736 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8737 *puDst = uDst.u;
8738}
8739
8740
8741IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8742{
8743 RTUINT128U uSrc1 = *puDst;
8744 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8745 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8746 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8747 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8748 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8749 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8750 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8751 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8752}
8753
8754#endif
8755
8756
8757IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8758{
8759 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8760 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8761 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8762 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8763 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8764 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8765 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8766 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8767}
8768
8769IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8770{
8771 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8772 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8773 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8774 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8775 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8776 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8777 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8778 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8779 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8780 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8781 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8782 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8783 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8784 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8785 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8786 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8787}
8788
8789
8790/*
8791 * PADDSW / VPADDSW
8792 */
/**
 * Clamps a signed 32-bit intermediate value to the signed 16-bit range and
 * yields the result as a 16-bit pattern.
 *
 * The range test biases the value by 0x8000 so that everything within
 * -0x8000..0x7fff lands in 0..0xffff; such values are returned truncated to
 * uint16_t.  Out-of-range values give 0x7fff plus the sign bit (bit 31) of the
 * input, i.e. 0x7fff for positive and 0x8000 for negative overflow.
 *
 * @note    The argument is expanded several times, so it must not have side
 *          effects.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) /* 0x7fff = INT16_MAX; 0x8000 = INT16_MIN; source bit 31 = sign */ \
      : (uint16_t)(a_iDword) )
8797
8798#ifdef IEM_WITHOUT_ASSEMBLY
8799
8800IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8801{
8802 RTUINT64U uSrc1 = { *puDst };
8803 RTUINT64U uSrc2 = { *puSrc };
8804 RTUINT64U uDst;
8805 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8806 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8807 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8808 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8809 *puDst = uDst.u;
8810}
8811
8812
8813IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8814{
8815 RTUINT128U uSrc1 = *puDst;
8816 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8817 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8818 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8819 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8820 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8821 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8822 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8823 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8824}
8825
8826#endif
8827
8828IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8829 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8830{
8831 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8832 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8833 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8834 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8835 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8836 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8837 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8838 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8839}
8840
8841IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8842 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8843{
8844 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8845 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8846 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8847 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8848 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8849 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8850 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8851 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8852 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8853 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8854 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8855 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8856 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8857 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8858 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8859 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8860}
8861
8862
8863/*
8864 * PADDUSW / VPADDUSW
8865 */
/**
 * Clamps an unsigned 32-bit intermediate value to the unsigned 16-bit range.
 *
 * Values up to and including 0xffff are returned unchanged (as uint16_t), any
 * larger value yields 0xffff.
 *
 * @note    The argument is expanded twice, so it must not have side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff /* 0xffff = UINT16_MAX */ \
      : (uint16_t)(a_uDword) )
8870
8871#ifdef IEM_WITHOUT_ASSEMBLY
8872
8873IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8874{
8875 RTUINT64U uSrc1 = { *puDst };
8876 RTUINT64U uSrc2 = { *puSrc };
8877 RTUINT64U uDst;
8878 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8879 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8880 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8881 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8882 *puDst = uDst.u;
8883}
8884
8885
8886IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8887{
8888 RTUINT128U uSrc1 = *puDst;
8889 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8890 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8891 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8892 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8893 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8894 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8895 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8896 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8897}
8898
8899#endif
8900
8901IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
8902 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8903{
8904 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8905 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8906 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8907 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8908 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8909 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8910 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8911 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8912}
8913
8914IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
8915 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8916{
8917 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8918 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8919 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8920 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8921 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8922 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8923 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8924 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8925 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
8926 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
8927 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
8928 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
8929 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
8930 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
8931 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
8932 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
8933}
8934
8935
8936/*
8937 * PADDD / VPADDD.
8938 */
8939#ifdef IEM_WITHOUT_ASSEMBLY
8940
8941IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8942{
8943 RTUINT64U uSrc1 = { *puDst };
8944 RTUINT64U uSrc2 = { *puSrc };
8945 RTUINT64U uDst;
8946 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8947 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8948 *puDst = uDst.u;
8949}
8950
8951
8952IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8953{
8954 RTUINT128U uSrc1 = *puDst;
8955 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8956 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8957 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8958 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8959}
8960
8961#endif /* IEM_WITHOUT_ASSEMBLY */
8962
8963IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8964{
8965 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8966 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8967 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8968 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8969}
8970
8971IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8972{
8973 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8974 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8975 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8976 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8977 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8978 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8979 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8980 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8981}
8982
8983
8984/*
8985 * PADDQ / VPADDQ.
8986 */
8987#ifdef IEM_WITHOUT_ASSEMBLY
8988
8989IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(uint64_t *puDst, uint64_t const *puSrc))
8990{
8991 *puDst = *puDst + *puSrc;
8992}
8993
8994IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8995{
8996 RTUINT128U uSrc1 = *puDst;
8997 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8998 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8999}
9000
9001#endif
9002
9003IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9004{
9005 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9006 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9007}
9008
9009IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9010{
9011 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9012 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9013 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9014 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9015}
9016
9017
9018/*
9019 * PSUBB / VPSUBB
9020 */
9021#ifdef IEM_WITHOUT_ASSEMBLY
9022
9023IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9024{
9025 RTUINT64U uSrc1 = { *puDst };
9026 RTUINT64U uSrc2 = { *puSrc };
9027 RTUINT64U uDst;
9028 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9029 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9030 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9031 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9032 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9033 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9034 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9035 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9036 *puDst = uDst.u;
9037}
9038
9039
9040IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9041{
9042 RTUINT128U uSrc1 = *puDst;
9043 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9044 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9045 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9046 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9047 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9048 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9049 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9050 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9051 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9052 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9053 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9054 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9055 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9056 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9057 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9058 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9059}
9060
9061#endif
9062
9063IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9064{
9065 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9066 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9067 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9068 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9069 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9070 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9071 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9072 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9073 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9074 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9075 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9076 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9077 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9078 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9079 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9080 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9081}
9082
9083IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9084{
9085 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9086 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9087 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9088 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9089 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9090 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9091 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9092 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9093 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9094 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9095 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9096 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9097 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9098 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9099 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9100 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9101 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9102 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9103 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9104 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9105 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9106 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9107 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9108 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9109 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9110 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9111 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9112 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9113 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9114 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9115 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9116 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9117}
9118
9119
/*
 * PSUBSB / VPSUBSB
 */
9123#ifdef IEM_WITHOUT_ASSEMBLY
9124
9125IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9126{
9127 RTUINT64U uSrc1 = { *puDst };
9128 RTUINT64U uSrc2 = { *puSrc };
9129 RTUINT64U uDst;
9130 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9131 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9132 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9133 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9134 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9135 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9136 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9137 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9138 *puDst = uDst.u;
9139}
9140
9141
9142IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9143{
9144 RTUINT128U uSrc1 = *puDst;
9145 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9146 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9147 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9148 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9149 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9150 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9151 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9152 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9153 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9154 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9155 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9156 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9157 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9158 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9159 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9160 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9161}
9162
9163#endif
9164
9165IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9166 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9167{
9168 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9169 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9170 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9171 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9172 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9173 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9174 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9175 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9176 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9177 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9178 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9179 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9180 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9181 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9182 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9183 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9184}
9185
9186IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9187 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9188{
9189 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9190 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9191 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9192 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9193 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9194 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9195 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9196 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9197 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9198 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9199 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9200 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9201 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9202 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9203 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9204 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9205 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9206 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9207 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9208 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9209 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9210 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9211 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9212 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9213 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9214 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9215 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9216 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9217 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9218 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9219 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9220 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9221}
9222
9223
/*
 * PSUBUSB / VPSUBUSB
 */
/**
 * Narrows the difference of two unsigned bytes to an unsigned byte.
 *
 * A value that does not fit in 0..0xff once truncated to uint16_t (which is
 * what a negative difference looks like) yields 0, everything else is passed
 * through unchanged.
 *
 * @note    The argument is evaluated twice.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9231
9232#ifdef IEM_WITHOUT_ASSEMBLY
9233
9234IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9235{
9236 RTUINT64U uSrc1 = { *puDst };
9237 RTUINT64U uSrc2 = { *puSrc };
9238 RTUINT64U uDst;
9239 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9240 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9241 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9242 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9243 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9244 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9245 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9246 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9247 *puDst = uDst.u;
9248}
9249
9250
9251IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9252{
9253 RTUINT128U uSrc1 = *puDst;
9254 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9255 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9256 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9257 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9258 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9259 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9260 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9261 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9262 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9263 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9264 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9265 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9266 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9267 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9268 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9269 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9270}
9271
9272#endif
9273
9274IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9275 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9276{
9277 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9278 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9279 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9280 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9281 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9282 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9283 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9284 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9285 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9286 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9287 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9288 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9289 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9290 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9291 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9292 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9293}
9294
9295IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9296 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9297{
9298 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9299 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9300 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9301 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9302 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9303 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9304 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9305 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9306 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9307 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9308 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9309 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9310 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9311 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9312 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9313 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9314 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9315 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9316 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9317 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9318 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9319 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9320 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9321 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9322 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9323 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9324 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9325 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9326 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9327 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9328 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9329 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9330}
9331
9332
9333/*
9334 * PSUBW / VPSUBW
9335 */
9336#ifdef IEM_WITHOUT_ASSEMBLY
9337
9338IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9339{
9340 RTUINT64U uSrc1 = { *puDst };
9341 RTUINT64U uSrc2 = { *puSrc };
9342 RTUINT64U uDst;
9343 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9344 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9345 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9346 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9347 *puDst = uDst.u;
9348}
9349
9350
9351IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9352{
9353 RTUINT128U uSrc1 = *puDst;
9354 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9355 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9356 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9357 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9358 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9359 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9360 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9361 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9362}
9363
9364#endif
9365
9366IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9367{
9368 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9369 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9370 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9371 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9372 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9373 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9374 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9375 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9376}
9377
9378IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9379{
9380 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9381 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9382 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9383 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9384 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9385 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9386 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9387 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9388 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9389 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9390 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9391 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9392 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9393 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9394 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9395 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9396}
9397
9398
9399/*
9400 * PSUBSW / VPSUBSW
9401 */
9402#ifdef IEM_WITHOUT_ASSEMBLY
9403
9404IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9405{
9406 RTUINT64U uSrc1 = { *puDst };
9407 RTUINT64U uSrc2 = { *puSrc };
9408 RTUINT64U uDst;
9409 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9410 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9411 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9412 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9413 *puDst = uDst.u;
9414}
9415
9416
9417IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9418{
9419 RTUINT128U uSrc1 = *puDst;
9420 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9421 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9422 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9423 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9424 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9425 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9426 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9427 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9428}
9429
9430#endif
9431
9432IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9433 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9434{
9435 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9436 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9437 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9438 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9439 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9440 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9441 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9442 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9443}
9444
9445IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9446 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9447{
9448 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9449 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9450 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9451 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9452 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9453 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9454 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9455 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9456 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9457 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9458 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9459 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9460 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9461 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9462 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9463 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9464}
9465
9466
9467/*
9468 * PSUBUSW / VPSUBUSW
9469 */
/**
 * Narrows the difference of two unsigned words to an unsigned word.
 *
 * A value that does not fit in 0..0xffff once converted to uint32_t (which is
 * what a negative difference looks like) yields 0, everything else is passed
 * through unchanged.
 *
 * @note    The argument is evaluated twice.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9474
9475#ifdef IEM_WITHOUT_ASSEMBLY
9476
9477IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9478{
9479 RTUINT64U uSrc1 = { *puDst };
9480 RTUINT64U uSrc2 = { *puSrc };
9481 RTUINT64U uDst;
9482 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9483 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9484 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9485 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9486 *puDst = uDst.u;
9487}
9488
9489
9490IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9491{
9492 RTUINT128U uSrc1 = *puDst;
9493 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9494 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9495 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9496 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9497 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9498 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9499 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9500 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9501}
9502
9503#endif
9504
9505IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9506 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9507{
9508 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9509 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9510 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9511 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9512 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9513 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9514 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9515 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9516}
9517
9518IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9519 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9520{
9521 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9522 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9523 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9524 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9525 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9526 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9527 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9528 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9529 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9530 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9531 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9532 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9533 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9534 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9535 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9536 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9537}
9538
9539
9540
9541/*
9542 * PSUBD / VPSUBD.
9543 */
9544#ifdef IEM_WITHOUT_ASSEMBLY
9545
9546IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9547{
9548 RTUINT64U uSrc1 = { *puDst };
9549 RTUINT64U uSrc2 = { *puSrc };
9550 RTUINT64U uDst;
9551 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9552 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9553 *puDst = uDst.u;
9554}
9555
9556
9557IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9558{
9559 RTUINT128U uSrc1 = *puDst;
9560 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9561 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9562 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9563 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9564}
9565
9566#endif /* IEM_WITHOUT_ASSEMBLY */
9567
9568IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9569{
9570 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9571 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9572 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9573 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9574}
9575
9576IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9577{
9578 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9579 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9580 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9581 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9582 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9583 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9584 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9585 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9586}
9587
9588
9589/*
9590 * PSUBQ / VPSUBQ.
9591 */
9592#ifdef IEM_WITHOUT_ASSEMBLY
9593
9594IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9595{
9596 *puDst = *puDst - *puSrc;
9597}
9598
9599IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9600{
9601 RTUINT128U uSrc1 = *puDst;
9602 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9603 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9604}
9605
9606#endif
9607
9608IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9609{
9610 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9611 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9612}
9613
9614IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9615{
9616 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9617 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9618 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9619 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9620}
9621
9622
9623
9624/*
9625 * PMULLW / VPMULLW / PMULLD / VPMULLD
9626 */
9627#ifdef IEM_WITHOUT_ASSEMBLY
9628
9629IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9630{
9631 RTUINT64U uSrc1 = { *puDst };
9632 RTUINT64U uSrc2 = { *puSrc };
9633 RTUINT64U uDst;
9634 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9635 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9636 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9637 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9638 *puDst = uDst.u;
9639}
9640
9641
9642IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9643{
9644 RTUINT128U uSrc1 = *puDst;
9645 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9646 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9647 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9648 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9649 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9650 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9651 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9652 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9653}
9654
9655#endif
9656
9657IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9658{
9659 RTUINT128U uSrc1 = *puDst;
9660
9661 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9662 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9663 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9664 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9665}
9666
9667
9668IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9669{
9670 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9671 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9672 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9673 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9674 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9675 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9676 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9677 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9678}
9679
9680
9681IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9682{
9683 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9684 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9685 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9686 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9687 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9688 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9689 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9690 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9691 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9692 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9693 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9694 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9695 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9696 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9697 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9698 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9699}
9700
9701
9702IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9703{
9704 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9705 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9706 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9707 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9708}
9709
9710
9711IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9712{
9713 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9714 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9715 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9716 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9717 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9718 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9719 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9720 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9721}
9722
9723
9724/*
9725 * PMULHW / VPMULHW
9726 */
9727#ifdef IEM_WITHOUT_ASSEMBLY
9728
9729IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9730{
9731 RTUINT64U uSrc1 = { *puDst };
9732 RTUINT64U uSrc2 = { *puSrc };
9733 RTUINT64U uDst;
9734 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9735 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9736 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9737 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9738 *puDst = uDst.u;
9739}
9740
9741
9742IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9743{
9744 RTUINT128U uSrc1 = *puDst;
9745 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9746 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9747 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9748 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9749 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9750 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9751 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9752 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9753}
9754
9755#endif
9756
9757IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9758{
9759 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9760 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9761 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9762 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9763 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9764 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9765 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9766 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9767}
9768
9769
9770IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9771{
9772 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9773 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9774 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9775 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9776 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9777 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9778 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9779 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9780 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9781 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9782 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9783 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9784 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9785 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9786 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9787 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9788}
9789
9790
9791/*
9792 * PMULHUW / VPMULHUW
9793 */
9794#ifdef IEM_WITHOUT_ASSEMBLY
9795
9796IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9797{
9798 RTUINT64U uSrc1 = { *puDst };
9799 RTUINT64U uSrc2 = { *puSrc };
9800 RTUINT64U uDst;
9801 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9802 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9803 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9804 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9805 *puDst = uDst.u;
9806}
9807
9808
9809IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9810{
9811 RTUINT128U uSrc1 = *puDst;
9812 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9813 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9814 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9815 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9816 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9817 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9818 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9819 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9820}
9821
9822#endif
9823
9824IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9825{
9826 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9827 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9828 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9829 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9830 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9831 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9832 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9833 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9834}
9835
9836
9837IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9838{
9839 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9840 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9841 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9842 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9843 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9844 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9845 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9846 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9847 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9848 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9849 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9850 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9851 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9852 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9853 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9854 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9855}
9856
9857
9858/*
9859 * PSRLW / VPSRLW
9860 */
9861#ifdef IEM_WITHOUT_ASSEMBLY
9862
9863IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9864{
9865 RTUINT64U uSrc1 = { *puDst };
9866 RTUINT64U uSrc2 = { *puSrc };
9867 RTUINT64U uDst;
9868
9869 if (uSrc2.au64[0] <= 15)
9870 {
9871 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9872 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9873 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9874 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9875 }
9876 else
9877 {
9878 uDst.au64[0] = 0;
9879 }
9880 *puDst = uDst.u;
9881}
9882
9883
9884IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9885{
9886 RTUINT64U uSrc1 = { *puDst };
9887 RTUINT64U uDst;
9888
9889 if (uShift <= 15)
9890 {
9891 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9892 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9893 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9894 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9895 }
9896 else
9897 {
9898 uDst.au64[0] = 0;
9899 }
9900 *puDst = uDst.u;
9901}
9902
9903
9904IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9905{
9906 RTUINT128U uSrc1 = *puDst;
9907
9908 if (puSrc->au64[0] <= 15)
9909 {
9910 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9911 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9912 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9913 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9914 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9915 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9916 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9917 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9918 }
9919 else
9920 {
9921 puDst->au64[0] = 0;
9922 puDst->au64[1] = 0;
9923 }
9924}
9925
9926IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9927{
9928 RTUINT128U uSrc1 = *puDst;
9929
9930 if (uShift <= 15)
9931 {
9932 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9933 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9934 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9935 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9936 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9937 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9938 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9939 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9940 }
9941 else
9942 {
9943 puDst->au64[0] = 0;
9944 puDst->au64[1] = 0;
9945 }
9946}
9947
9948#endif
9949
9950IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
9951{
9952 RTUINT128U uSrc1 = *puSrc1;
9953
9954 if (uShift <= 15)
9955 {
9956 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9957 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9958 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9959 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9960 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9961 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9962 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9963 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9964 }
9965 else
9966 {
9967 puDst->au64[0] = 0;
9968 puDst->au64[1] = 0;
9969 }
9970}
9971
9972IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9973{
9974 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
9975}
9976
9977IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
9978{
9979 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
9980}
9981
9982IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
9983{
9984 RTUINT256U uSrc1 = *puSrc1;
9985
9986 if (uShift <= 15)
9987 {
9988 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9989 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9990 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9991 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9992 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9993 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9994 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9995 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9996 puDst->au16[8] = uSrc1.au16[8] >> uShift;
9997 puDst->au16[9] = uSrc1.au16[9] >> uShift;
9998 puDst->au16[10] = uSrc1.au16[10] >> uShift;
9999 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10000 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10001 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10002 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10003 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10004 }
10005 else
10006 {
10007 puDst->au64[0] = 0;
10008 puDst->au64[1] = 0;
10009 puDst->au64[2] = 0;
10010 puDst->au64[3] = 0;
10011 }
10012}
10013
10014IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10015{
10016 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10017}
10018
10019IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10020{
10021 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10022}
10023
10024
10025/*
10026 * PSRAW / VPSRAW
10027 */
10028#ifdef IEM_WITHOUT_ASSEMBLY
10029
10030IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10031{
10032 RTUINT64U uSrc1 = { *puDst };
10033 RTUINT64U uSrc2 = { *puSrc };
10034 RTUINT64U uDst;
10035 uint8_t uShift;
10036
10037 uShift = RT_MIN(15, uSrc2.au64[0]);
10038
10039 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10040 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10041 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10042 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10043
10044 *puDst = uDst.u;
10045}
10046
10047
10048IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10049{
10050 RTUINT64U uSrc1 = { *puDst };
10051 RTUINT64U uDst;
10052
10053 uShift = RT_MIN(15, uShift);
10054
10055 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10056 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10057 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10058 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10059
10060 *puDst = uDst.u;
10061}
10062
10063
10064IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10065{
10066 RTUINT128U uSrc1 = *puDst;
10067 uint8_t uShift;
10068
10069 uShift = RT_MIN(15, puSrc->au64[0]);
10070
10071 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10072 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10073 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10074 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10075 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10076 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10077 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10078 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10079}
10080
10081IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10082{
10083 RTUINT128U uSrc1 = *puDst;
10084
10085 uShift = RT_MIN(15, uShift);
10086
10087 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10088 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10089 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10090 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10091 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10092 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10093 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10094 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10095}
10096
10097#endif
10098
10099IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10100{
10101 RTUINT128U uSrc1 = *puSrc1;
10102
10103 uShift = RT_MIN(15, uShift);
10104
10105 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10106 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10107 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10108 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10109 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10110 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10111 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10112 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10113}
10114
10115IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10116{
10117 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10118}
10119
10120IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10121{
10122 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10123}
10124
10125IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10126{
10127 RTUINT256U uSrc1 = *puSrc1;
10128
10129 uShift = RT_MIN(15, uShift);
10130
10131 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10132 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10133 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10134 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10135 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10136 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10137 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10138 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10139 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10140 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10141 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10142 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10143 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10144 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10145 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10146 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10147}
10148
10149IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10150{
10151 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10152}
10153
10154IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10155{
10156 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10157}
10158
10159
10160/*
10161 * PSLLW / VPSLLW
10162 */
10163#ifdef IEM_WITHOUT_ASSEMBLY
10164
10165IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10166{
10167 RTUINT64U uSrc1 = { *puDst };
10168 RTUINT64U uSrc2 = { *puSrc };
10169 RTUINT64U uDst;
10170
10171 if (uSrc2.au64[0] <= 15)
10172 {
10173 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10174 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10175 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10176 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10177 }
10178 else
10179 {
10180 uDst.au64[0] = 0;
10181 }
10182 *puDst = uDst.u;
10183}
10184
10185
10186IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10187{
10188 RTUINT64U uSrc1 = { *puDst };
10189 RTUINT64U uDst;
10190
10191 if (uShift <= 15)
10192 {
10193 uDst.au16[0] = uSrc1.au16[0] << uShift;
10194 uDst.au16[1] = uSrc1.au16[1] << uShift;
10195 uDst.au16[2] = uSrc1.au16[2] << uShift;
10196 uDst.au16[3] = uSrc1.au16[3] << uShift;
10197 }
10198 else
10199 {
10200 uDst.au64[0] = 0;
10201 }
10202 *puDst = uDst.u;
10203}
10204
10205
10206IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10207{
10208 RTUINT128U uSrc1 = *puDst;
10209
10210 if (puSrc->au64[0] <= 15)
10211 {
10212 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10213 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10214 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10215 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10216 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10217 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10218 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10219 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10220 }
10221 else
10222 {
10223 puDst->au64[0] = 0;
10224 puDst->au64[1] = 0;
10225 }
10226}
10227
10228IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10229{
10230 RTUINT128U uSrc1 = *puDst;
10231
10232 if (uShift <= 15)
10233 {
10234 puDst->au16[0] = uSrc1.au16[0] << uShift;
10235 puDst->au16[1] = uSrc1.au16[1] << uShift;
10236 puDst->au16[2] = uSrc1.au16[2] << uShift;
10237 puDst->au16[3] = uSrc1.au16[3] << uShift;
10238 puDst->au16[4] = uSrc1.au16[4] << uShift;
10239 puDst->au16[5] = uSrc1.au16[5] << uShift;
10240 puDst->au16[6] = uSrc1.au16[6] << uShift;
10241 puDst->au16[7] = uSrc1.au16[7] << uShift;
10242 }
10243 else
10244 {
10245 puDst->au64[0] = 0;
10246 puDst->au64[1] = 0;
10247 }
10248}
10249
10250#endif
10251
10252IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10253{
10254 RTUINT128U uSrc1 = *puSrc1;
10255
10256 if (uShift <= 15)
10257 {
10258 puDst->au16[0] = uSrc1.au16[0] << uShift;
10259 puDst->au16[1] = uSrc1.au16[1] << uShift;
10260 puDst->au16[2] = uSrc1.au16[2] << uShift;
10261 puDst->au16[3] = uSrc1.au16[3] << uShift;
10262 puDst->au16[4] = uSrc1.au16[4] << uShift;
10263 puDst->au16[5] = uSrc1.au16[5] << uShift;
10264 puDst->au16[6] = uSrc1.au16[6] << uShift;
10265 puDst->au16[7] = uSrc1.au16[7] << uShift;
10266 }
10267 else
10268 {
10269 puDst->au64[0] = 0;
10270 puDst->au64[1] = 0;
10271 }
10272}
10273
10274IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10275{
10276 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10277}
10278
10279IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10280{
10281 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10282}
10283
10284IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10285{
10286 RTUINT256U uSrc1 = *puSrc1;
10287
10288 if (uShift <= 15)
10289 {
10290 puDst->au16[0] = uSrc1.au16[0] << uShift;
10291 puDst->au16[1] = uSrc1.au16[1] << uShift;
10292 puDst->au16[2] = uSrc1.au16[2] << uShift;
10293 puDst->au16[3] = uSrc1.au16[3] << uShift;
10294 puDst->au16[4] = uSrc1.au16[4] << uShift;
10295 puDst->au16[5] = uSrc1.au16[5] << uShift;
10296 puDst->au16[6] = uSrc1.au16[6] << uShift;
10297 puDst->au16[7] = uSrc1.au16[7] << uShift;
10298 puDst->au16[8] = uSrc1.au16[8] << uShift;
10299 puDst->au16[9] = uSrc1.au16[9] << uShift;
10300 puDst->au16[10] = uSrc1.au16[10] << uShift;
10301 puDst->au16[11] = uSrc1.au16[11] << uShift;
10302 puDst->au16[12] = uSrc1.au16[12] << uShift;
10303 puDst->au16[13] = uSrc1.au16[13] << uShift;
10304 puDst->au16[14] = uSrc1.au16[14] << uShift;
10305 puDst->au16[15] = uSrc1.au16[15] << uShift;
10306 }
10307 else
10308 {
10309 puDst->au64[0] = 0;
10310 puDst->au64[1] = 0;
10311 puDst->au64[2] = 0;
10312 puDst->au64[3] = 0;
10313 }
10314}
10315
10316IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10317{
10318 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10319}
10320
10321IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10322{
10323 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10324}
10325
10326/*
10327 * PSRLD / VPSRLD
10328 */
10329#ifdef IEM_WITHOUT_ASSEMBLY
10330
10331IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10332{
10333 RTUINT64U uSrc1 = { *puDst };
10334 RTUINT64U uSrc2 = { *puSrc };
10335 RTUINT64U uDst;
10336
10337 if (uSrc2.au64[0] <= 31)
10338 {
10339 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10340 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10341 }
10342 else
10343 {
10344 uDst.au64[0] = 0;
10345 }
10346 *puDst = uDst.u;
10347}
10348
10349
10350IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10351{
10352 RTUINT64U uSrc1 = { *puDst };
10353 RTUINT64U uDst;
10354
10355 if (uShift <= 31)
10356 {
10357 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10358 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10359 }
10360 else
10361 {
10362 uDst.au64[0] = 0;
10363 }
10364 *puDst = uDst.u;
10365}
10366
10367
10368IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10369{
10370 RTUINT128U uSrc1 = *puDst;
10371
10372 if (puSrc->au64[0] <= 31)
10373 {
10374 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10375 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10376 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10377 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10378 }
10379 else
10380 {
10381 puDst->au64[0] = 0;
10382 puDst->au64[1] = 0;
10383 }
10384}
10385
10386IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10387{
10388 RTUINT128U uSrc1 = *puDst;
10389
10390 if (uShift <= 31)
10391 {
10392 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10393 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10394 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10395 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10396 }
10397 else
10398 {
10399 puDst->au64[0] = 0;
10400 puDst->au64[1] = 0;
10401 }
10402}
10403
10404#endif
10405
10406IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10407{
10408 RTUINT128U uSrc1 = *puSrc1;
10409
10410 if (uShift <= 31)
10411 {
10412 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10413 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10414 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10415 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10416 }
10417 else
10418 {
10419 puDst->au64[0] = 0;
10420 puDst->au64[1] = 0;
10421 }
10422}
10423
10424IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10425{
10426 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10427}
10428
10429IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10430{
10431 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10432}
10433
10434IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10435{
10436 RTUINT256U uSrc1 = *puSrc1;
10437
10438 if (uShift <= 31)
10439 {
10440 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10441 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10442 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10443 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10444 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10445 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10446 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10447 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10448 }
10449 else
10450 {
10451 puDst->au64[0] = 0;
10452 puDst->au64[1] = 0;
10453 puDst->au64[2] = 0;
10454 puDst->au64[3] = 0;
10455 }
10456}
10457
10458IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10459{
10460 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10461}
10462
10463IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10464{
10465 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10466}
10467
10468
10469/*
10470 * PSRAD / VPSRAD
10471 */
10472#ifdef IEM_WITHOUT_ASSEMBLY
10473
10474IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10475{
10476 RTUINT64U uSrc1 = { *puDst };
10477 RTUINT64U uSrc2 = { *puSrc };
10478 RTUINT64U uDst;
10479 uint8_t uShift;
10480
10481 uShift = RT_MIN(31, uSrc2.au64[0]);
10482
10483 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10484 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10485
10486 *puDst = uDst.u;
10487}
10488
10489
10490IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10491{
10492 RTUINT64U uSrc1 = { *puDst };
10493 RTUINT64U uDst;
10494
10495 uShift = RT_MIN(31, uShift);
10496
10497 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10498 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10499
10500 *puDst = uDst.u;
10501}
10502
10503
10504IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10505{
10506 RTUINT128U uSrc1 = *puDst;
10507 uint8_t uShift;
10508
10509 uShift = RT_MIN(31, puSrc->au64[0]);
10510
10511 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10512 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10513 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10514 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10515}
10516
10517IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10518{
10519 RTUINT128U uSrc1 = *puDst;
10520
10521 uShift = RT_MIN(31, uShift);
10522
10523 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10524 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10525 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10526 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10527}
10528
10529#endif
10530
10531IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10532{
10533 RTUINT128U uSrc1 = *puSrc1;
10534
10535 uShift = RT_MIN(31, uShift);
10536
10537 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10538 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10539 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10540 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10541}
10542
10543IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10544{
10545 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10546}
10547
10548IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10549{
10550 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10551}
10552
10553IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10554{
10555 RTUINT256U uSrc1 = *puSrc1;
10556
10557 uShift = RT_MIN(31, uShift);
10558
10559 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10560 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10561 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10562 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10563 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10564 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10565 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10566 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10567}
10568
10569IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10570{
10571 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10572}
10573
10574IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10575{
10576 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10577}
10578
10579
10580/*
10581 * PSLLD / VPSLLD
10582 */
10583#ifdef IEM_WITHOUT_ASSEMBLY
10584
10585IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10586{
10587 RTUINT64U uSrc1 = { *puDst };
10588 RTUINT64U uSrc2 = { *puSrc };
10589 RTUINT64U uDst;
10590
10591 if (uSrc2.au64[0] <= 31)
10592 {
10593 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10594 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10595 }
10596 else
10597 {
10598 uDst.au64[0] = 0;
10599 }
10600 *puDst = uDst.u;
10601}
10602
10603
10604IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10605{
10606 RTUINT64U uSrc1 = { *puDst };
10607 RTUINT64U uDst;
10608
10609 if (uShift <= 31)
10610 {
10611 uDst.au32[0] = uSrc1.au32[0] << uShift;
10612 uDst.au32[1] = uSrc1.au32[1] << uShift;
10613 }
10614 else
10615 {
10616 uDst.au64[0] = 0;
10617 }
10618 *puDst = uDst.u;
10619}
10620
10621
10622IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10623{
10624 RTUINT128U uSrc1 = *puDst;
10625
10626 if (puSrc->au64[0] <= 31)
10627 {
10628 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10629 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10630 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10631 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10632 }
10633 else
10634 {
10635 puDst->au64[0] = 0;
10636 puDst->au64[1] = 0;
10637 }
10638}
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10641{
10642 RTUINT128U uSrc1 = *puDst;
10643
10644 if (uShift <= 31)
10645 {
10646 puDst->au32[0] = uSrc1.au32[0] << uShift;
10647 puDst->au32[1] = uSrc1.au32[1] << uShift;
10648 puDst->au32[2] = uSrc1.au32[2] << uShift;
10649 puDst->au32[3] = uSrc1.au32[3] << uShift;
10650 }
10651 else
10652 {
10653 puDst->au64[0] = 0;
10654 puDst->au64[1] = 0;
10655 }
10656}
10657
10658#endif
10659
10660IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10661{
10662 RTUINT128U uSrc1 = *puSrc1;
10663
10664 if (uShift <= 31)
10665 {
10666 puDst->au32[0] = uSrc1.au32[0] << uShift;
10667 puDst->au32[1] = uSrc1.au32[1] << uShift;
10668 puDst->au32[2] = uSrc1.au32[2] << uShift;
10669 puDst->au32[3] = uSrc1.au32[3] << uShift;
10670 }
10671 else
10672 {
10673 puDst->au64[0] = 0;
10674 puDst->au64[1] = 0;
10675 }
10676}
10677
10678IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10679{
10680 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10681}
10682
10683IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10684{
10685 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10686}
10687
10688IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10689{
10690 RTUINT256U uSrc1 = *puSrc1;
10691
10692 if (uShift <= 31)
10693 {
10694 puDst->au32[0] = uSrc1.au32[0] << uShift;
10695 puDst->au32[1] = uSrc1.au32[1] << uShift;
10696 puDst->au32[2] = uSrc1.au32[2] << uShift;
10697 puDst->au32[3] = uSrc1.au32[3] << uShift;
10698 puDst->au32[4] = uSrc1.au32[4] << uShift;
10699 puDst->au32[5] = uSrc1.au32[5] << uShift;
10700 puDst->au32[6] = uSrc1.au32[6] << uShift;
10701 puDst->au32[7] = uSrc1.au32[7] << uShift;
10702 }
10703 else
10704 {
10705 puDst->au64[0] = 0;
10706 puDst->au64[1] = 0;
10707 puDst->au64[2] = 0;
10708 puDst->au64[3] = 0;
10709 }
10710}
10711
10712IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10713{
10714 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10715}
10716
10717IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10718{
10719 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10720}
10721
10722
10723/*
10724 * PSRLQ / VPSRLQ
10725 */
10726#ifdef IEM_WITHOUT_ASSEMBLY
10727
10728IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10729{
10730 RTUINT64U uSrc1 = { *puDst };
10731 RTUINT64U uSrc2 = { *puSrc };
10732 RTUINT64U uDst;
10733
10734 if (uSrc2.au64[0] <= 63)
10735 {
10736 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10737 }
10738 else
10739 {
10740 uDst.au64[0] = 0;
10741 }
10742 *puDst = uDst.u;
10743}
10744
10745
10746IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10747{
10748 RTUINT64U uSrc1 = { *puDst };
10749 RTUINT64U uDst;
10750
10751 if (uShift <= 63)
10752 {
10753 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10754 }
10755 else
10756 {
10757 uDst.au64[0] = 0;
10758 }
10759 *puDst = uDst.u;
10760}
10761
10762
10763IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10764{
10765 RTUINT128U uSrc1 = *puDst;
10766
10767 if (puSrc->au64[0] <= 63)
10768 {
10769 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10770 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10771 }
10772 else
10773 {
10774 puDst->au64[0] = 0;
10775 puDst->au64[1] = 0;
10776 }
10777}
10778
10779IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10780{
10781 RTUINT128U uSrc1 = *puDst;
10782
10783 if (uShift <= 63)
10784 {
10785 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10786 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10787 }
10788 else
10789 {
10790 puDst->au64[0] = 0;
10791 puDst->au64[1] = 0;
10792 }
10793}
10794
10795#endif
10796
10797IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10798{
10799 RTUINT128U uSrc1 = *puSrc1;
10800
10801 if (uShift <= 63)
10802 {
10803 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10804 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10805 }
10806 else
10807 {
10808 puDst->au64[0] = 0;
10809 puDst->au64[1] = 0;
10810 }
10811}
10812
10813IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10814{
10815 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
10816}
10817
10818IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10819{
10820 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10821}
10822
10823IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10824{
10825 RTUINT256U uSrc1 = *puSrc1;
10826
10827 if (uShift <= 63)
10828 {
10829 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10830 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10831 puDst->au64[2] = uSrc1.au64[2] >> uShift;
10832 puDst->au64[3] = uSrc1.au64[3] >> uShift;
10833 }
10834 else
10835 {
10836 puDst->au64[0] = 0;
10837 puDst->au64[1] = 0;
10838 puDst->au64[2] = 0;
10839 puDst->au64[3] = 0;
10840 }
10841}
10842
10843IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10844{
10845 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10846}
10847
10848IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10849{
10850 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
10851}
10852
10853
10854/*
10855 * PSLLQ / VPSLLQ
10856 */
10857#ifdef IEM_WITHOUT_ASSEMBLY
10858
10859IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10860{
10861 RTUINT64U uSrc1 = { *puDst };
10862 RTUINT64U uSrc2 = { *puSrc };
10863 RTUINT64U uDst;
10864
10865 if (uSrc2.au64[0] <= 63)
10866 {
10867 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10868 }
10869 else
10870 {
10871 uDst.au64[0] = 0;
10872 }
10873 *puDst = uDst.u;
10874}
10875
10876
10877IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10878{
10879 RTUINT64U uSrc1 = { *puDst };
10880 RTUINT64U uDst;
10881
10882 if (uShift <= 63)
10883 {
10884 uDst.au64[0] = uSrc1.au64[0] << uShift;
10885 }
10886 else
10887 {
10888 uDst.au64[0] = 0;
10889 }
10890 *puDst = uDst.u;
10891}
10892
10893
10894IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10895{
10896 RTUINT128U uSrc1 = *puDst;
10897
10898 if (puSrc->au64[0] <= 63)
10899 {
10900 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10901 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10902 }
10903 else
10904 {
10905 puDst->au64[0] = 0;
10906 puDst->au64[1] = 0;
10907 }
10908}
10909
10910IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10911{
10912 RTUINT128U uSrc1 = *puDst;
10913
10914 if (uShift <= 63)
10915 {
10916 puDst->au64[0] = uSrc1.au64[0] << uShift;
10917 puDst->au64[1] = uSrc1.au64[1] << uShift;
10918 }
10919 else
10920 {
10921 puDst->au64[0] = 0;
10922 puDst->au64[1] = 0;
10923 }
10924}
10925
10926#endif
10927
10928IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10929{
10930 RTUINT128U uSrc1 = *puSrc1;
10931
10932 if (uShift <= 63)
10933 {
10934 puDst->au64[0] = uSrc1.au64[0] << uShift;
10935 puDst->au64[1] = uSrc1.au64[1] << uShift;
10936 }
10937 else
10938 {
10939 puDst->au64[0] = 0;
10940 puDst->au64[1] = 0;
10941 }
10942}
10943
10944IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10945{
10946 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10947}
10948
10949IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10950{
10951 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
10952}
10953
10954IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10955{
10956 RTUINT256U uSrc1 = *puSrc1;
10957
10958 if (uShift <= 63)
10959 {
10960 puDst->au64[0] = uSrc1.au64[0] << uShift;
10961 puDst->au64[1] = uSrc1.au64[1] << uShift;
10962 puDst->au64[2] = uSrc1.au64[2] << uShift;
10963 puDst->au64[3] = uSrc1.au64[3] << uShift;
10964 }
10965 else
10966 {
10967 puDst->au64[0] = 0;
10968 puDst->au64[1] = 0;
10969 puDst->au64[2] = 0;
10970 puDst->au64[3] = 0;
10971 }
10972}
10973
10974IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10975{
10976 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10977}
10978
10979IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10980{
10981 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
10982}
10983
10984
10985/*
10986 * PSRLDQ / VPSRLDQ
10987 */
10988#ifdef IEM_WITHOUT_ASSEMBLY
10989
10990IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10991{
10992 if (uShift < 16)
10993 {
10994 RTUINT128U uSrc1 = *puDst;
10995 int i;
10996
10997 for (i = 0; i < 16 - uShift; ++i)
10998 puDst->au8[i] = uSrc1.au8[i + uShift];
10999 for (i = 16 - uShift; i < 16; ++i)
11000 puDst->au8[i] = 0;
11001 }
11002 else
11003 {
11004 puDst->au64[0] = 0;
11005 puDst->au64[1] = 0;
11006 }
11007}
11008
11009IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11010{
11011 if (uShift < 16)
11012 {
11013 RTUINT128U uSrc1 = *puSrc;
11014 int i;
11015
11016 for (i = 0; i < 16 - uShift; ++i)
11017 puDst->au8[i] = uSrc1.au8[i + uShift];
11018 for (i = 16 - uShift; i < 16; ++i)
11019 puDst->au8[i] = 0;
11020 }
11021 else
11022 {
11023 puDst->au64[0] = 0;
11024 puDst->au64[1] = 0;
11025 }
11026}
11027
11028IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11029{
11030 iemAImpl_vpsrldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11031 iemAImpl_vpsrldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11032}
11033#endif
11034
11035IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11036{
11037 if (uShift < 16)
11038 {
11039 RTUINT128U uSrc1 = *puSrc;
11040 int i;
11041
11042 for (i = 0; i < 16 - uShift; ++i)
11043 puDst->au8[i] = uSrc1.au8[i + uShift];
11044 for (i = 16 - uShift; i < 16; ++i)
11045 puDst->au8[i] = 0;
11046 }
11047 else
11048 {
11049 puDst->au64[0] = 0;
11050 puDst->au64[1] = 0;
11051 }
11052}
11053
11054IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11055{
11056 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11057 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11058}
11059
11060
11061/*
11062 * PSLLDQ / VPSLLDQ
11063 */
11064#ifdef IEM_WITHOUT_ASSEMBLY
11065
11066IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11067{
11068 if (uShift < 16)
11069 {
11070 RTUINT128U uSrc1 = *puDst;
11071 int i;
11072
11073 for (i = 0; i < uShift; ++i)
11074 puDst->au8[i] = 0;
11075 for (i = uShift; i < 16; ++i)
11076 puDst->au8[i] = uSrc1.au8[i - uShift];
11077 }
11078 else
11079 {
11080 puDst->au64[0] = 0;
11081 puDst->au64[1] = 0;
11082 }
11083}
11084
11085IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11086{
11087 if (uShift < 16)
11088 {
11089 RTUINT128U uSrc1 = *puSrc;
11090 int i;
11091
11092 for (i = 0; i < uShift; ++i)
11093 puDst->au8[i] = 0;
11094 for (i = uShift; i < 16; ++i)
11095 puDst->au8[i] = uSrc1.au8[i - uShift];
11096 }
11097 else
11098 {
11099 puDst->au64[0] = 0;
11100 puDst->au64[1] = 0;
11101 }
11102}
11103
11104IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11105{
11106 iemAImpl_vpslldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11107 iemAImpl_vpslldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11108}
11109
11110#endif
11111
11112IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11113{
11114 if (uShift < 16)
11115 {
11116 RTUINT128U uSrc1 = *puSrc;
11117 int i;
11118
11119 for (i = 0; i < uShift; ++i)
11120 puDst->au8[i] = 0;
11121 for (i = uShift; i < 16; ++i)
11122 puDst->au8[i] = uSrc1.au8[i - uShift];
11123 }
11124 else
11125 {
11126 puDst->au64[0] = 0;
11127 puDst->au64[1] = 0;
11128 }
11129}
11130
11131IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11132{
11133 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11134 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11135}
11136
11137
11138/*
11139 * VPSRLVD
11140 */
11141IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11142{
11143 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11144 {
11145 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11146 }
11147}
11148
11149IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11150{
11151 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11152 {
11153 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11154 }
11155}
11156
11157
11158/*
11159 * VPSRAVD
11160 */
11161IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11162{
11163 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11164 {
11165 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11166 }
11167}
11168
11169IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11170{
11171 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11172 {
11173 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11174 }
11175}
11176
11177
11178/*
11179 * VPSLLVD
11180 */
11181IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11182{
11183 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11184 {
11185 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11186 }
11187}
11188
11189IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11190{
11191 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11192 {
11193 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11194 }
11195}
11196
11197
11198/*
11199 * VPSRLVQ
11200 */
11201IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11202{
11203 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11204 {
11205 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11206 }
11207}
11208
11209IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11210{
11211 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11212 {
11213 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11214 }
11215}
11216
11217
11218/*
11219 * VPSLLVQ
11220 */
11221IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11222{
11223 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11224 {
11225 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11226 }
11227}
11228
11229IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11230{
11231 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11232 {
11233 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11234 }
11235}
11236
11237
11238/*
11239 * PMADDWD / VPMADDWD
11240 */
11241#ifdef IEM_WITHOUT_ASSEMBLY
11242
11243IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11244{
11245 RTUINT64U uSrc1 = { *puDst };
11246 RTUINT64U uSrc2 = { *puSrc };
11247 RTUINT64U uDst;
11248
11249 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11250 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11251 *puDst = uDst.u;
11252}
11253
11254
11255IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11256{
11257 RTUINT128U uSrc1 = *puDst;
11258
11259 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11260 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11261 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11262 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11263}
11264
11265#endif
11266
11267
11268IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
11269{
11270 RTUINT64U uSrc1 = { *puDst };
11271 RTUINT64U uSrc2 = { *puSrc };
11272 RTUINT64U uDst;
11273
11274 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11275 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11276 *puDst = uDst.u;
11277}
11278
11279
11280IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11281{
11282 RTUINT128U uSrc1 = *puDst;
11283
11284 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11285 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11286 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11287 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11288}
11289
11290
11291IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11292{
11293 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11294 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11295 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11296 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11297}
11298
11299
11300IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11301{
11302 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11303 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11304 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11305 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11306 puDst->ai32[4] = (int32_t)puSrc1->ai16[8] * puSrc2->ai16[8] + (int32_t)puSrc1->ai16[9] * puSrc2->ai16[9];
11307 puDst->ai32[5] = (int32_t)puSrc1->ai16[10] * puSrc2->ai16[10] + (int32_t)puSrc1->ai16[11] * puSrc2->ai16[11];
11308 puDst->ai32[6] = (int32_t)puSrc1->ai16[12] * puSrc2->ai16[12] + (int32_t)puSrc1->ai16[13] * puSrc2->ai16[13];
11309 puDst->ai32[7] = (int32_t)puSrc1->ai16[14] * puSrc2->ai16[14] + (int32_t)puSrc1->ai16[15] * puSrc2->ai16[15];
11310}
11311
11312
11313/*
11314 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11315 */
11316#ifdef IEM_WITHOUT_ASSEMBLY
11317
11318IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11319{
11320 RTUINT64U uSrc1 = { *puDst };
11321 RTUINT64U uSrc2 = { *puSrc };
11322 RTUINT64U uDst;
11323
11324 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11325 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11326 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11327 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11328 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11329 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11330 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11331 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11332 *puDst = uDst.u;
11333}
11334
11335
11336IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11337{
11338 RTUINT128U uSrc1 = *puDst;
11339
11340 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11341 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11342 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11343 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11344 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11345 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11346 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11347 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11348 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11349 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11350 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11351 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11352 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11353 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11354 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11355 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11356}
11357
11358#endif
11359
11360
11361IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11362{
11363 RTUINT128U uSrc1 = *puDst;
11364
11365 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11366 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11367 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11368 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11369 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11370 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11371 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11372 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11373}
11374
11375
11376IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11377{
11378 RTUINT128U uSrc1 = *puDst;
11379
11380 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11381 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11382 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11383 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11384}
11385
11386
11387IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11388{
11389 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11390 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11391 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11392 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11393 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11394 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11395 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11396 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11397 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11398 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11399 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11400 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11401 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11402 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11403 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11404 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11405}
11406
11407
11408IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11409{
11410 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11411 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11412 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11413 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11414 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11415 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11416 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11417 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11418 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11419 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11420 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11421 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11422 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11423 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11424 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11425 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11426 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11427 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11428 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11429 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11430 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11431 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11432 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11433 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11434 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11435 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11436 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11437 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11438 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11439 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11440 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11441 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11442}
11443
11444
11445IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11446{
11447 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11448 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11449 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11450 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11451 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11452 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11453 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11454 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11455}
11456
11457
11458IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11459{
11460 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11461 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11462 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11463 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11464 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11465 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11466 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11467 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11468 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11469 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11470 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11471 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11472 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11473 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11474 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11475 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11476}
11477
11478
11479IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11480{
11481 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11482 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11483 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11484 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11485}
11486
11487
11488IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11489{
11490 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11491 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11492 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11493 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11494 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11495 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11496 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11497 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11498}
11499
11500
11501/*
11502 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11503 */
11504#ifdef IEM_WITHOUT_ASSEMBLY
11505
11506IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11507{
11508 RTUINT64U uSrc1 = { *puDst };
11509 RTUINT64U uSrc2 = { *puSrc };
11510 RTUINT64U uDst;
11511
11512 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11513 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11514 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11515 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11516 *puDst = uDst.u;
11517}
11518
11519
11520IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11521{
11522 RTUINT128U uSrc1 = *puDst;
11523
11524 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11525 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11526 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11527 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11528 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11529 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11530 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11531 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11532}
11533
11534#endif
11535
11536IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11537{
11538 RTUINT128U uSrc1 = *puDst;
11539
11540 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11541 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11542 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11543 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11544 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11545 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11546 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11547 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11548 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11549 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11550 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11551 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11552 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11553 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11554 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11555 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11556}
11557
11558
11559IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11560{
11561 RTUINT128U uSrc1 = *puDst;
11562
11563 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11564 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11565 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11566 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11567}
11568
11569
11570IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11571{
11572 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11573 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11574 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11575 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11576 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11577 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11578 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11579 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11580 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11581 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11582 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11583 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11584 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11585 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11586 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11587 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11588}
11589
11590
11591IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11592{
11593 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11594 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11595 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11596 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11597 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11598 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11599 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11600 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11601 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11602 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11603 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11604 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11605 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11606 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11607 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11608 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11609 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11610 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11611 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11612 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11613 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11614 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11615 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11616 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11617 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11618 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11619 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11620 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11621 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11622 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11623 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11624 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11625}
11626
11627
11628IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11629{
11630 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11631 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11632 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11633 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11634 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11635 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11636 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11637 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11638}
11639
11640
11641IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11642{
11643 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11644 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11645 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11646 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11647 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11648 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11649 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11650 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11651 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11652 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11653 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11654 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11655 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11656 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11657 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11658 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11659}
11660
11661
11662IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11663{
11664 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11665 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11666 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11667 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11668}
11669
11670
11671IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11672{
11673 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11674 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11675 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11676 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11677 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11678 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11679 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11680 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11681}
11682
11683
11684/*
11685 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11686 */
11687#ifdef IEM_WITHOUT_ASSEMBLY
11688
11689IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11690{
11691 RTUINT64U uSrc1 = { *puDst };
11692 RTUINT64U uSrc2 = { *puSrc };
11693 RTUINT64U uDst;
11694
11695 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11696 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11697 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11698 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11699 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11700 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11701 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11702 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11703 *puDst = uDst.u;
11704}
11705
11706
11707IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11708{
11709 RTUINT128U uSrc1 = *puDst;
11710
11711 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11712 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11713 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11714 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11715 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11716 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11717 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11718 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11719 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11720 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11721 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11722 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11723 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11724 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11725 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11726 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11727}
11728
11729#endif
11730
11731IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11732{
11733 RTUINT128U uSrc1 = *puDst;
11734
11735 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11736 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11737 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11738 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11739 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11740 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11741 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11742 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11743}
11744
11745
11746IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11747{
11748 RTUINT128U uSrc1 = *puDst;
11749
11750 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11751 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11752 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11753 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11754}
11755
11756
11757IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11758{
11759 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11760 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11761 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11762 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11763 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11764 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11765 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11766 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11767 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11768 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11769 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11770 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11771 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11772 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11773 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11774 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11775}
11776
11777
11778IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11779{
11780 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11781 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11782 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11783 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11784 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11785 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11786 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11787 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11788 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11789 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11790 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11791 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11792 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11793 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11794 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11795 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11796 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11797 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11798 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11799 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11800 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11801 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11802 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11803 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11804 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11805 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11806 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11807 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11808 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11809 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11810 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11811 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11812}
11813
11814
11815IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11816{
11817 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11818 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11819 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11820 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11821 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11822 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11823 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11824 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11825}
11826
11827
11828IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11829{
11830 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11831 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11832 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11833 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11834 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11835 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11836 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11837 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11838 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11839 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11840 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11841 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11842 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11843 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11844 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11845 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11846}
11847
11848
11849IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11850{
11851 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11852 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11853 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11854 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11855}
11856
11857
11858IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11859{
11860 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11861 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11862 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11863 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11864 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11865 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11866 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11867 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11868}
11869
11870
11871/*
11872 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11873 */
11874#ifdef IEM_WITHOUT_ASSEMBLY
11875
11876IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11877{
11878 RTUINT64U uSrc1 = { *puDst };
11879 RTUINT64U uSrc2 = { *puSrc };
11880 RTUINT64U uDst;
11881
11882 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11883 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11884 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11885 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11886 *puDst = uDst.u;
11887}
11888
11889
11890IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11891{
11892 RTUINT128U uSrc1 = *puDst;
11893
11894 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11895 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11896 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11897 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11898 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11899 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11900 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11901 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11902}
11903
11904#endif
11905
11906IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11907{
11908 RTUINT128U uSrc1 = *puDst;
11909
11910 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11911 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11912 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11913 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11914 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11915 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11916 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11917 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11918 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11919 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11920 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11921 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11922 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11923 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11924 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11925 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11926}
11927
11928
11929IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11930{
11931 RTUINT128U uSrc1 = *puDst;
11932
11933 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11934 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11935 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11936 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11937}
11938
11939
11940IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11941{
11942 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11943 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11944 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11945 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11946 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11947 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11948 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11949 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11950 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11951 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11952 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11953 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11954 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11955 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11956 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11957 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11958}
11959
11960
11961IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11962{
11963 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11964 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11965 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11966 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11967 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11968 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11969 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11970 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11971 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11972 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11973 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11974 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11975 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11976 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11977 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11978 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11979 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11980 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11981 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11982 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11983 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11984 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11985 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11986 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11987 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11988 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11989 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11990 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11991 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11992 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11993 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11994 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11995}
11996
11997
11998IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11999{
12000 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12001 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12002 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12003 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12004 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12005 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12006 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12007 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12008}
12009
12010
12011IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12012{
12013 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12014 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12015 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12016 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12017 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12018 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12019 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12020 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12021 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12022 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12023 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12024 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12025 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12026 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12027 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12028 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12029}
12030
12031
12032IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12033{
12034 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12035 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12036 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12037 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12038}
12039
12040
12041IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12042{
12043 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12044 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12045 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12046 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12047 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12048 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12049 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12050 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12051}
12052
12053
12054/*
12055 * PAVGB / VPAVGB / PAVGW / VPAVGW
12056 */
/* Rounded average of two elements: (a + b + 1) >> 1.  The first operand is
   cast to a wider unsigned type before the addition and the shifted result
   is cast back to the element width (uint8_t for PAVGB_EXEC, uint16_t for
   PAVGW_EXEC). */
12057#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
12058#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
12059
12060#ifdef IEM_WITHOUT_ASSEMBLY
12061
12062IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12063{
12064 RTUINT64U uSrc1 = { *puDst };
12065 RTUINT64U uSrc2 = { *puSrc };
12066 RTUINT64U uDst;
12067
12068 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12069 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12070 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12071 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12072 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12073 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12074 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12075 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12076 *puDst = uDst.u;
12077}
12078
12079
12080IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12081{
12082 RTUINT128U uSrc1 = *puDst;
12083
12084 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12085 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12086 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12087 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12088 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12089 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12090 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12091 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12092 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12093 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12094 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12095 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12096 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12097 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12098 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12099 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12100}
12101
12102
12103IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12104{
12105 RTUINT64U uSrc1 = { *puDst };
12106 RTUINT64U uSrc2 = { *puSrc };
12107 RTUINT64U uDst;
12108
12109 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12110 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12111 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12112 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12113 *puDst = uDst.u;
12114}
12115
12116
12117IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12118{
12119 RTUINT128U uSrc1 = *puDst;
12120
12121 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12122 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12123 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12124 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12125 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12126 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12127 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12128 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12129}
12130
12131#endif
12132
12133IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12134{
12135 RTUINT128U uSrc1 = *puDst;
12136
12137 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12138 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12139 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12140 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12141 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12142 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12143 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12144 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12145 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12146 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12147 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12148 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12149 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12150 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12151 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12152 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12153}
12154
12155
12156IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12157{
12158 RTUINT128U uSrc1 = *puDst;
12159
12160 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12161 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12162 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12163 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12164 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12165 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12166 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12167 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12168 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12169 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12170 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12171 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12172 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12173 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12174 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12175 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12176}
12177
12178
12179IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12180{
12181 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12182 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12183 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12184 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12185 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12186 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12187 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12188 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12189 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12190 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12191 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12192 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12193 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12194 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12195 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12196 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12197}
12198
12199
12200IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12201{
12202 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12203 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12204 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12205 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12206 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12207 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12208 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12209 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12210 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12211 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12212 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12213 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12214 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12215 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12216 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12217 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12218 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12219 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12220 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12221 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12222 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12223 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12224 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12225 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12226 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12227 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12228 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12229 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12230 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12231 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12232 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12233 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12234}
12235
12236
12237IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12238{
12239 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12240 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12241 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12242 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12243 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12244 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12245 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12246 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12247}
12248
12249
12250IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12251{
12252 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12253 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12254 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12255 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12256 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12257 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12258 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12259 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12260 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12261 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12262 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12263 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12264 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12265 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12266 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12267 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12268}
12269
12270#undef PAVGB_EXEC
12271#undef PAVGW_EXEC
12272
12273
12274/*
12275 * PMOVMSKB / VPMOVMSKB
12276 */
12277#ifdef IEM_WITHOUT_ASSEMBLY
12278
12279IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12280{
12281 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12282 uint64_t const uSrc = *pu64Src;
12283 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12284 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12285 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12286 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12287 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12288 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12289 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12290 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12291}
12292
12293
12294IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12295{
12296 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12297 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12298 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12299 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12300 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12301 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12302 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12303 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12304 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12305 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12306 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12307 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12308 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12309 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12310 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12311 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12312 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12313 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12314 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12315}
12316
12317#endif
12318
12319IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12320{
12321 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12322 uint64_t const uSrc0 = puSrc->QWords.qw0;
12323 uint64_t const uSrc1 = puSrc->QWords.qw1;
12324 uint64_t const uSrc2 = puSrc->QWords.qw2;
12325 uint64_t const uSrc3 = puSrc->QWords.qw3;
12326 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12327 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12328 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12329 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12330 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12331 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12332 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12333 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12334 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12335 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12336 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12337 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12338 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12339 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12340 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12341 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12342 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12343 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12344 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12345 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12346 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12347 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12348 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12349 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12350 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12351 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12352 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12353 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12354 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12355 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12356 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12357 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12358}
12359
12360
12361/*
12362 * [V]PSHUFB
12363 */
12364
12365IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
12366{
12367 RTUINT64U const uSrc = { *puSrc };
12368 RTUINT64U const uDstIn = { *puDst };
12369 ASMCompilerBarrier();
12370 RTUINT64U uDstOut = { 0 };
12371 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12372 {
12373 uint8_t idxSrc = uSrc.au8[iByte];
12374 if (!(idxSrc & 0x80))
12375 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12376 }
12377 *puDst = uDstOut.u;
12378}
12379
12380
12381IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12382{
12383 RTUINT128U const uSrc = *puSrc;
12384 RTUINT128U const uDstIn = *puDst;
12385 ASMCompilerBarrier();
12386 puDst->au64[0] = 0;
12387 puDst->au64[1] = 0;
12388 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12389 {
12390 uint8_t idxSrc = uSrc.au8[iByte];
12391 if (!(idxSrc & 0x80))
12392 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12393 }
12394}
12395
12396
12397IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12398{
12399 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12400 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12401 ASMCompilerBarrier();
12402 puDst->au64[0] = 0;
12403 puDst->au64[1] = 0;
12404 for (unsigned iByte = 0; iByte < 16; iByte++)
12405 {
12406 uint8_t idxSrc = uSrc2.au8[iByte];
12407 if (!(idxSrc & 0x80))
12408 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12409 }
12410}
12411
12412
12413IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12414{
12415 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12416 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12417 ASMCompilerBarrier();
12418 puDst->au64[0] = 0;
12419 puDst->au64[1] = 0;
12420 puDst->au64[2] = 0;
12421 puDst->au64[3] = 0;
12422 for (unsigned iByte = 0; iByte < 16; iByte++)
12423 {
12424 uint8_t idxSrc = uSrc2.au8[iByte];
12425 if (!(idxSrc & 0x80))
12426 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12427 }
12428 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12429 {
12430 uint8_t idxSrc = uSrc2.au8[iByte];
12431 if (!(idxSrc & 0x80))
12432 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12433 }
12434}
12435
12436
12437/*
12438 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12439 */
12440#ifdef IEM_WITHOUT_ASSEMBLY
12441
12442IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12443{
12444 uint64_t const uSrc = *puSrc;
12445 ASMCompilerBarrier();
12446 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12447 uSrc >> (((bEvil >> 2) & 3) * 16),
12448 uSrc >> (((bEvil >> 4) & 3) * 16),
12449 uSrc >> (((bEvil >> 6) & 3) * 16));
12450}
12451
12452
12453IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12454{
12455 puDst->QWords.qw0 = puSrc->QWords.qw0;
12456 uint64_t const uSrc = puSrc->QWords.qw1;
12457 ASMCompilerBarrier();
12458 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12459 uSrc >> (((bEvil >> 2) & 3) * 16),
12460 uSrc >> (((bEvil >> 4) & 3) * 16),
12461 uSrc >> (((bEvil >> 6) & 3) * 16));
12462}
12463
12464#endif
12465
12466IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12467{
12468 puDst->QWords.qw0 = puSrc->QWords.qw0;
12469 uint64_t const uSrc1 = puSrc->QWords.qw1;
12470 puDst->QWords.qw2 = puSrc->QWords.qw2;
12471 uint64_t const uSrc3 = puSrc->QWords.qw3;
12472 ASMCompilerBarrier();
12473 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12474 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12475 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12476 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12477 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12478 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12479 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12480 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12481}
12482
12483#ifdef IEM_WITHOUT_ASSEMBLY
12484IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12485{
12486 puDst->QWords.qw1 = puSrc->QWords.qw1;
12487 uint64_t const uSrc = puSrc->QWords.qw0;
12488 ASMCompilerBarrier();
12489 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12490 uSrc >> (((bEvil >> 2) & 3) * 16),
12491 uSrc >> (((bEvil >> 4) & 3) * 16),
12492 uSrc >> (((bEvil >> 6) & 3) * 16));
12493
12494}
12495#endif
12496
12497
12498IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12499{
12500 puDst->QWords.qw3 = puSrc->QWords.qw3;
12501 uint64_t const uSrc2 = puSrc->QWords.qw2;
12502 puDst->QWords.qw1 = puSrc->QWords.qw1;
12503 uint64_t const uSrc0 = puSrc->QWords.qw0;
12504 ASMCompilerBarrier();
12505 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12506 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12507 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12508 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12509 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12510 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12511 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12512 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12513
12514}
12515
12516
12517#ifdef IEM_WITHOUT_ASSEMBLY
12518IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12519{
12520 RTUINT128U const uSrc = *puSrc;
12521 ASMCompilerBarrier();
12522 puDst->au32[0] = uSrc.au32[bEvil & 3];
12523 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12524 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12525 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12526}
12527#endif
12528
12529
12530IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12531{
12532 RTUINT256U const uSrc = *puSrc;
12533 ASMCompilerBarrier();
12534 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12535 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12536 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12537 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12538 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12539 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12540 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12541 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12542}
12543
12544
12545/*
12546 * PUNPCKHBW - high bytes -> words
12547 */
12548#ifdef IEM_WITHOUT_ASSEMBLY
12549
12550IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12551{
12552 RTUINT64U const uSrc2 = { *puSrc };
12553 RTUINT64U const uSrc1 = { *puDst };
12554 ASMCompilerBarrier();
12555 RTUINT64U uDstOut;
12556 uDstOut.au8[0] = uSrc1.au8[4];
12557 uDstOut.au8[1] = uSrc2.au8[4];
12558 uDstOut.au8[2] = uSrc1.au8[5];
12559 uDstOut.au8[3] = uSrc2.au8[5];
12560 uDstOut.au8[4] = uSrc1.au8[6];
12561 uDstOut.au8[5] = uSrc2.au8[6];
12562 uDstOut.au8[6] = uSrc1.au8[7];
12563 uDstOut.au8[7] = uSrc2.au8[7];
12564 *puDst = uDstOut.u;
12565}
12566
12567
12568IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12569{
12570 RTUINT128U const uSrc2 = *puSrc;
12571 RTUINT128U const uSrc1 = *puDst;
12572 ASMCompilerBarrier();
12573 RTUINT128U uDstOut;
12574 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12575 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12576 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12577 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12578 uDstOut.au8[ 4] = uSrc1.au8[10];
12579 uDstOut.au8[ 5] = uSrc2.au8[10];
12580 uDstOut.au8[ 6] = uSrc1.au8[11];
12581 uDstOut.au8[ 7] = uSrc2.au8[11];
12582 uDstOut.au8[ 8] = uSrc1.au8[12];
12583 uDstOut.au8[ 9] = uSrc2.au8[12];
12584 uDstOut.au8[10] = uSrc1.au8[13];
12585 uDstOut.au8[11] = uSrc2.au8[13];
12586 uDstOut.au8[12] = uSrc1.au8[14];
12587 uDstOut.au8[13] = uSrc2.au8[14];
12588 uDstOut.au8[14] = uSrc1.au8[15];
12589 uDstOut.au8[15] = uSrc2.au8[15];
12590 *puDst = uDstOut;
12591}
12592
12593#endif
12594
12595IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12596{
12597 RTUINT128U const uSrc2 = *puSrc2;
12598 RTUINT128U const uSrc1 = *puSrc1;
12599 ASMCompilerBarrier();
12600 RTUINT128U uDstOut;
12601 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12602 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12603 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12604 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12605 uDstOut.au8[ 4] = uSrc1.au8[10];
12606 uDstOut.au8[ 5] = uSrc2.au8[10];
12607 uDstOut.au8[ 6] = uSrc1.au8[11];
12608 uDstOut.au8[ 7] = uSrc2.au8[11];
12609 uDstOut.au8[ 8] = uSrc1.au8[12];
12610 uDstOut.au8[ 9] = uSrc2.au8[12];
12611 uDstOut.au8[10] = uSrc1.au8[13];
12612 uDstOut.au8[11] = uSrc2.au8[13];
12613 uDstOut.au8[12] = uSrc1.au8[14];
12614 uDstOut.au8[13] = uSrc2.au8[14];
12615 uDstOut.au8[14] = uSrc1.au8[15];
12616 uDstOut.au8[15] = uSrc2.au8[15];
12617 *puDst = uDstOut;
12618}
12619
12620
12621IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12622{
12623 RTUINT256U const uSrc2 = *puSrc2;
12624 RTUINT256U const uSrc1 = *puSrc1;
12625 ASMCompilerBarrier();
12626 RTUINT256U uDstOut;
12627 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12628 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12629 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12630 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12631 uDstOut.au8[ 4] = uSrc1.au8[10];
12632 uDstOut.au8[ 5] = uSrc2.au8[10];
12633 uDstOut.au8[ 6] = uSrc1.au8[11];
12634 uDstOut.au8[ 7] = uSrc2.au8[11];
12635 uDstOut.au8[ 8] = uSrc1.au8[12];
12636 uDstOut.au8[ 9] = uSrc2.au8[12];
12637 uDstOut.au8[10] = uSrc1.au8[13];
12638 uDstOut.au8[11] = uSrc2.au8[13];
12639 uDstOut.au8[12] = uSrc1.au8[14];
12640 uDstOut.au8[13] = uSrc2.au8[14];
12641 uDstOut.au8[14] = uSrc1.au8[15];
12642 uDstOut.au8[15] = uSrc2.au8[15];
12643 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12644 uDstOut.au8[16] = uSrc1.au8[24];
12645 uDstOut.au8[17] = uSrc2.au8[24];
12646 uDstOut.au8[18] = uSrc1.au8[25];
12647 uDstOut.au8[19] = uSrc2.au8[25];
12648 uDstOut.au8[20] = uSrc1.au8[26];
12649 uDstOut.au8[21] = uSrc2.au8[26];
12650 uDstOut.au8[22] = uSrc1.au8[27];
12651 uDstOut.au8[23] = uSrc2.au8[27];
12652 uDstOut.au8[24] = uSrc1.au8[28];
12653 uDstOut.au8[25] = uSrc2.au8[28];
12654 uDstOut.au8[26] = uSrc1.au8[29];
12655 uDstOut.au8[27] = uSrc2.au8[29];
12656 uDstOut.au8[28] = uSrc1.au8[30];
12657 uDstOut.au8[29] = uSrc2.au8[30];
12658 uDstOut.au8[30] = uSrc1.au8[31];
12659 uDstOut.au8[31] = uSrc2.au8[31];
12660 *puDst = uDstOut;
12661}
12662
12663
12664/*
12665 * PUNPCKHWD - high words -> dwords
12666 */
12667#ifdef IEM_WITHOUT_ASSEMBLY
12668
12669IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12670{
12671 RTUINT64U const uSrc2 = { *puSrc };
12672 RTUINT64U const uSrc1 = { *puDst };
12673 ASMCompilerBarrier();
12674 RTUINT64U uDstOut;
12675 uDstOut.au16[0] = uSrc1.au16[2];
12676 uDstOut.au16[1] = uSrc2.au16[2];
12677 uDstOut.au16[2] = uSrc1.au16[3];
12678 uDstOut.au16[3] = uSrc2.au16[3];
12679 *puDst = uDstOut.u;
12680}
12681
12682
12683IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12684{
12685 RTUINT128U const uSrc2 = *puSrc;
12686 RTUINT128U const uSrc1 = *puDst;
12687 ASMCompilerBarrier();
12688 RTUINT128U uDstOut;
12689 uDstOut.au16[0] = uSrc1.au16[4];
12690 uDstOut.au16[1] = uSrc2.au16[4];
12691 uDstOut.au16[2] = uSrc1.au16[5];
12692 uDstOut.au16[3] = uSrc2.au16[5];
12693 uDstOut.au16[4] = uSrc1.au16[6];
12694 uDstOut.au16[5] = uSrc2.au16[6];
12695 uDstOut.au16[6] = uSrc1.au16[7];
12696 uDstOut.au16[7] = uSrc2.au16[7];
12697 *puDst = uDstOut;
12698}
12699
12700#endif
12701
12702IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12703{
12704 RTUINT128U const uSrc2 = *puSrc2;
12705 RTUINT128U const uSrc1 = *puSrc1;
12706 ASMCompilerBarrier();
12707 RTUINT128U uDstOut;
12708 uDstOut.au16[0] = uSrc1.au16[4];
12709 uDstOut.au16[1] = uSrc2.au16[4];
12710 uDstOut.au16[2] = uSrc1.au16[5];
12711 uDstOut.au16[3] = uSrc2.au16[5];
12712 uDstOut.au16[4] = uSrc1.au16[6];
12713 uDstOut.au16[5] = uSrc2.au16[6];
12714 uDstOut.au16[6] = uSrc1.au16[7];
12715 uDstOut.au16[7] = uSrc2.au16[7];
12716 *puDst = uDstOut;
12717}
12718
12719
12720IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12721{
12722 RTUINT256U const uSrc2 = *puSrc2;
12723 RTUINT256U const uSrc1 = *puSrc1;
12724 ASMCompilerBarrier();
12725 RTUINT256U uDstOut;
12726 uDstOut.au16[0] = uSrc1.au16[4];
12727 uDstOut.au16[1] = uSrc2.au16[4];
12728 uDstOut.au16[2] = uSrc1.au16[5];
12729 uDstOut.au16[3] = uSrc2.au16[5];
12730 uDstOut.au16[4] = uSrc1.au16[6];
12731 uDstOut.au16[5] = uSrc2.au16[6];
12732 uDstOut.au16[6] = uSrc1.au16[7];
12733 uDstOut.au16[7] = uSrc2.au16[7];
12734
12735 uDstOut.au16[8] = uSrc1.au16[12];
12736 uDstOut.au16[9] = uSrc2.au16[12];
12737 uDstOut.au16[10] = uSrc1.au16[13];
12738 uDstOut.au16[11] = uSrc2.au16[13];
12739 uDstOut.au16[12] = uSrc1.au16[14];
12740 uDstOut.au16[13] = uSrc2.au16[14];
12741 uDstOut.au16[14] = uSrc1.au16[15];
12742 uDstOut.au16[15] = uSrc2.au16[15];
12743 *puDst = uDstOut;
12744}
12745
12746
12747/*
12748 * PUNPCKHDQ - high dwords -> qword(s)
12749 */
12750#ifdef IEM_WITHOUT_ASSEMBLY
12751
12752IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12753{
12754 RTUINT64U const uSrc2 = { *puSrc };
12755 RTUINT64U const uSrc1 = { *puDst };
12756 ASMCompilerBarrier();
12757 RTUINT64U uDstOut;
12758 uDstOut.au32[0] = uSrc1.au32[1];
12759 uDstOut.au32[1] = uSrc2.au32[1];
12760 *puDst = uDstOut.u;
12761}
12762
12763
12764IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12765{
12766 RTUINT128U const uSrc2 = *puSrc;
12767 RTUINT128U const uSrc1 = *puDst;
12768 ASMCompilerBarrier();
12769 RTUINT128U uDstOut;
12770 uDstOut.au32[0] = uSrc1.au32[2];
12771 uDstOut.au32[1] = uSrc2.au32[2];
12772 uDstOut.au32[2] = uSrc1.au32[3];
12773 uDstOut.au32[3] = uSrc2.au32[3];
12774 *puDst = uDstOut;
12775}
12776
12777#endif
12778
12779IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12780{
12781 RTUINT128U const uSrc2 = *puSrc2;
12782 RTUINT128U const uSrc1 = *puSrc1;
12783 ASMCompilerBarrier();
12784 RTUINT128U uDstOut;
12785 uDstOut.au32[0] = uSrc1.au32[2];
12786 uDstOut.au32[1] = uSrc2.au32[2];
12787 uDstOut.au32[2] = uSrc1.au32[3];
12788 uDstOut.au32[3] = uSrc2.au32[3];
12789 *puDst = uDstOut;
12790}
12791
12792
12793IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12794{
12795 RTUINT256U const uSrc2 = *puSrc2;
12796 RTUINT256U const uSrc1 = *puSrc1;
12797 ASMCompilerBarrier();
12798 RTUINT256U uDstOut;
12799 uDstOut.au32[0] = uSrc1.au32[2];
12800 uDstOut.au32[1] = uSrc2.au32[2];
12801 uDstOut.au32[2] = uSrc1.au32[3];
12802 uDstOut.au32[3] = uSrc2.au32[3];
12803
12804 uDstOut.au32[4] = uSrc1.au32[6];
12805 uDstOut.au32[5] = uSrc2.au32[6];
12806 uDstOut.au32[6] = uSrc1.au32[7];
12807 uDstOut.au32[7] = uSrc2.au32[7];
12808 *puDst = uDstOut;
12809}
12810
12811
12812/*
12813 * PUNPCKHQDQ -> High qwords -> double qword(s).
12814 */
12815#ifdef IEM_WITHOUT_ASSEMBLY
12816IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12817{
12818 RTUINT128U const uSrc2 = *puSrc;
12819 RTUINT128U const uSrc1 = *puDst;
12820 ASMCompilerBarrier();
12821 RTUINT128U uDstOut;
12822 uDstOut.au64[0] = uSrc1.au64[1];
12823 uDstOut.au64[1] = uSrc2.au64[1];
12824 *puDst = uDstOut;
12825}
12826#endif
12827
12828
12829IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12830{
12831 RTUINT128U const uSrc2 = *puSrc2;
12832 RTUINT128U const uSrc1 = *puSrc1;
12833 ASMCompilerBarrier();
12834 RTUINT128U uDstOut;
12835 uDstOut.au64[0] = uSrc1.au64[1];
12836 uDstOut.au64[1] = uSrc2.au64[1];
12837 *puDst = uDstOut;
12838}
12839
12840
12841IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12842{
12843 RTUINT256U const uSrc2 = *puSrc2;
12844 RTUINT256U const uSrc1 = *puSrc1;
12845 ASMCompilerBarrier();
12846 RTUINT256U uDstOut;
12847 uDstOut.au64[0] = uSrc1.au64[1];
12848 uDstOut.au64[1] = uSrc2.au64[1];
12849
12850 uDstOut.au64[2] = uSrc1.au64[3];
12851 uDstOut.au64[3] = uSrc2.au64[3];
12852 *puDst = uDstOut;
12853}
12854
12855
12856/*
12857 * PUNPCKLBW - low bytes -> words
12858 */
12859#ifdef IEM_WITHOUT_ASSEMBLY
12860
12861IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12862{
12863 RTUINT64U const uSrc2 = { *puSrc };
12864 RTUINT64U const uSrc1 = { *puDst };
12865 ASMCompilerBarrier();
12866 RTUINT64U uDstOut;
12867 uDstOut.au8[0] = uSrc1.au8[0];
12868 uDstOut.au8[1] = uSrc2.au8[0];
12869 uDstOut.au8[2] = uSrc1.au8[1];
12870 uDstOut.au8[3] = uSrc2.au8[1];
12871 uDstOut.au8[4] = uSrc1.au8[2];
12872 uDstOut.au8[5] = uSrc2.au8[2];
12873 uDstOut.au8[6] = uSrc1.au8[3];
12874 uDstOut.au8[7] = uSrc2.au8[3];
12875 *puDst = uDstOut.u;
12876}
12877
12878
12879IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12880{
12881 RTUINT128U const uSrc2 = *puSrc;
12882 RTUINT128U const uSrc1 = *puDst;
12883 ASMCompilerBarrier();
12884 RTUINT128U uDstOut;
12885 uDstOut.au8[ 0] = uSrc1.au8[0];
12886 uDstOut.au8[ 1] = uSrc2.au8[0];
12887 uDstOut.au8[ 2] = uSrc1.au8[1];
12888 uDstOut.au8[ 3] = uSrc2.au8[1];
12889 uDstOut.au8[ 4] = uSrc1.au8[2];
12890 uDstOut.au8[ 5] = uSrc2.au8[2];
12891 uDstOut.au8[ 6] = uSrc1.au8[3];
12892 uDstOut.au8[ 7] = uSrc2.au8[3];
12893 uDstOut.au8[ 8] = uSrc1.au8[4];
12894 uDstOut.au8[ 9] = uSrc2.au8[4];
12895 uDstOut.au8[10] = uSrc1.au8[5];
12896 uDstOut.au8[11] = uSrc2.au8[5];
12897 uDstOut.au8[12] = uSrc1.au8[6];
12898 uDstOut.au8[13] = uSrc2.au8[6];
12899 uDstOut.au8[14] = uSrc1.au8[7];
12900 uDstOut.au8[15] = uSrc2.au8[7];
12901 *puDst = uDstOut;
12902}
12903
12904#endif
12905
12906IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12907{
12908 RTUINT128U const uSrc2 = *puSrc2;
12909 RTUINT128U const uSrc1 = *puSrc1;
12910 ASMCompilerBarrier();
12911 RTUINT128U uDstOut;
12912 uDstOut.au8[ 0] = uSrc1.au8[0];
12913 uDstOut.au8[ 1] = uSrc2.au8[0];
12914 uDstOut.au8[ 2] = uSrc1.au8[1];
12915 uDstOut.au8[ 3] = uSrc2.au8[1];
12916 uDstOut.au8[ 4] = uSrc1.au8[2];
12917 uDstOut.au8[ 5] = uSrc2.au8[2];
12918 uDstOut.au8[ 6] = uSrc1.au8[3];
12919 uDstOut.au8[ 7] = uSrc2.au8[3];
12920 uDstOut.au8[ 8] = uSrc1.au8[4];
12921 uDstOut.au8[ 9] = uSrc2.au8[4];
12922 uDstOut.au8[10] = uSrc1.au8[5];
12923 uDstOut.au8[11] = uSrc2.au8[5];
12924 uDstOut.au8[12] = uSrc1.au8[6];
12925 uDstOut.au8[13] = uSrc2.au8[6];
12926 uDstOut.au8[14] = uSrc1.au8[7];
12927 uDstOut.au8[15] = uSrc2.au8[7];
12928 *puDst = uDstOut;
12929}
12930
12931
12932IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12933{
12934 RTUINT256U const uSrc2 = *puSrc2;
12935 RTUINT256U const uSrc1 = *puSrc1;
12936 ASMCompilerBarrier();
12937 RTUINT256U uDstOut;
12938 uDstOut.au8[ 0] = uSrc1.au8[0];
12939 uDstOut.au8[ 1] = uSrc2.au8[0];
12940 uDstOut.au8[ 2] = uSrc1.au8[1];
12941 uDstOut.au8[ 3] = uSrc2.au8[1];
12942 uDstOut.au8[ 4] = uSrc1.au8[2];
12943 uDstOut.au8[ 5] = uSrc2.au8[2];
12944 uDstOut.au8[ 6] = uSrc1.au8[3];
12945 uDstOut.au8[ 7] = uSrc2.au8[3];
12946 uDstOut.au8[ 8] = uSrc1.au8[4];
12947 uDstOut.au8[ 9] = uSrc2.au8[4];
12948 uDstOut.au8[10] = uSrc1.au8[5];
12949 uDstOut.au8[11] = uSrc2.au8[5];
12950 uDstOut.au8[12] = uSrc1.au8[6];
12951 uDstOut.au8[13] = uSrc2.au8[6];
12952 uDstOut.au8[14] = uSrc1.au8[7];
12953 uDstOut.au8[15] = uSrc2.au8[7];
12954 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12955 uDstOut.au8[16] = uSrc1.au8[16];
12956 uDstOut.au8[17] = uSrc2.au8[16];
12957 uDstOut.au8[18] = uSrc1.au8[17];
12958 uDstOut.au8[19] = uSrc2.au8[17];
12959 uDstOut.au8[20] = uSrc1.au8[18];
12960 uDstOut.au8[21] = uSrc2.au8[18];
12961 uDstOut.au8[22] = uSrc1.au8[19];
12962 uDstOut.au8[23] = uSrc2.au8[19];
12963 uDstOut.au8[24] = uSrc1.au8[20];
12964 uDstOut.au8[25] = uSrc2.au8[20];
12965 uDstOut.au8[26] = uSrc1.au8[21];
12966 uDstOut.au8[27] = uSrc2.au8[21];
12967 uDstOut.au8[28] = uSrc1.au8[22];
12968 uDstOut.au8[29] = uSrc2.au8[22];
12969 uDstOut.au8[30] = uSrc1.au8[23];
12970 uDstOut.au8[31] = uSrc2.au8[23];
12971 *puDst = uDstOut;
12972}
12973
12974
12975/*
12976 * PUNPCKLWD - low words -> dwords
12977 */
12978#ifdef IEM_WITHOUT_ASSEMBLY
12979
12980IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12981{
12982 RTUINT64U const uSrc2 = { *puSrc };
12983 RTUINT64U const uSrc1 = { *puDst };
12984 ASMCompilerBarrier();
12985 RTUINT64U uDstOut;
12986 uDstOut.au16[0] = uSrc1.au16[0];
12987 uDstOut.au16[1] = uSrc2.au16[0];
12988 uDstOut.au16[2] = uSrc1.au16[1];
12989 uDstOut.au16[3] = uSrc2.au16[1];
12990 *puDst = uDstOut.u;
12991}
12992
12993
12994IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12995{
12996 RTUINT128U const uSrc2 = *puSrc;
12997 RTUINT128U const uSrc1 = *puDst;
12998 ASMCompilerBarrier();
12999 RTUINT128U uDstOut;
13000 uDstOut.au16[0] = uSrc1.au16[0];
13001 uDstOut.au16[1] = uSrc2.au16[0];
13002 uDstOut.au16[2] = uSrc1.au16[1];
13003 uDstOut.au16[3] = uSrc2.au16[1];
13004 uDstOut.au16[4] = uSrc1.au16[2];
13005 uDstOut.au16[5] = uSrc2.au16[2];
13006 uDstOut.au16[6] = uSrc1.au16[3];
13007 uDstOut.au16[7] = uSrc2.au16[3];
13008 *puDst = uDstOut;
13009}
13010
13011#endif
13012
13013IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13014{
13015 RTUINT128U const uSrc2 = *puSrc2;
13016 RTUINT128U const uSrc1 = *puSrc1;
13017 ASMCompilerBarrier();
13018 RTUINT128U uDstOut;
13019 uDstOut.au16[0] = uSrc1.au16[0];
13020 uDstOut.au16[1] = uSrc2.au16[0];
13021 uDstOut.au16[2] = uSrc1.au16[1];
13022 uDstOut.au16[3] = uSrc2.au16[1];
13023 uDstOut.au16[4] = uSrc1.au16[2];
13024 uDstOut.au16[5] = uSrc2.au16[2];
13025 uDstOut.au16[6] = uSrc1.au16[3];
13026 uDstOut.au16[7] = uSrc2.au16[3];
13027 *puDst = uDstOut;
13028}
13029
13030
13031IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13032{
13033 RTUINT256U const uSrc2 = *puSrc2;
13034 RTUINT256U const uSrc1 = *puSrc1;
13035 ASMCompilerBarrier();
13036 RTUINT256U uDstOut;
13037 uDstOut.au16[0] = uSrc1.au16[0];
13038 uDstOut.au16[1] = uSrc2.au16[0];
13039 uDstOut.au16[2] = uSrc1.au16[1];
13040 uDstOut.au16[3] = uSrc2.au16[1];
13041 uDstOut.au16[4] = uSrc1.au16[2];
13042 uDstOut.au16[5] = uSrc2.au16[2];
13043 uDstOut.au16[6] = uSrc1.au16[3];
13044 uDstOut.au16[7] = uSrc2.au16[3];
13045
13046 uDstOut.au16[8] = uSrc1.au16[8];
13047 uDstOut.au16[9] = uSrc2.au16[8];
13048 uDstOut.au16[10] = uSrc1.au16[9];
13049 uDstOut.au16[11] = uSrc2.au16[9];
13050 uDstOut.au16[12] = uSrc1.au16[10];
13051 uDstOut.au16[13] = uSrc2.au16[10];
13052 uDstOut.au16[14] = uSrc1.au16[11];
13053 uDstOut.au16[15] = uSrc2.au16[11];
13054 *puDst = uDstOut;
13055}
13056
13057
13058/*
13059 * PUNPCKLDQ - low dwords -> qword(s)
13060 */
13061#ifdef IEM_WITHOUT_ASSEMBLY
13062
13063IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13064{
13065 RTUINT64U const uSrc2 = { *puSrc };
13066 RTUINT64U const uSrc1 = { *puDst };
13067 ASMCompilerBarrier();
13068 RTUINT64U uDstOut;
13069 uDstOut.au32[0] = uSrc1.au32[0];
13070 uDstOut.au32[1] = uSrc2.au32[0];
13071 *puDst = uDstOut.u;
13072}
13073
13074
13075IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13076{
13077 RTUINT128U const uSrc2 = *puSrc;
13078 RTUINT128U const uSrc1 = *puDst;
13079 ASMCompilerBarrier();
13080 RTUINT128U uDstOut;
13081 uDstOut.au32[0] = uSrc1.au32[0];
13082 uDstOut.au32[1] = uSrc2.au32[0];
13083 uDstOut.au32[2] = uSrc1.au32[1];
13084 uDstOut.au32[3] = uSrc2.au32[1];
13085 *puDst = uDstOut;
13086}
13087
13088#endif
13089
13090IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13091{
13092 RTUINT128U const uSrc2 = *puSrc2;
13093 RTUINT128U const uSrc1 = *puSrc1;
13094 ASMCompilerBarrier();
13095 RTUINT128U uDstOut;
13096 uDstOut.au32[0] = uSrc1.au32[0];
13097 uDstOut.au32[1] = uSrc2.au32[0];
13098 uDstOut.au32[2] = uSrc1.au32[1];
13099 uDstOut.au32[3] = uSrc2.au32[1];
13100 *puDst = uDstOut;
13101}
13102
13103
13104IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13105{
13106 RTUINT256U const uSrc2 = *puSrc2;
13107 RTUINT256U const uSrc1 = *puSrc1;
13108 ASMCompilerBarrier();
13109 RTUINT256U uDstOut;
13110 uDstOut.au32[0] = uSrc1.au32[0];
13111 uDstOut.au32[1] = uSrc2.au32[0];
13112 uDstOut.au32[2] = uSrc1.au32[1];
13113 uDstOut.au32[3] = uSrc2.au32[1];
13114
13115 uDstOut.au32[4] = uSrc1.au32[4];
13116 uDstOut.au32[5] = uSrc2.au32[4];
13117 uDstOut.au32[6] = uSrc1.au32[5];
13118 uDstOut.au32[7] = uSrc2.au32[5];
13119 *puDst = uDstOut;
13120}
13121
13122
13123/*
13124 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13125 */
13126#ifdef IEM_WITHOUT_ASSEMBLY
13127IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13128{
13129 RTUINT128U const uSrc2 = *puSrc;
13130 RTUINT128U const uSrc1 = *puDst;
13131 ASMCompilerBarrier();
13132 RTUINT128U uDstOut;
13133 uDstOut.au64[0] = uSrc1.au64[0];
13134 uDstOut.au64[1] = uSrc2.au64[0];
13135 *puDst = uDstOut;
13136}
13137#endif
13138
13139
13140IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13141{
13142 RTUINT128U const uSrc2 = *puSrc2;
13143 RTUINT128U const uSrc1 = *puSrc1;
13144 ASMCompilerBarrier();
13145 RTUINT128U uDstOut;
13146 uDstOut.au64[0] = uSrc1.au64[0];
13147 uDstOut.au64[1] = uSrc2.au64[0];
13148 *puDst = uDstOut;
13149}
13150
13151
13152IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13153{
13154 RTUINT256U const uSrc2 = *puSrc2;
13155 RTUINT256U const uSrc1 = *puSrc1;
13156 ASMCompilerBarrier();
13157 RTUINT256U uDstOut;
13158 uDstOut.au64[0] = uSrc1.au64[0];
13159 uDstOut.au64[1] = uSrc2.au64[0];
13160
13161 uDstOut.au64[2] = uSrc1.au64[2];
13162 uDstOut.au64[3] = uSrc2.au64[2];
13163 *puDst = uDstOut;
13164}
13165
13166
13167/*
13168 * PACKSSWB - signed words -> signed bytes
13169 */
13170
13171#ifdef IEM_WITHOUT_ASSEMBLY
13172
13173IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13174{
13175 RTUINT64U const uSrc2 = { *puSrc };
13176 RTUINT64U const uSrc1 = { *puDst };
13177 ASMCompilerBarrier();
13178 RTUINT64U uDstOut;
13179 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13180 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13181 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13182 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13183 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13184 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13185 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13186 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13187 *puDst = uDstOut.u;
13188}
13189
13190
13191IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13192{
13193 RTUINT128U const uSrc2 = *puSrc;
13194 RTUINT128U const uSrc1 = *puDst;
13195 ASMCompilerBarrier();
13196 RTUINT128U uDstOut;
13197 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13198 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13199 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13200 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13201 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13202 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13203 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13204 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13205 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13206 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13207 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13208 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13209 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13210 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13211 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13212 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13213 *puDst = uDstOut;
13214}
13215
13216#endif
13217
13218IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13219{
13220 RTUINT128U const uSrc2 = *puSrc2;
13221 RTUINT128U const uSrc1 = *puSrc1;
13222 ASMCompilerBarrier();
13223 RTUINT128U uDstOut;
13224 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13225 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13226 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13227 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13228 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13229 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13230 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13231 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13232 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13233 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13234 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13235 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13236 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13237 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13238 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13239 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13240 *puDst = uDstOut;
13241}
13242
13243
13244IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13245{
13246 RTUINT256U const uSrc2 = *puSrc2;
13247 RTUINT256U const uSrc1 = *puSrc1;
13248 ASMCompilerBarrier();
13249 RTUINT256U uDstOut;
13250 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13251 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13252 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13253 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13254 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13255 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13256 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13257 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13258 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13259 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13260 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13261 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13262 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13263 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13264 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13265 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13266
13267 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13268 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13269 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13270 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13271 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13272 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13273 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13274 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13275 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13276 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13277 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13278 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13279 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13280 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13281 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13282 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13283 *puDst = uDstOut;
13284}
13285
13286
13287/*
13288 * PACKUSWB - signed words -> unsigned bytes
13289 */
/* Narrows a 16-bit value, treated as a signed word, to an unsigned byte with
   saturation: 0..0xff passes through unchanged; anything else becomes 0xff
   (UINT8_MAX) when bit 15 (the sign) is clear and 0x00 (UINT8_MIN) when it is
   set.  The argument is evaluated more than once. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (   (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) \
      : (uint8_t)(a_iWord) )
13294
13295#ifdef IEM_WITHOUT_ASSEMBLY
13296
13297IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13298{
13299 RTUINT64U const uSrc2 = { *puSrc };
13300 RTUINT64U const uSrc1 = { *puDst };
13301 ASMCompilerBarrier();
13302 RTUINT64U uDstOut;
13303 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13304 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13305 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13306 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13307 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13308 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13309 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13310 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13311 *puDst = uDstOut.u;
13312}
13313
13314
13315IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13316{
13317 RTUINT128U const uSrc2 = *puSrc;
13318 RTUINT128U const uSrc1 = *puDst;
13319 ASMCompilerBarrier();
13320 RTUINT128U uDstOut;
13321 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13322 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13323 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13324 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13325 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13326 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13327 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13328 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13329 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13330 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13331 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13332 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13333 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13334 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13335 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13336 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13337 *puDst = uDstOut;
13338}
13339
13340#endif
13341
13342IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13343{
13344 RTUINT128U const uSrc2 = *puSrc2;
13345 RTUINT128U const uSrc1 = *puSrc1;
13346 ASMCompilerBarrier();
13347 RTUINT128U uDstOut;
13348 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13349 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13350 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13351 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13352 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13353 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13354 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13355 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13356 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13357 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13358 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13359 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13360 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13361 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13362 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13363 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13364 *puDst = uDstOut;
13365}
13366
13367
13368IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13369{
13370 RTUINT256U const uSrc2 = *puSrc2;
13371 RTUINT256U const uSrc1 = *puSrc1;
13372 ASMCompilerBarrier();
13373 RTUINT256U uDstOut;
13374 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13375 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13376 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13377 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13378 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13379 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13380 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13381 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13382 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13383 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13384 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13385 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13386 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13387 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13388 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13389 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13390
13391 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13392 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13393 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13394 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13395 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13396 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13397 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13398 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13399 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13400 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13401 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13402 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13403 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13404 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13405 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13406 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13407 *puDst = uDstOut;
13408}
13409
13410
13411/*
13412 * PACKSSDW - signed dwords -> signed words
13413 */
13414
13415#ifdef IEM_WITHOUT_ASSEMBLY
13416
13417IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13418{
13419 RTUINT64U const uSrc2 = { *puSrc };
13420 RTUINT64U const uSrc1 = { *puDst };
13421 ASMCompilerBarrier();
13422 RTUINT64U uDstOut;
13423 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13424 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13425 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13426 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13427 *puDst = uDstOut.u;
13428}
13429
13430
13431IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13432{
13433 RTUINT128U const uSrc2 = *puSrc;
13434 RTUINT128U const uSrc1 = *puDst;
13435 ASMCompilerBarrier();
13436 RTUINT128U uDstOut;
13437 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13438 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13439 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13440 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13441 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13442 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13443 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13444 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13445 *puDst = uDstOut;
13446}
13447
13448#endif
13449
13450IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13451{
13452 RTUINT128U const uSrc2 = *puSrc2;
13453 RTUINT128U const uSrc1 = *puSrc1;
13454 ASMCompilerBarrier();
13455 RTUINT128U uDstOut;
13456 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13457 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13458 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13459 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13460 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13461 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13462 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13463 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13464 *puDst = uDstOut;
13465}
13466
13467
13468IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13469{
13470 RTUINT256U const uSrc2 = *puSrc2;
13471 RTUINT256U const uSrc1 = *puSrc1;
13472 ASMCompilerBarrier();
13473 RTUINT256U uDstOut;
13474 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13475 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13476 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13477 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13478 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13479 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13480 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13481 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13482
13483 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13484 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13485 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13486 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13487 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13488 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13489 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13490 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13491 *puDst = uDstOut;
13492}
13493
13494
13495/*
13496 * PACKUSDW - signed dwords -> unsigned words
13497 */
/* Narrows a 32-bit value, treated as a signed dword, to an unsigned word with
   saturation: 0..0xffff passes through unchanged; anything else becomes 0xffff
   (UINT16_MAX) when bit 31 (the sign) is clear and 0x0000 when it is set.
   The argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (   (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) \
      : (uint16_t)(a_iDword) )
13502
13503#ifdef IEM_WITHOUT_ASSEMBLY
13504IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13505{
13506 RTUINT128U const uSrc2 = *puSrc;
13507 RTUINT128U const uSrc1 = *puDst;
13508 ASMCompilerBarrier();
13509 RTUINT128U uDstOut;
13510 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13511 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13512 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13513 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13514 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13515 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13516 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13517 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13518 *puDst = uDstOut;
13519}
13520#endif
13521
13522IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13523{
13524 RTUINT128U const uSrc2 = *puSrc2;
13525 RTUINT128U const uSrc1 = *puSrc1;
13526 ASMCompilerBarrier();
13527 RTUINT128U uDstOut;
13528 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13529 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13530 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13531 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13532 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13533 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13534 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13535 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13536 *puDst = uDstOut;
13537}
13538
13539
13540IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13541{
13542 RTUINT256U const uSrc2 = *puSrc2;
13543 RTUINT256U const uSrc1 = *puSrc1;
13544 ASMCompilerBarrier();
13545 RTUINT256U uDstOut;
13546 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13547 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13548 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13549 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13550 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13551 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13552 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13553 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13554
13555 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13556 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13557 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13558 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13559 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13560 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13561 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13562 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13563 *puDst = uDstOut;
13564}
13565
13566
13567/*
13568 * [V]PABSB / [V]PABSW / [V]PABSD
13569 */
13570
13571IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13572{
13573 RTUINT64U const uSrc = { *puSrc };
13574 RTUINT64U uDstOut = { 0 };
13575
13576 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13577 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13578 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13579 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13580 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13581 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13582 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13583 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13584 *puDst = uDstOut.u;
13585}
13586
13587
13588IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13589{
13590 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13591 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13592 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13593 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13594 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13595 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13596 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13597 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13598 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13599 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13600 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13601 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13602 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13603 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13604 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13605 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13606}
13607
13608
13609IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13610{
13611 RTUINT64U const uSrc = { *puSrc };
13612 RTUINT64U uDstOut = { 0 };
13613
13614 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13615 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13616 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13617 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13618 *puDst = uDstOut.u;
13619}
13620
13621
13622IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13623{
13624 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13625 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13626 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13627 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13628 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13629 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13630 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13631 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13632}
13633
13634
13635IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13636{
13637 RTUINT64U const uSrc = { *puSrc };
13638 RTUINT64U uDstOut = { 0 };
13639
13640 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13641 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13642 *puDst = uDstOut.u;
13643}
13644
13645
13646IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13647{
13648 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13649 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13650 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13651 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13652}
13653
13654
13655IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13656{
13657 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13658 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13659 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13660 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13661 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13662 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13663 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13664 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13665 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13666 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13667 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13668 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13669 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13670 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13671 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13672 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13673}
13674
13675
13676IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13677{
13678 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13679 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13680 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13681 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13682 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13683 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13684 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13685 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13686 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13687 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13688 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13689 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13690 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13691 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13692 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13693 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13694 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13695 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13696 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13697 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13698 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13699 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13700 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13701 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13702 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13703 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13704 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13705 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13706 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13707 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13708 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13709 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13710}
13711
13712
13713IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13714{
13715 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13716 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13717 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13718 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13719 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13720 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13721 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13722 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13723}
13724
13725
13726IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13727{
13728 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13729 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13730 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13731 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13732 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13733 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13734 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13735 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13736 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13737 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13738 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13739 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13740 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13741 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13742 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13743 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13744}
13745
13746
13747IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13748{
13749 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13750 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13751 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13752 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13753}
13754
13755
13756IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13757{
13758 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13759 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13760 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13761 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13762 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13763 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13764 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13765 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13766}
13767
13768
13769/*
13770 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13771 */
13772IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13773{
13774 RTUINT64U uSrc1 = { *puDst };
13775 RTUINT64U uSrc2 = { *puSrc };
13776 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13777
13778 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13779 {
13780 if (uSrc2.ai8[i] < 0)
13781 uDst.ai8[i] = -uSrc1.ai8[i];
13782 else if (uSrc2.ai8[i] == 0)
13783 uDst.ai8[i] = 0;
13784 else /* uSrc2.ai8[i] > 0 */
13785 uDst.ai8[i] = uSrc1.ai8[i];
13786 }
13787
13788 *puDst = uDst.u;
13789}
13790
13791
13792IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13793{
13794 RTUINT128U uSrc1 = *puDst;
13795
13796 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13797 {
13798 if (puSrc->ai8[i] < 0)
13799 puDst->ai8[i] = -uSrc1.ai8[i];
13800 else if (puSrc->ai8[i] == 0)
13801 puDst->ai8[i] = 0;
13802 else /* puSrc->ai8[i] > 0 */
13803 puDst->ai8[i] = uSrc1.ai8[i];
13804 }
13805}
13806
13807
13808IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13809{
13810 RTUINT64U uSrc1 = { *puDst };
13811 RTUINT64U uSrc2 = { *puSrc };
13812 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13813
13814 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13815 {
13816 if (uSrc2.ai16[i] < 0)
13817 uDst.ai16[i] = -uSrc1.ai16[i];
13818 else if (uSrc2.ai16[i] == 0)
13819 uDst.ai16[i] = 0;
13820 else /* uSrc2.ai16[i] > 0 */
13821 uDst.ai16[i] = uSrc1.ai16[i];
13822 }
13823
13824 *puDst = uDst.u;
13825}
13826
13827
13828IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13829{
13830 RTUINT128U uSrc1 = *puDst;
13831
13832 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13833 {
13834 if (puSrc->ai16[i] < 0)
13835 puDst->ai16[i] = -uSrc1.ai16[i];
13836 else if (puSrc->ai16[i] == 0)
13837 puDst->ai16[i] = 0;
13838 else /* puSrc->ai16[i] > 0 */
13839 puDst->ai16[i] = uSrc1.ai16[i];
13840 }
13841}
13842
13843
13844IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13845{
13846 RTUINT64U uSrc1 = { *puDst };
13847 RTUINT64U uSrc2 = { *puSrc };
13848 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13849
13850 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13851 {
13852 if (uSrc2.ai32[i] < 0)
13853 uDst.ai32[i] = -uSrc1.ai32[i];
13854 else if (uSrc2.ai32[i] == 0)
13855 uDst.ai32[i] = 0;
13856 else /* uSrc2.ai32[i] > 0 */
13857 uDst.ai32[i] = uSrc1.ai32[i];
13858 }
13859
13860 *puDst = uDst.u;
13861}
13862
13863
13864IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13865{
13866 RTUINT128U uSrc1 = *puDst;
13867
13868 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13869 {
13870 if (puSrc->ai32[i] < 0)
13871 puDst->ai32[i] = -uSrc1.ai32[i];
13872 else if (puSrc->ai32[i] == 0)
13873 puDst->ai32[i] = 0;
13874 else /* puSrc->ai32[i] > 0 */
13875 puDst->ai32[i] = uSrc1.ai32[i];
13876 }
13877}
13878
13879
13880IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13881{
13882 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13883 {
13884 if (puSrc2->ai8[i] < 0)
13885 puDst->ai8[i] = -puSrc1->ai8[i];
13886 else if (puSrc2->ai8[i] == 0)
13887 puDst->ai8[i] = 0;
13888 else /* puSrc2->ai8[i] > 0 */
13889 puDst->ai8[i] = puSrc1->ai8[i];
13890 }
13891}
13892
13893
13894IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13895{
13896 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13897 {
13898 if (puSrc2->ai8[i] < 0)
13899 puDst->ai8[i] = -puSrc1->ai8[i];
13900 else if (puSrc2->ai8[i] == 0)
13901 puDst->ai8[i] = 0;
13902 else /* puSrc2->ai8[i] > 0 */
13903 puDst->ai8[i] = puSrc1->ai8[i];
13904 }
13905}
13906
13907
13908IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13909{
13910 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13911 {
13912 if (puSrc2->ai16[i] < 0)
13913 puDst->ai16[i] = -puSrc1->ai16[i];
13914 else if (puSrc2->ai16[i] == 0)
13915 puDst->ai16[i] = 0;
13916 else /* puSrc2->ai16[i] > 0 */
13917 puDst->ai16[i] = puSrc1->ai16[i];
13918 }
13919}
13920
13921
13922IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13923{
13924 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13925 {
13926 if (puSrc2->ai16[i] < 0)
13927 puDst->ai16[i] = -puSrc1->ai16[i];
13928 else if (puSrc2->ai16[i] == 0)
13929 puDst->ai16[i] = 0;
13930 else /* puSrc2->ai16[i] > 0 */
13931 puDst->ai16[i] = puSrc1->ai16[i];
13932 }
13933}
13934
13935
13936IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13937{
13938 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13939 {
13940 if (puSrc2->ai32[i] < 0)
13941 puDst->ai32[i] = -puSrc1->ai32[i];
13942 else if (puSrc2->ai32[i] == 0)
13943 puDst->ai32[i] = 0;
13944 else /* puSrc2->ai32[i] > 0 */
13945 puDst->ai32[i] = puSrc1->ai32[i];
13946 }
13947}
13948
13949
13950IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13951{
13952 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13953 {
13954 if (puSrc2->ai32[i] < 0)
13955 puDst->ai32[i] = -puSrc1->ai32[i];
13956 else if (puSrc2->ai32[i] == 0)
13957 puDst->ai32[i] = 0;
13958 else /* puSrc2->ai32[i] > 0 */
13959 puDst->ai32[i] = puSrc1->ai32[i];
13960 }
13961}
13962
13963
13964/*
13965 * PHADDW / VPHADDW / PHADDD / VPHADDD
13966 */
13967IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13968{
13969 RTUINT64U uSrc1 = { *puDst };
13970 RTUINT64U uSrc2 = { *puSrc };
13971 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13972
13973 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13974 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13975 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13976 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13977 *puDst = uDst.u;
13978}
13979
13980
13981IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13982{
13983 RTUINT128U uSrc1 = *puDst;
13984
13985 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13986 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13987 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13988 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13989
13990 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13991 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13992 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13993 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13994}
13995
13996
13997IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13998{
13999 RTUINT64U uSrc1 = { *puDst };
14000 RTUINT64U uSrc2 = { *puSrc };
14001 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14002
14003 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14004 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14005 *puDst = uDst.u;
14006}
14007
14008
14009IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14010{
14011 RTUINT128U uSrc1 = *puDst;
14012
14013 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14014 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14015
14016 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14017 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14018}
14019
14020
14021IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14022{
14023 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14024
14025 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14026 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14027 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14028 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14029
14030 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14031 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14032 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14033 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14034
14035 puDst->au64[0] = uDst.au64[0];
14036 puDst->au64[1] = uDst.au64[1];
14037}
14038
14039
14040IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14041{
14042 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14043
14044 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14045 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14046 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14047 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14048 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14049 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14050 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14051 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14052
14053 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14054 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14055 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14056 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14057 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14058 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14059 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14060 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14061
14062 puDst->au64[0] = uDst.au64[0];
14063 puDst->au64[1] = uDst.au64[1];
14064 puDst->au64[2] = uDst.au64[2];
14065 puDst->au64[3] = uDst.au64[3];
14066}
14067
14068
14069IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14070{
14071 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14072
14073 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14074 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14075
14076 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14077 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14078
14079 puDst->au64[0] = uDst.au64[0];
14080 puDst->au64[1] = uDst.au64[1];
14081}
14082
14083
14084IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14085{
14086 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14087
14088 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14089 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14090 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14091 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14092
14093 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14094 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14095 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14096 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14097
14098 puDst->au64[0] = uDst.au64[0];
14099 puDst->au64[1] = uDst.au64[1];
14100 puDst->au64[2] = uDst.au64[2];
14101 puDst->au64[3] = uDst.au64[3];
14102}
14103
14104
14105/*
14106 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14107 */
14108IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14109{
14110 RTUINT64U uSrc1 = { *puDst };
14111 RTUINT64U uSrc2 = { *puSrc };
14112 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14113
14114 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14115 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14116 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14117 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14118 *puDst = uDst.u;
14119}
14120
14121
14122IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14123{
14124 RTUINT128U uSrc1 = *puDst;
14125
14126 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14127 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14128 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14129 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14130
14131 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14132 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14133 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14134 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14135}
14136
14137
14138IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14139{
14140 RTUINT64U uSrc1 = { *puDst };
14141 RTUINT64U uSrc2 = { *puSrc };
14142 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14143
14144 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14145 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14146 *puDst = uDst.u;
14147}
14148
14149
14150IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14151{
14152 RTUINT128U uSrc1 = *puDst;
14153
14154 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14155 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14156
14157 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14158 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14159}
14160
14161
14162IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14163{
14164 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14165
14166 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14167 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14168 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14169 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14170
14171 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14172 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14173 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14174 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14175
14176 puDst->au64[0] = uDst.au64[0];
14177 puDst->au64[1] = uDst.au64[1];
14178}
14179
14180
14181IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14182{
14183 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14184
14185 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14186 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14187 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14188 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14189 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14190 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14191 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14192 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14193
14194 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14195 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14196 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14197 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14198 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14199 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14200 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14201 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14202
14203 puDst->au64[0] = uDst.au64[0];
14204 puDst->au64[1] = uDst.au64[1];
14205 puDst->au64[2] = uDst.au64[2];
14206 puDst->au64[3] = uDst.au64[3];
14207}
14208
14209
14210IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14211{
14212 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14213
14214 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14215 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14216
14217 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14218 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14219
14220 puDst->au64[0] = uDst.au64[0];
14221 puDst->au64[1] = uDst.au64[1];
14222}
14223
14224
14225IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14226{
14227 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14228
14229 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14230 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14231 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14232 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14233
14234 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14235 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14236 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14237 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14238
14239 puDst->au64[0] = uDst.au64[0];
14240 puDst->au64[1] = uDst.au64[1];
14241 puDst->au64[2] = uDst.au64[2];
14242 puDst->au64[3] = uDst.au64[3];
14243}
14244
14245
14246/*
14247 * PHADDSW / VPHADDSW
14248 */
14249IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14250{
14251 RTUINT64U uSrc1 = { *puDst };
14252 RTUINT64U uSrc2 = { *puSrc };
14253 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14254
14255 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14256 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14257 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14258 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14259 *puDst = uDst.u;
14260}
14261
14262
14263IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14264{
14265 RTUINT128U uSrc1 = *puDst;
14266
14267 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14268 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14269 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14270 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14271
14272 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14273 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14274 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14275 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14276}
14277
14278
14279IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14280{
14281 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14282
14283 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14284 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14285 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14286 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14287
14288 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14289 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14290 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14291 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14292
14293 puDst->au64[0] = uDst.au64[0];
14294 puDst->au64[1] = uDst.au64[1];
14295}
14296
14297
14298IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14299{
14300 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14301
14302 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14303 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14304 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14305 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14306 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14307 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14308 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14309 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14310
14311 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14312 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14313 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14314 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14315 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14316 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14317 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14318 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14319
14320 puDst->au64[0] = uDst.au64[0];
14321 puDst->au64[1] = uDst.au64[1];
14322 puDst->au64[2] = uDst.au64[2];
14323 puDst->au64[3] = uDst.au64[3];
14324}
14325
14326
14327/*
14328 * PHSUBSW / VPHSUBSW
14329 */
14330IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14331{
14332 RTUINT64U uSrc1 = { *puDst };
14333 RTUINT64U uSrc2 = { *puSrc };
14334 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14335
14336 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14337 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14338 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14339 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14340 *puDst = uDst.u;
14341}
14342
14343
14344IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14345{
14346 RTUINT128U uSrc1 = *puDst;
14347
14348 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14349 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14350 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14351 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14352
14353 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14354 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14355 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14356 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14357}
14358
14359
14360IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14361{
14362 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14363
14364 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14365 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14366 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14367 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14368
14369 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14370 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14371 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14372 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14373
14374 puDst->au64[0] = uDst.au64[0];
14375 puDst->au64[1] = uDst.au64[1];
14376}
14377
14378
14379IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14380{
14381 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14382
14383 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14384 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14385 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14386 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14387 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14388 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14389 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14390 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14391
14392 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14393 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14394 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14395 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14396 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14397 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14398 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14399 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14400
14401 puDst->au64[0] = uDst.au64[0];
14402 puDst->au64[1] = uDst.au64[1];
14403 puDst->au64[2] = uDst.au64[2];
14404 puDst->au64[3] = uDst.au64[3];
14405}
14406
14407
14408/*
14409 * PMADDUBSW / VPMADDUBSW
14410 */
14411IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14412{
14413 RTUINT64U uSrc1 = { *puDst };
14414 RTUINT64U uSrc2 = { *puSrc };
14415 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14416
14417 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14418 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14419 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14420 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14421 *puDst = uDst.u;
14422}
14423
14424
14425IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14426{
14427 RTUINT128U uSrc1 = *puDst;
14428
14429 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14430 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14431 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14432 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14433 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14434 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14435 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14436 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14437}
14438
14439
14440IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14441{
14442 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14443
14444 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14445 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14446 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14447 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14448 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14449 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14450 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14451 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14452
14453 puDst->au64[0] = uDst.au64[0];
14454 puDst->au64[1] = uDst.au64[1];
14455}
14456
14457
14458IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14459{
14460 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14461
14462 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14463 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14464 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14465 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14466 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14467 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14468 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14469 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14470 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14471 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14472 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14473 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14474 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14475 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14476 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14477 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14478
14479 puDst->au64[0] = uDst.au64[0];
14480 puDst->au64[1] = uDst.au64[1];
14481 puDst->au64[2] = uDst.au64[2];
14482 puDst->au64[3] = uDst.au64[3];
14483}
14484
14485
14486/*
14487 * PMULHRSW / VPMULHRSW
14488 */
/** Computes one PMULHRSW result word: multiplies the two operands as 32-bit
 * signed values, drops the low 14 bits of the product, adds one for rounding
 * and drops one more bit; the value is then truncated to 16 bits. */
#define DO_PMULHRSW(a_iWord1, a_iWord2) \
    ( (uint16_t)( ( ( ((int32_t)(a_iWord1) * (a_iWord2)) >> 14 ) + 1 ) >> 1 ) )
14491
14492IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14493{
14494 RTUINT64U uSrc1 = { *puDst };
14495 RTUINT64U uSrc2 = { *puSrc };
14496 RTUINT64U uDst;
14497
14498 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14499 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14500 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14501 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14502 *puDst = uDst.u;
14503}
14504
14505
14506IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14507{
14508 RTUINT128U uSrc1 = *puDst;
14509
14510 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14511 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14512 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14513 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14514 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14515 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14516 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14517 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14518}
14519
14520
14521IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14522{
14523 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14524
14525 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14526 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14527 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14528 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14529 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14530 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14531 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14532 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14533
14534 puDst->au64[0] = uDst.au64[0];
14535 puDst->au64[1] = uDst.au64[1];
14536}
14537
14538
14539IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14540{
14541 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14542
14543 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14544 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14545 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14546 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14547 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14548 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14549 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14550 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14551 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14552 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14553 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14554 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14555 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14556 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14557 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14558 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14559
14560 puDst->au64[0] = uDst.au64[0];
14561 puDst->au64[1] = uDst.au64[1];
14562 puDst->au64[2] = uDst.au64[2];
14563 puDst->au64[3] = uDst.au64[3];
14564}
14565
14566
14567/*
14568 * PSADBW / VPSADBW
14569 */
14570#ifdef IEM_WITHOUT_ASSEMBLY
14571
14572IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14573{
14574 RTUINT64U uSrc1 = { *puDst };
14575 RTUINT64U uSrc2 = { *puSrc };
14576 RTUINT64U uDst;
14577 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14578 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14579 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14580 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14581 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14582 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14583 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14584 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14585
14586 uDst.au64[0] = 0;
14587 uDst.au16[0] = uSum;
14588 *puDst = uDst.u;
14589}
14590
14591
14592IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14593{
14594 RTUINT128U uSrc1 = *puDst;
14595
14596 puDst->au64[0] = 0;
14597 puDst->au64[1] = 0;
14598
14599 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14600 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14601 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14602 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14603 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14604 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14605 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14606 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14607 puDst->au16[0] = uSum;
14608
14609 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14610 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14611 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14612 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14613 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14614 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14615 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14616 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14617 puDst->au16[4] = uSum;
14618}
14619
14620#endif
14621
14622IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14623{
14624 RTUINT128U uSrc1 = *puSrc1;
14625 RTUINT128U uSrc2 = *puSrc2;
14626
14627 puDst->au64[0] = 0;
14628 puDst->au64[1] = 0;
14629
14630 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14631 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14632 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14633 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14634 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14635 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14636 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14637 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14638 puDst->au16[0] = uSum;
14639
14640 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14641 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14642 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14643 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14644 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14645 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14646 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14647 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14648 puDst->au16[4] = uSum;
14649}
14650
14651IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14652{
14653 RTUINT256U uSrc1 = *puSrc1;
14654 RTUINT256U uSrc2 = *puSrc2;
14655
14656 puDst->au64[0] = 0;
14657 puDst->au64[1] = 0;
14658 puDst->au64[2] = 0;
14659 puDst->au64[3] = 0;
14660
14661 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14662 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14663 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14664 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14665 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14666 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14667 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14668 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14669 puDst->au16[0] = uSum;
14670
14671 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14672 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14673 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14674 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14675 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14676 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14677 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14678 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14679 puDst->au16[4] = uSum;
14680
14681 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14682 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14683 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14684 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14685 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14686 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14687 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14688 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14689 puDst->au16[8] = uSum;
14690
14691 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14692 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14693 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14694 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14695 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14696 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14697 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14698 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14699 puDst->au16[12] = uSum;
14700}
14701
14702
14703/*
14704 * PMULDQ / VPMULDQ
14705 */
14706IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14707{
14708 RTUINT128U uSrc1 = *puDst;
14709
14710 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14711 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14712}
14713
14714IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14715{
14716 RTUINT128U uSrc1 = *puSrc1;
14717 RTUINT128U uSrc2 = *puSrc2;
14718
14719 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14720 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14721}
14722
14723IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14724{
14725 RTUINT256U uSrc1 = *puSrc1;
14726 RTUINT256U uSrc2 = *puSrc2;
14727
14728 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14729 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14730 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14731 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14732}
14733
14734
14735/*
14736 * PMULUDQ / VPMULUDQ
14737 */
14738#ifdef IEM_WITHOUT_ASSEMBLY
14739
14740IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(uint64_t *puDst, uint64_t const *puSrc))
14741{
14742 RTUINT64U uSrc1 = { *puDst };
14743 RTUINT64U uSrc2 = { *puSrc };
14744 ASMCompilerBarrier();
14745 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14746}
14747
14748
14749IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14750{
14751 RTUINT128U uSrc1 = *puDst;
14752 RTUINT128U uSrc2 = *puSrc;
14753 ASMCompilerBarrier();
14754 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14755 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14756}
14757
14758#endif
14759
14760IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14761{
14762 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14763 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14764 ASMCompilerBarrier();
14765 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14766 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14767}
14768
14769
14770IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14771{
14772 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14773 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14774 ASMCompilerBarrier();
14775 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14776 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14777 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14778 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14779}
14780
14781
14782/*
14783 * UNPCKLPS / VUNPCKLPS
14784 */
14785#ifdef IEM_WITHOUT_ASSEMBLY
14786IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14787{
14788 RTUINT128U uSrc1 = *puDst;
14789 RTUINT128U uSrc2 = *puSrc;
14790 ASMCompilerBarrier();
14791 puDst->au32[0] = uSrc1.au32[0];
14792 puDst->au32[1] = uSrc2.au32[0];
14793 puDst->au32[2] = uSrc1.au32[1];
14794 puDst->au32[3] = uSrc2.au32[1];
14795}
14796
14797#endif
14798
14799IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14800{
14801 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14802 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14803 ASMCompilerBarrier();
14804 puDst->au32[0] = uSrc1.au32[0];
14805 puDst->au32[1] = uSrc2.au32[0];
14806 puDst->au32[2] = uSrc1.au32[1];
14807 puDst->au32[3] = uSrc2.au32[1];
14808}
14809
14810
14811IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14812{
14813 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14814 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14815 ASMCompilerBarrier();
14816 puDst->au32[0] = uSrc1.au32[0];
14817 puDst->au32[1] = uSrc2.au32[0];
14818 puDst->au32[2] = uSrc1.au32[1];
14819 puDst->au32[3] = uSrc2.au32[1];
14820
14821 puDst->au32[4] = uSrc1.au32[4];
14822 puDst->au32[5] = uSrc2.au32[4];
14823 puDst->au32[6] = uSrc1.au32[5];
14824 puDst->au32[7] = uSrc2.au32[5];
14825}
14826
14827
14828/*
14829 * UNPCKLPD / VUNPCKLPD
14830 */
14831#ifdef IEM_WITHOUT_ASSEMBLY
14832IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14833{
14834 RTUINT128U uSrc1 = *puDst;
14835 RTUINT128U uSrc2 = *puSrc;
14836 ASMCompilerBarrier();
14837 puDst->au64[0] = uSrc1.au64[0];
14838 puDst->au64[1] = uSrc2.au64[0];
14839}
14840
14841#endif
14842
14843IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14844{
14845 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14846 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14847 ASMCompilerBarrier();
14848 puDst->au64[0] = uSrc1.au64[0];
14849 puDst->au64[1] = uSrc2.au64[0];
14850}
14851
14852
14853IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14854{
14855 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14856 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14857 ASMCompilerBarrier();
14858 puDst->au64[0] = uSrc1.au64[0];
14859 puDst->au64[1] = uSrc2.au64[0];
14860 puDst->au64[2] = uSrc1.au64[2];
14861 puDst->au64[3] = uSrc2.au64[2];
14862}
14863
14864
14865/*
14866 * UNPCKHPS / VUNPCKHPS
14867 */
14868#ifdef IEM_WITHOUT_ASSEMBLY
14869IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14870{
14871 RTUINT128U uSrc1 = *puDst;
14872 RTUINT128U uSrc2 = *puSrc;
14873 ASMCompilerBarrier();
14874 puDst->au32[0] = uSrc1.au32[2];
14875 puDst->au32[1] = uSrc2.au32[2];
14876 puDst->au32[2] = uSrc1.au32[3];
14877 puDst->au32[3] = uSrc2.au32[3];
14878}
14879
14880#endif
14881
14882IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14883{
14884 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14885 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14886 ASMCompilerBarrier();
14887 puDst->au32[0] = uSrc1.au32[2];
14888 puDst->au32[1] = uSrc2.au32[2];
14889 puDst->au32[2] = uSrc1.au32[3];
14890 puDst->au32[3] = uSrc2.au32[3];
14891}
14892
14893
14894IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14895{
14896 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14897 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14898 ASMCompilerBarrier();
14899 puDst->au32[0] = uSrc1.au32[2];
14900 puDst->au32[1] = uSrc2.au32[2];
14901 puDst->au32[2] = uSrc1.au32[3];
14902 puDst->au32[3] = uSrc2.au32[3];
14903
14904 puDst->au32[4] = uSrc1.au32[6];
14905 puDst->au32[5] = uSrc2.au32[6];
14906 puDst->au32[6] = uSrc1.au32[7];
14907 puDst->au32[7] = uSrc2.au32[7];
14908}
14909
14910
14911/*
14912 * UNPCKHPD / VUNPCKHPD
14913 */
14914#ifdef IEM_WITHOUT_ASSEMBLY
14915IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14916{
14917 RTUINT128U uSrc1 = *puDst;
14918 RTUINT128U uSrc2 = *puSrc;
14919 ASMCompilerBarrier();
14920 puDst->au64[0] = uSrc1.au64[1];
14921 puDst->au64[1] = uSrc2.au64[1];
14922}
14923
14924#endif
14925
14926IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14927{
14928 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14929 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14930 ASMCompilerBarrier();
14931 puDst->au64[0] = uSrc1.au64[1];
14932 puDst->au64[1] = uSrc2.au64[1];
14933}
14934
14935
14936IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14937{
14938 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14939 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14940 ASMCompilerBarrier();
14941 puDst->au64[0] = uSrc1.au64[1];
14942 puDst->au64[1] = uSrc2.au64[1];
14943 puDst->au64[2] = uSrc1.au64[3];
14944 puDst->au64[3] = uSrc2.au64[3];
14945}
14946
14947
14948/*
14949 * CRC32 (SSE 4.2).
14950 */
14951
14952IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14953{
14954 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14955}
14956
14957
14958IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14959{
14960 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14961}
14962
14963IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14964{
14965 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14966}
14967
14968IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14969{
14970 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14971}
14972
14973
14974/*
14975 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
14976 */
14977#ifdef IEM_WITHOUT_ASSEMBLY
14978IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14979{
14980 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14981 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14982 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14983 fEfl |= X86_EFL_ZF;
14984 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14985 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14986 fEfl |= X86_EFL_CF;
14987 *pfEFlags = fEfl;
14988}
14989#endif
14990
14991IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14992{
14993 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14994 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14995 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14996 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14997 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14998 fEfl |= X86_EFL_ZF;
14999 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15000 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15001 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15002 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15003 fEfl |= X86_EFL_CF;
15004 *pfEFlags = fEfl;
15005}
15006
15007
15008/*
15009 * PMOVSXBW / VPMOVSXBW
15010 */
15011IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15012{
15013 RTUINT64U uSrc1 = { uSrc };
15014 puDst->ai16[0] = uSrc1.ai8[0];
15015 puDst->ai16[1] = uSrc1.ai8[1];
15016 puDst->ai16[2] = uSrc1.ai8[2];
15017 puDst->ai16[3] = uSrc1.ai8[3];
15018 puDst->ai16[4] = uSrc1.ai8[4];
15019 puDst->ai16[5] = uSrc1.ai8[5];
15020 puDst->ai16[6] = uSrc1.ai8[6];
15021 puDst->ai16[7] = uSrc1.ai8[7];
15022}
15023
15024
15025IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15026{
15027 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15028 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15029 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15030 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15031 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15032 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15033 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15034 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15035 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15036 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15037 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15038 puDst->ai16[10] = uSrc1.ai8[10];
15039 puDst->ai16[11] = uSrc1.ai8[11];
15040 puDst->ai16[12] = uSrc1.ai8[12];
15041 puDst->ai16[13] = uSrc1.ai8[13];
15042 puDst->ai16[14] = uSrc1.ai8[14];
15043 puDst->ai16[15] = uSrc1.ai8[15];
15044}
15045
15046
15047/*
15048 * PMOVSXBD / VPMOVSXBD
15049 */
15050IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15051{
15052 RTUINT32U uSrc1 = { uSrc };
15053 puDst->ai32[0] = uSrc1.ai8[0];
15054 puDst->ai32[1] = uSrc1.ai8[1];
15055 puDst->ai32[2] = uSrc1.ai8[2];
15056 puDst->ai32[3] = uSrc1.ai8[3];
15057}
15058
15059
15060IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15061{
15062 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15063 puDst->ai32[0] = uSrc1.ai8[0];
15064 puDst->ai32[1] = uSrc1.ai8[1];
15065 puDst->ai32[2] = uSrc1.ai8[2];
15066 puDst->ai32[3] = uSrc1.ai8[3];
15067 puDst->ai32[4] = uSrc1.ai8[4];
15068 puDst->ai32[5] = uSrc1.ai8[5];
15069 puDst->ai32[6] = uSrc1.ai8[6];
15070 puDst->ai32[7] = uSrc1.ai8[7];
15071}
15072
15073
15074/*
15075 * PMOVSXBQ / VPMOVSXBQ
15076 */
15077IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15078{
15079 RTUINT16U uSrc1 = { uSrc };
15080 puDst->ai64[0] = uSrc1.ai8[0];
15081 puDst->ai64[1] = uSrc1.ai8[1];
15082}
15083
15084
15085IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15086{
15087 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15088 puDst->ai64[0] = uSrc1.ai8[0];
15089 puDst->ai64[1] = uSrc1.ai8[1];
15090 puDst->ai64[2] = uSrc1.ai8[2];
15091 puDst->ai64[3] = uSrc1.ai8[3];
15092}
15093
15094
15095/*
15096 * PMOVSXWD / VPMOVSXWD
15097 */
15098IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15099{
15100 RTUINT64U uSrc1 = { uSrc };
15101 puDst->ai32[0] = uSrc1.ai16[0];
15102 puDst->ai32[1] = uSrc1.ai16[1];
15103 puDst->ai32[2] = uSrc1.ai16[2];
15104 puDst->ai32[3] = uSrc1.ai16[3];
15105}
15106
15107
15108IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15109{
15110 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15111 puDst->ai32[0] = uSrc1.ai16[0];
15112 puDst->ai32[1] = uSrc1.ai16[1];
15113 puDst->ai32[2] = uSrc1.ai16[2];
15114 puDst->ai32[3] = uSrc1.ai16[3];
15115 puDst->ai32[4] = uSrc1.ai16[4];
15116 puDst->ai32[5] = uSrc1.ai16[5];
15117 puDst->ai32[6] = uSrc1.ai16[6];
15118 puDst->ai32[7] = uSrc1.ai16[7];
15119}
15120
15121
15122/*
15123 * PMOVSXWQ / VPMOVSXWQ
15124 */
15125IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15126{
15127 RTUINT32U uSrc1 = { uSrc };
15128 puDst->ai64[0] = uSrc1.ai16[0];
15129 puDst->ai64[1] = uSrc1.ai16[1];
15130}
15131
15132
15133IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15134{
15135 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15136 puDst->ai64[0] = uSrc1.ai16[0];
15137 puDst->ai64[1] = uSrc1.ai16[1];
15138 puDst->ai64[2] = uSrc1.ai16[2];
15139 puDst->ai64[3] = uSrc1.ai16[3];
15140}
15141
15142
15143/*
15144 * PMOVSXDQ / VPMOVSXDQ
15145 */
15146IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15147{
15148 RTUINT64U uSrc1 = { uSrc };
15149 puDst->ai64[0] = uSrc1.ai32[0];
15150 puDst->ai64[1] = uSrc1.ai32[1];
15151}
15152
15153
15154IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15155{
15156 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15157 puDst->ai64[0] = uSrc1.ai32[0];
15158 puDst->ai64[1] = uSrc1.ai32[1];
15159 puDst->ai64[2] = uSrc1.ai32[2];
15160 puDst->ai64[3] = uSrc1.ai32[3];
15161}
15162
15163
15164/*
15165 * PMOVZXBW / VPMOVZXBW
15166 */
15167IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15168{
15169 RTUINT64U uSrc1 = { uSrc };
15170 puDst->au16[0] = uSrc1.au8[0];
15171 puDst->au16[1] = uSrc1.au8[1];
15172 puDst->au16[2] = uSrc1.au8[2];
15173 puDst->au16[3] = uSrc1.au8[3];
15174 puDst->au16[4] = uSrc1.au8[4];
15175 puDst->au16[5] = uSrc1.au8[5];
15176 puDst->au16[6] = uSrc1.au8[6];
15177 puDst->au16[7] = uSrc1.au8[7];
15178}
15179
15180
15181IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15182{
15183 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15184 puDst->au16[ 0] = uSrc1.au8[ 0];
15185 puDst->au16[ 1] = uSrc1.au8[ 1];
15186 puDst->au16[ 2] = uSrc1.au8[ 2];
15187 puDst->au16[ 3] = uSrc1.au8[ 3];
15188 puDst->au16[ 4] = uSrc1.au8[ 4];
15189 puDst->au16[ 5] = uSrc1.au8[ 5];
15190 puDst->au16[ 6] = uSrc1.au8[ 6];
15191 puDst->au16[ 7] = uSrc1.au8[ 7];
15192 puDst->au16[ 8] = uSrc1.au8[ 8];
15193 puDst->au16[ 9] = uSrc1.au8[ 9];
15194 puDst->au16[10] = uSrc1.au8[10];
15195 puDst->au16[11] = uSrc1.au8[11];
15196 puDst->au16[12] = uSrc1.au8[12];
15197 puDst->au16[13] = uSrc1.au8[13];
15198 puDst->au16[14] = uSrc1.au8[14];
15199 puDst->au16[15] = uSrc1.au8[15];
15200}
15201
15202
15203/*
15204 * PMOVZXBD / VPMOVZXBD
15205 */
15206IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15207{
15208 RTUINT32U uSrc1 = { uSrc };
15209 puDst->au32[0] = uSrc1.au8[0];
15210 puDst->au32[1] = uSrc1.au8[1];
15211 puDst->au32[2] = uSrc1.au8[2];
15212 puDst->au32[3] = uSrc1.au8[3];
15213}
15214
15215
15216IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15217{
15218 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15219 puDst->au32[0] = uSrc1.au8[0];
15220 puDst->au32[1] = uSrc1.au8[1];
15221 puDst->au32[2] = uSrc1.au8[2];
15222 puDst->au32[3] = uSrc1.au8[3];
15223 puDst->au32[4] = uSrc1.au8[4];
15224 puDst->au32[5] = uSrc1.au8[5];
15225 puDst->au32[6] = uSrc1.au8[6];
15226 puDst->au32[7] = uSrc1.au8[7];
15227}
15228
15229
15230/*
15231 * PMOVZXBQ / VPMOVZXBQ
15232 */
15233IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15234{
15235 RTUINT16U uSrc1 = { uSrc };
15236 puDst->au64[0] = uSrc1.au8[0];
15237 puDst->au64[1] = uSrc1.au8[1];
15238}
15239
15240
15241IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15242{
15243 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15244 puDst->au64[0] = uSrc1.au8[0];
15245 puDst->au64[1] = uSrc1.au8[1];
15246 puDst->au64[2] = uSrc1.au8[2];
15247 puDst->au64[3] = uSrc1.au8[3];
15248}
15249
15250
15251/*
15252 * PMOVZXWD / VPMOVZXWD
15253 */
15254IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15255{
15256 RTUINT64U uSrc1 = { uSrc };
15257 puDst->au32[0] = uSrc1.au16[0];
15258 puDst->au32[1] = uSrc1.au16[1];
15259 puDst->au32[2] = uSrc1.au16[2];
15260 puDst->au32[3] = uSrc1.au16[3];
15261}
15262
15263
15264IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15265{
15266 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15267 puDst->au32[0] = uSrc1.au16[0];
15268 puDst->au32[1] = uSrc1.au16[1];
15269 puDst->au32[2] = uSrc1.au16[2];
15270 puDst->au32[3] = uSrc1.au16[3];
15271 puDst->au32[4] = uSrc1.au16[4];
15272 puDst->au32[5] = uSrc1.au16[5];
15273 puDst->au32[6] = uSrc1.au16[6];
15274 puDst->au32[7] = uSrc1.au16[7];
15275}
15276
15277
15278/*
15279 * PMOVZXWQ / VPMOVZXWQ
15280 */
15281IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15282{
15283 RTUINT32U uSrc1 = { uSrc };
15284 puDst->au64[0] = uSrc1.au16[0];
15285 puDst->au64[1] = uSrc1.au16[1];
15286}
15287
15288
15289IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15290{
15291 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15292 puDst->au64[0] = uSrc1.au16[0];
15293 puDst->au64[1] = uSrc1.au16[1];
15294 puDst->au64[2] = uSrc1.au16[2];
15295 puDst->au64[3] = uSrc1.au16[3];
15296}
15297
15298
15299/*
15300 * PMOVZXDQ / VPMOVZXDQ
15301 */
15302IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15303{
15304 RTUINT64U uSrc1 = { uSrc };
15305 puDst->au64[0] = uSrc1.au32[0];
15306 puDst->au64[1] = uSrc1.au32[1];
15307}
15308
15309
15310IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15311{
15312 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15313 puDst->au64[0] = uSrc1.au32[0];
15314 puDst->au64[1] = uSrc1.au32[1];
15315 puDst->au64[2] = uSrc1.au32[2];
15316 puDst->au64[3] = uSrc1.au32[3];
15317}
15318
15319/**
15320 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15321 * the SoftFloat 32-bit floating point format (float32_t).
15322 *
15323 * This is only a structure format conversion, nothing else.
15324 */
15325DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15326{
15327 float32_t Tmp;
15328 Tmp.v = pr32Val->u;
15329 return Tmp;
15330}
15331
15332
15333/**
15334 * Converts from SoftFloat 32-bit floating point format (float32_t)
15335 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15336 *
15337 * This is only a structure format conversion, nothing else.
15338 */
15339DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15340{
15341 pr32Dst->u = r32XSrc.v;
15342 return pr32Dst;
15343}
15344
15345
15346/**
15347 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15348 * the SoftFloat 64-bit floating point format (float64_t).
15349 *
15350 * This is only a structure format conversion, nothing else.
15351 */
15352DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15353{
15354 float64_t Tmp;
15355 Tmp.v = pr64Val->u;
15356 return Tmp;
15357}
15358
15359
15360/**
15361 * Converts from SoftFloat 64-bit floating point format (float64_t)
15362 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15363 *
15364 * This is only a structure format conversion, nothing else.
15365 */
15366DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15367{
15368 pr64Dst->u = r64XSrc.v;
15369 return pr64Dst;
15370}
15371
15372
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * The MXCSR rounding control field selects the SoftFloat rounding mode, and
 * the MXCSR exception mask bits are shifted down into the fourth member.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
        (uint8_t)(  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? softfloat_round_near_even \
                  : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? softfloat_round_max \
                  : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? softfloat_round_min \
                  :                                                           softfloat_round_minMag), \
        0, \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */ \
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
15385
15386#ifdef IEM_WITHOUT_ASSEMBLY
15387
15388/**
15389 * Helper for transfering exception to MXCSR and setting the result value
15390 * accordingly.
15391 *
15392 * @returns Updated MXCSR.
15393 * @param pSoftState The SoftFloat state following the operation.
15394 * @param r32Result The result of the SoftFloat operation.
15395 * @param pr32Result Where to store the result for IEM.
15396 * @param fMxcsr The original MXCSR value.
15397 */
15398DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15399 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15400{
15401 iemFpSoftF32ToIprt(pr32Result, r32Result);
15402
15403 uint8_t fXcpt = pSoftState->exceptionFlags;
15404 if ( (fMxcsr & X86_MXCSR_FZ)
15405 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15406 {
15407 /* Underflow masked and flush to zero is set. */
15408 pr32Result->s.uFraction = 0;
15409 pr32Result->s.uExponent = 0;
15410 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15411 }
15412
15413 /* If DAZ is set \#DE is never set. */
15414 if ( fMxcsr & X86_MXCSR_DAZ
15415 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15416 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15417 fXcpt &= ~X86_MXCSR_DE;
15418
15419 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15420}
15421
15422
15423/**
15424 * Helper for transfering exception to MXCSR and setting the result value
15425 * accordingly - ignores Flush-to-Zero.
15426 *
15427 * @returns Updated MXCSR.
15428 * @param pSoftState The SoftFloat state following the operation.
15429 * @param r32Result The result of the SoftFloat operation.
15430 * @param pr32Result Where to store the result for IEM.
15431 * @param fMxcsr The original MXCSR value.
15432 */
15433DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15434 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15435{
15436 iemFpSoftF32ToIprt(pr32Result, r32Result);
15437
15438 uint8_t fXcpt = pSoftState->exceptionFlags;
15439 /* If DAZ is set \#DE is never set. */
15440 if ( fMxcsr & X86_MXCSR_DAZ
15441 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15442 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15443 fXcpt &= ~X86_MXCSR_DE;
15444
15445 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15446}
15447
15448
15449/**
15450 * Helper for transfering exception to MXCSR and setting the result value
15451 * accordingly.
15452 *
15453 * @returns Updated MXCSR.
15454 * @param pSoftState The SoftFloat state following the operation.
15455 * @param r64Result The result of the SoftFloat operation.
15456 * @param pr64Result Where to store the result for IEM.
15457 * @param fMxcsr The original MXCSR value.
15458 */
15459DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15460 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15461{
15462 iemFpSoftF64ToIprt(pr64Result, r64Result);
15463 uint8_t fXcpt = pSoftState->exceptionFlags;
15464 if ( (fMxcsr & X86_MXCSR_FZ)
15465 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15466 {
15467 /* Underflow masked and flush to zero is set. */
15468 iemFpSoftF64ToIprt(pr64Result, r64Result);
15469 pr64Result->s.uFractionHigh = 0;
15470 pr64Result->s.uFractionLow = 0;
15471 pr64Result->s.uExponent = 0;
15472 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15473 }
15474
15475 /* If DAZ is set \#DE is never set. */
15476 if ( fMxcsr & X86_MXCSR_DAZ
15477 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15478 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15479 fXcpt &= ~X86_MXCSR_DE;
15480
15481 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15482}
15483
15484
15485/**
15486 * Helper for transfering exception to MXCSR and setting the result value
15487 * accordingly - ignores Flush-to-Zero.
15488 *
15489 * @returns Updated MXCSR.
15490 * @param pSoftState The SoftFloat state following the operation.
15491 * @param r64Result The result of the SoftFloat operation.
15492 * @param pr64Result Where to store the result for IEM.
15493 * @param fMxcsr The original MXCSR value.
15494 */
15495DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15496 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15497{
15498 iemFpSoftF64ToIprt(pr64Result, r64Result);
15499
15500 uint8_t fXcpt = pSoftState->exceptionFlags;
15501 /* If DAZ is set \#DE is never set. */
15502 if ( fMxcsr & X86_MXCSR_DAZ
15503 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15504 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15505 fXcpt &= ~X86_MXCSR_DE;
15506
15507 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15508}
15509
15510#endif /* IEM_WITHOUT_ASSEMBLY */
15511
15512
15513/**
15514 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15515 * in MXCSR into account.
15516 *
15517 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15518 * @param pr32Val Where to store the result.
15519 * @param fMxcsr The input MXCSR value.
15520 * @param pr32Src The value to use.
15521 */
15522DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15523{
15524 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15525 {
15526 if (fMxcsr & X86_MXCSR_DAZ)
15527 {
15528 /* De-normals are changed to 0. */
15529 pr32Val->s.fSign = pr32Src->s.fSign;
15530 pr32Val->s.uFraction = 0;
15531 pr32Val->s.uExponent = 0;
15532 return 0;
15533 }
15534
15535 *pr32Val = *pr32Src;
15536 return X86_MXCSR_DE;
15537 }
15538
15539 *pr32Val = *pr32Src;
15540 return 0;
15541}
15542
15543
15544/**
15545 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15546 * in MXCSR into account.
15547 *
15548 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15549 * @param pr64Val Where to store the result.
15550 * @param fMxcsr The input MXCSR value.
15551 * @param pr64Src The value to use.
15552 */
15553DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15554{
15555 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15556 {
15557 if (fMxcsr & X86_MXCSR_DAZ)
15558 {
15559 /* De-normals are changed to 0. */
15560 pr64Val->s64.fSign = pr64Src->s.fSign;
15561 pr64Val->s64.uFraction = 0;
15562 pr64Val->s64.uExponent = 0;
15563 return 0;
15564 }
15565
15566 *pr64Val = *pr64Src;
15567 return X86_MXCSR_DE;
15568 }
15569
15570 *pr64Val = *pr64Src;
15571 return 0;
15572}
15573
15574#ifdef IEM_WITHOUT_ASSEMBLY
15575
15576/**
15577 * Validates the given input operands returning whether the operation can continue or whether one
15578 * of the source operands contains a NaN value, setting the output accordingly.
15579 *
15580 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15581 * @param pr32Res Where to store the result in case the operation can't continue.
15582 * @param pr32Val1 The first input operand.
15583 * @param pr32Val2 The second input operand.
15584 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15585 */
15586DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15587{
15588 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15589 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15590 if (cSNan + cQNan == 2)
15591 {
15592 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15593 *pr32Res = *pr32Val1;
15594 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15595 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15596 return true;
15597 }
15598 if (cSNan)
15599 {
15600 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15601 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15602 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15603 *pfMxcsr |= X86_MXCSR_IE;
15604 return true;
15605 }
15606 if (cQNan)
15607 {
15608 /* The QNan operand is placed into the result. */
15609 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15610 return true;
15611 }
15612
15613 Assert(!cQNan && !cSNan);
15614 return false;
15615}
15616
15617
15618/**
15619 * Validates the given double precision input operands returning whether the operation can continue or whether one
15620 * of the source operands contains a NaN value, setting the output accordingly.
15621 *
15622 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15623 * @param pr64Res Where to store the result in case the operation can't continue.
15624 * @param pr64Val1 The first input operand.
15625 * @param pr64Val2 The second input operand.
15626 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15627 */
15628DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15629{
15630 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15631 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15632 if (cSNan + cQNan == 2)
15633 {
15634 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15635 *pr64Res = *pr64Val1;
15636 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15637 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15638 return true;
15639 }
15640 if (cSNan)
15641 {
15642 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15643 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15644 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15645 *pfMxcsr |= X86_MXCSR_IE;
15646 return true;
15647 }
15648 if (cQNan)
15649 {
15650 /* The QNan operand is placed into the result. */
15651 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15652 return true;
15653 }
15654
15655 Assert(!cQNan && !cSNan);
15656 return false;
15657}
15658
15659
15660/**
15661 * Validates the given single input operand returning whether the operation can continue or whether
15662 * contains a NaN value, setting the output accordingly.
15663 *
15664 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15665 * @param pr32Res Where to store the result in case the operation can't continue.
15666 * @param pr32Val The input operand.
15667 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15668 */
15669DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15670{
15671 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15672 {
15673 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15674 *pr32Res = *pr32Val;
15675 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15676 *pfMxcsr |= X86_MXCSR_IE;
15677 return true;
15678 }
15679 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15680 {
15681 /* The QNan operand is placed into the result. */
15682 *pr32Res = *pr32Val;
15683 return true;
15684 }
15685
15686 return false;
15687}
15688
15689
15690/**
15691 * Validates the given double input operand returning whether the operation can continue or whether
15692 * contains a NaN value, setting the output accordingly.
15693 *
15694 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15695 * @param pr64Res Where to store the result in case the operation can't continue.
15696 * @param pr64Val The input operand.
15697 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15698 */
15699DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15700{
15701 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15702 {
15703 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15704 *pr64Res = *pr64Val;
15705 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15706 *pfMxcsr |= X86_MXCSR_IE;
15707 return true;
15708 }
15709 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15710 {
15711 /* The QNan operand is placed into the result. */
15712 *pr64Res = *pr64Val;
15713 return true;
15714 }
15715
15716 return false;
15717}
15718
15719#endif /* IEM_WITHOUT_ASSEMBLY */
15720
15721/**
15722 * ADDPS
15723 */
15724#ifdef IEM_WITHOUT_ASSEMBLY
15725static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15726{
15727 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15728 return fMxcsr;
15729
15730 RTFLOAT32U r32Src1, r32Src2;
15731 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15732 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15733 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15734 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15735 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15736}
15737
15738
15739IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15740{
15741 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15742 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15743 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15744 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15745}
15746#endif
15747
15748
15749/**
15750 * ADDSS
15751 */
15752#ifdef IEM_WITHOUT_ASSEMBLY
15753IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15754{
15755 pResult->ar32[1] = puSrc1->ar32[1];
15756 pResult->ar32[2] = puSrc1->ar32[2];
15757 pResult->ar32[3] = puSrc1->ar32[3];
15758 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15759}
15760#endif
15761
15762
15763/**
15764 * ADDPD
15765 */
15766#ifdef IEM_WITHOUT_ASSEMBLY
15767static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15768{
15769 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15770 return fMxcsr;
15771
15772 RTFLOAT64U r64Src1, r64Src2;
15773 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15774 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15775 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15776 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15777 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15778}
15779
15780
15781IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15782{
15783 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15784 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15785}
15786#endif
15787
15788
15789/**
15790 * ADDSD
15791 */
15792#ifdef IEM_WITHOUT_ASSEMBLY
15793IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15794{
15795 pResult->ar64[1] = puSrc1->ar64[1];
15796 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15797}
15798#endif
15799
15800
15801/**
15802 * MULPS
15803 */
15804#ifdef IEM_WITHOUT_ASSEMBLY
15805static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15806{
15807 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15808 return fMxcsr;
15809
15810 RTFLOAT32U r32Src1, r32Src2;
15811 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15812 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15813 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15814 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15815 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15816}
15817
15818
15819IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15820{
15821 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15822 | iemAImpl_mulps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15823 | iemAImpl_mulps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15824 | iemAImpl_mulps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15825}
15826#endif
15827
15828
15829/**
15830 * MULSS
15831 */
15832#ifdef IEM_WITHOUT_ASSEMBLY
15833IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15834{
15835 pResult->ar32[1] = puSrc1->ar32[1];
15836 pResult->ar32[2] = puSrc1->ar32[2];
15837 pResult->ar32[3] = puSrc1->ar32[3];
15838 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15839}
15840#endif
15841
15842
15843/**
15844 * MULPD
15845 */
15846#ifdef IEM_WITHOUT_ASSEMBLY
15847static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15848{
15849 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15850 return fMxcsr;
15851
15852 RTFLOAT64U r64Src1, r64Src2;
15853 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15854 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15855 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15856 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15857 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15858}
15859
15860
15861IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15862{
15863 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15864 | iemAImpl_mulpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15865}
15866#endif
15867
15868
15869/**
15870 * MULSD
15871 */
15872#ifdef IEM_WITHOUT_ASSEMBLY
15873IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15874{
15875 pResult->ar64[1] = puSrc1->ar64[1];
15876 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15877}
15878#endif
15879
15880
15881/**
15882 * SUBPS
15883 */
15884#ifdef IEM_WITHOUT_ASSEMBLY
15885static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15886{
15887 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15888 return fMxcsr;
15889
15890 RTFLOAT32U r32Src1, r32Src2;
15891 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15892 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15893 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15894 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15895 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15896}
15897
15898
15899IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15900{
15901 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15902 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15903 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15904 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15905}
15906#endif
15907
15908
15909/**
15910 * SUBSS
15911 */
15912#ifdef IEM_WITHOUT_ASSEMBLY
15913IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15914{
15915 pResult->ar32[1] = puSrc1->ar32[1];
15916 pResult->ar32[2] = puSrc1->ar32[2];
15917 pResult->ar32[3] = puSrc1->ar32[3];
15918 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15919}
15920#endif
15921
15922
15923/**
15924 * SUBPD
15925 */
15926#ifdef IEM_WITHOUT_ASSEMBLY
15927static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15928{
15929 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15930 return fMxcsr;
15931
15932 RTFLOAT64U r64Src1, r64Src2;
15933 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15934 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15935 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15936 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15937 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15938}
15939
15940
15941IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15942{
15943 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15944 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15945}
15946#endif
15947
15948
15949/**
15950 * SUBSD
15951 */
15952#ifdef IEM_WITHOUT_ASSEMBLY
15953IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15954{
15955 pResult->ar64[1] = puSrc1->ar64[1];
15956 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15957}
15958#endif
15959
15960
15961/**
15962 * MINPS
15963 */
15964#ifdef IEM_WITHOUT_ASSEMBLY
15965static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15966{
15967 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15968 {
15969 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15970 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15971 return fMxcsr | X86_MXCSR_IE;
15972 }
15973
15974 RTFLOAT32U r32Src1, r32Src2;
15975 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15976 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15977 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15978 {
15979 *pr32Res = r32Src2;
15980 return fMxcsr;
15981 }
15982
15983 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15984 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15985 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15986 fLe
15987 ? iemFpSoftF32FromIprt(&r32Src1)
15988 : iemFpSoftF32FromIprt(&r32Src2),
15989 pr32Res, fMxcsr);
15990}
15991
15992
15993IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15994{
15995 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15996 | iemAImpl_minps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15997 | iemAImpl_minps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15998 | iemAImpl_minps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15999}
16000#endif
16001
16002
16003/**
16004 * MINSS
16005 */
16006#ifdef IEM_WITHOUT_ASSEMBLY
16007IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16008{
16009 pResult->ar32[1] = puSrc1->ar32[1];
16010 pResult->ar32[2] = puSrc1->ar32[2];
16011 pResult->ar32[3] = puSrc1->ar32[3];
16012 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16013}
16014#endif
16015
16016
16017/**
16018 * MINPD
16019 */
16020#ifdef IEM_WITHOUT_ASSEMBLY
16021static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16022{
16023 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16024 {
16025 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16026 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16027 return fMxcsr | X86_MXCSR_IE;
16028 }
16029
16030 RTFLOAT64U r64Src1, r64Src2;
16031 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16032 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16033 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16034 {
16035 *pr64Res = r64Src2;
16036 return fMxcsr;
16037 }
16038
16039 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16040 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16041 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16042 fLe
16043 ? iemFpSoftF64FromIprt(&r64Src1)
16044 : iemFpSoftF64FromIprt(&r64Src2),
16045 pr64Res, fMxcsr);
16046}
16047
16048
16049IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16050{
16051 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16052 | iemAImpl_minpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16053}
16054#endif
16055
16056
16057/**
16058 * MINSD
16059 */
16060#ifdef IEM_WITHOUT_ASSEMBLY
16061IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16062{
16063 pResult->ar64[1] = puSrc1->ar64[1];
16064 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16065}
16066#endif
16067
16068
16069/**
16070 * DIVPS
16071 */
16072#ifdef IEM_WITHOUT_ASSEMBLY
16073static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16074{
16075 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16076 return fMxcsr;
16077
16078 RTFLOAT32U r32Src1, r32Src2;
16079 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16080 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16081 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16082 {
16083 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16084 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16085 {
16086 *pr32Res = g_ar32QNaN[1];
16087 return fMxcsr | X86_MXCSR_IE;
16088 }
16089 else if (RTFLOAT32U_IS_INF(&r32Src1))
16090 {
16091 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16092 return fMxcsr;
16093 }
16094 else
16095 {
16096 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16097 return fMxcsr | X86_MXCSR_ZE;
16098 }
16099 }
16100
16101 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16102 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16103 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16104}
16105
16106
16107IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16108{
16109 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16110 | iemAImpl_divps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16111 | iemAImpl_divps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16112 | iemAImpl_divps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16113}
16114#endif
16115
16116
16117/**
16118 * DIVSS
16119 */
16120#ifdef IEM_WITHOUT_ASSEMBLY
16121IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16122{
16123 pResult->ar32[1] = puSrc1->ar32[1];
16124 pResult->ar32[2] = puSrc1->ar32[2];
16125 pResult->ar32[3] = puSrc1->ar32[3];
16126 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16127}
16128#endif
16129
16130
16131/**
16132 * DIVPD
16133 */
16134#ifdef IEM_WITHOUT_ASSEMBLY
16135static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16136{
16137 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16138 return fMxcsr;
16139
16140 RTFLOAT64U r64Src1, r64Src2;
16141 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16142 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16143 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16144 {
16145 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16146 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16147 {
16148 *pr64Res = g_ar64QNaN[1];
16149 return fMxcsr | X86_MXCSR_IE;
16150 }
16151 else if (RTFLOAT64U_IS_INF(&r64Src1))
16152 {
16153 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16154 return fMxcsr;
16155 }
16156 else
16157 {
16158 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16159 return fMxcsr | X86_MXCSR_ZE;
16160 }
16161 }
16162
16163 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16164 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16165 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16166}
16167
16168
16169IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16170{
16171 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16172 | iemAImpl_divpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16173}
16174#endif
16175
16176
16177/**
16178 * DIVSD
16179 */
16180#ifdef IEM_WITHOUT_ASSEMBLY
16181IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16182{
16183 pResult->ar64[1] = puSrc1->ar64[1];
16184 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16185}
16186#endif
16187
16188
16189/**
16190 * MAXPS
16191 */
16192#ifdef IEM_WITHOUT_ASSEMBLY
16193static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16194{
16195 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16196 {
16197 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16198 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16199 return fMxcsr | X86_MXCSR_IE;
16200 }
16201
16202 RTFLOAT32U r32Src1, r32Src2;
16203 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16204 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16205 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16206 {
16207 *pr32Res = r32Src2;
16208 return fMxcsr;
16209 }
16210
16211 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16212 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16213 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16214 fLe
16215 ? iemFpSoftF32FromIprt(&r32Src2)
16216 : iemFpSoftF32FromIprt(&r32Src1),
16217 pr32Res, fMxcsr);
16218}
16219
16220
16221IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16222{
16223 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16224 | iemAImpl_maxps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16225 | iemAImpl_maxps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16226 | iemAImpl_maxps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16227}
16228#endif
16229
16230
16231/**
16232 * MAXSS
16233 */
16234#ifdef IEM_WITHOUT_ASSEMBLY
16235IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16236{
16237 pResult->ar32[1] = puSrc1->ar32[1];
16238 pResult->ar32[2] = puSrc1->ar32[2];
16239 pResult->ar32[3] = puSrc1->ar32[3];
16240 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16241}
16242#endif
16243
16244
16245/**
16246 * MAXPD
16247 */
16248#ifdef IEM_WITHOUT_ASSEMBLY
16249static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16250{
16251 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16252 {
16253 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16254 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16255 return fMxcsr | X86_MXCSR_IE;
16256 }
16257
16258 RTFLOAT64U r64Src1, r64Src2;
16259 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16260 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16261 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16262 {
16263 *pr64Res = r64Src2;
16264 return fMxcsr;
16265 }
16266
16267 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16268 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16269 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16270 fLe
16271 ? iemFpSoftF64FromIprt(&r64Src2)
16272 : iemFpSoftF64FromIprt(&r64Src1),
16273 pr64Res, fMxcsr);
16274}
16275
16276
16277IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16278{
16279 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16280 | iemAImpl_maxpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16281}
16282#endif
16283
16284
16285/**
16286 * MAXSD
16287 */
16288#ifdef IEM_WITHOUT_ASSEMBLY
16289IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16290{
16291 pResult->ar64[1] = puSrc1->ar64[1];
16292 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16293}
16294#endif
16295
16296
16297/**
16298 * CVTSS2SD
16299 */
16300#ifdef IEM_WITHOUT_ASSEMBLY
16301static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16302{
16303 RTFLOAT32U r32Src1;
16304 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16305
16306 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16307 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16308 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16309}
16310
16311
16312IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2sd_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16313{
16314 pResult->ar64[1] = puSrc1->ar64[1];
16315 return iemAImpl_cvtss2sd_u128_r32_worker(&pResult->ar64[0], uMxCsrIn, pr32Src2);
16316}
16317#endif
16318
16319
16320/**
16321 * CVTSD2SS
16322 */
16323#ifdef IEM_WITHOUT_ASSEMBLY
16324static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16325{
16326 RTFLOAT64U r64Src1;
16327 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16328
16329 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16330 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16331 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16332}
16333
16334
16335IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2ss_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16336{
16337 pResult->ar32[1] = puSrc1->ar32[1];
16338 pResult->ar32[2] = puSrc1->ar32[2];
16339 pResult->ar32[3] = puSrc1->ar32[3];
16340 return iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->ar32[0], uMxCsrIn, pr64Src2);
16341}
16342#endif
16343
16344
16345/**
16346 * HADDPS
16347 */
16348#ifdef IEM_WITHOUT_ASSEMBLY
16349IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16350{
16351 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16352 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16353 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16354 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16355}
16356#endif
16357
16358
16359/**
16360 * HADDPD
16361 */
16362#ifdef IEM_WITHOUT_ASSEMBLY
16363IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16364{
16365 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16366 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16367}
16368#endif
16369
16370
16371/**
16372 * HSUBPS
16373 */
16374#ifdef IEM_WITHOUT_ASSEMBLY
16375IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16376{
16377 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16378 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16379 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16380 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16381}
16382#endif
16383
16384
16385/**
16386 * HSUBPD
16387 */
16388#ifdef IEM_WITHOUT_ASSEMBLY
16389IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16390{
16391 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16392 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16393}
16394#endif
16395
16396
16397/**
16398 * SQRTPS
16399 */
16400#ifdef IEM_WITHOUT_ASSEMBLY
16401static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16402{
16403 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16404 return fMxcsr;
16405
16406 RTFLOAT32U r32Src;
16407 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16408 if (RTFLOAT32U_IS_ZERO(&r32Src))
16409 {
16410 *pr32Res = r32Src;
16411 return fMxcsr;
16412 }
16413 else if (r32Src.s.fSign)
16414 {
16415 *pr32Res = g_ar32QNaN[1];
16416 return fMxcsr | X86_MXCSR_IE;
16417 }
16418
16419 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16420 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16421 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16422}
16423
16424
16425IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16426{
16427 RT_NOREF(puSrc1);
16428
16429 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16430 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16431 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16432 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16433}
16434#endif
16435
16436
16437/**
16438 * SQRTSS
16439 */
16440#ifdef IEM_WITHOUT_ASSEMBLY
16441IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16442{
16443 pResult->ar32[1] = puSrc1->ar32[1];
16444 pResult->ar32[2] = puSrc1->ar32[2];
16445 pResult->ar32[3] = puSrc1->ar32[3];
16446 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16447}
16448#endif
16449
16450
16451/**
16452 * SQRTPD
16453 */
16454#ifdef IEM_WITHOUT_ASSEMBLY
16455static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16456{
16457 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16458 return fMxcsr;
16459
16460 RTFLOAT64U r64Src;
16461 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16462 if (RTFLOAT64U_IS_ZERO(&r64Src))
16463 {
16464 *pr64Res = r64Src;
16465 return fMxcsr;
16466 }
16467 else if (r64Src.s.fSign)
16468 {
16469 *pr64Res = g_ar64QNaN[1];
16470 return fMxcsr | X86_MXCSR_IE;
16471 }
16472
16473 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16474 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16475 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16476}
16477
16478
16479IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16480{
16481 RT_NOREF(puSrc1);
16482
16483 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar64[0])
16484 | iemAImpl_sqrtpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[1]);
16485}
16486#endif
16487
16488
16489/**
16490 * SQRTSD
16491 */
16492#ifdef IEM_WITHOUT_ASSEMBLY
16493IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16494{
16495 pResult->ar64[1] = puSrc1->ar64[1];
16496 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, pr64Src2);
16497}
16498#endif
16499
16500
16501#ifdef IEM_WITHOUT_ASSEMBLY
16502/**
16503 * RSQRTPS
16504 */
16505static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16506{
16507 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16508 return fMxcsr;
16509
16510 RTFLOAT32U r32Src;
16511 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16512 if (RTFLOAT32U_IS_ZERO(&r32Src))
16513 {
16514 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16515 return fMxcsr;
16516 }
16517 else if (r32Src.s.fSign)
16518 {
16519 *pr32Res = g_ar32QNaN[1];
16520 return fMxcsr | X86_MXCSR_IE;
16521 }
16522
16523 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16524 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16525 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16526}
16527
16528
16529IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16530{
16531 RT_NOREF(puSrc1);
16532
16533 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16534 | iemAImpl_rsqrt_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16535 | iemAImpl_rsqrt_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16536 | iemAImpl_rsqrt_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16537}
16538
16539
16540/**
16541 * RSQRTSS
16542 */
16543IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16544{
16545 pResult->ar32[1] = puSrc1->ar32[1];
16546 pResult->ar32[2] = puSrc1->ar32[2];
16547 pResult->ar32[3] = puSrc1->ar32[3];
16548 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16549}
16550#endif
16551
16552
16553/**
16554 * RCPPS
16555 */
16556#ifdef IEM_WITHOUT_ASSEMBLY
16557static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16558{
16559 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16560 return fMxcsr;
16561
16562 RTFLOAT32U r32Src;
16563 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16564 if (RTFLOAT32U_IS_ZERO(&r32Src))
16565 {
16566 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16567 return fMxcsr;
16568 }
16569
16570 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16571 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16572 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16573}
16574
16575
16576IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16577{
16578 RT_NOREF(puSrc1);
16579
16580 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16581 | iemAImpl_rcp_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16582 | iemAImpl_rcp_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16583 | iemAImpl_rcp_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16584}
16585
16586
16587/**
16588 * RCPSS
16589 */
16590IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16591{
16592 pResult->ar32[1] = puSrc1->ar32[1];
16593 pResult->ar32[2] = puSrc1->ar32[2];
16594 pResult->ar32[3] = puSrc1->ar32[3];
16595 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16596}
16597#endif
16598
16599
16600/**
16601 * ADDSUBPS
16602 */
16603#ifdef IEM_WITHOUT_ASSEMBLY
16604IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16605{
16606 RT_NOREF(puSrc1);
16607
16608 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16609 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16610 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16611 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16612}
16613#endif
16614
16615
16616/**
16617 * ADDSUBPD
16618 */
16619#ifdef IEM_WITHOUT_ASSEMBLY
16620IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16621{
16622 RT_NOREF(puSrc1);
16623
16624 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16625 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16626}
16627#endif
16628
16629
16630/**
16631 * CVTPD2PS
16632 */
16633#ifdef IEM_WITHOUT_ASSEMBLY
16634static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16635{
16636 RTFLOAT64U r64Src1;
16637 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16638
16639 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16640 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16641 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16642}
16643
16644
16645IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16646{
16647 RT_NOREF(puSrc1);
16648
16649 pResult->au32[2] = 0;
16650 pResult->au32[3] = 0;
16651 return iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar64[0])
16652 | iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar64[1]);
16653}
16654#endif
16655
16656
16657/**
16658 * CVTPS2PD
16659 */
16660#ifdef IEM_WITHOUT_ASSEMBLY
16661static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16662{
16663 RTFLOAT32U r32Src1;
16664 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16665
16666 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16667 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16668 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16669}
16670
16671
16672IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16673{
16674 RT_NOREF(puSrc1);
16675
16676 return iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar32[0])
16677 | iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar32[1]);
16678}
16679#endif
16680
16681
16682/**
16683 * CVTDQ2PS
16684 */
16685#ifdef IEM_WITHOUT_ASSEMBLY
16686static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16687{
16688 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16689 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16690 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16691}
16692
16693
16694IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16695{
16696 RT_NOREF(puSrc1);
16697
16698 return iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, puSrc2->ai32[0])
16699 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, puSrc2->ai32[1])
16700 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[2], uMxCsrIn, puSrc2->ai32[2])
16701 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[3], uMxCsrIn, puSrc2->ai32[3]);
16702}
16703#endif
16704
16705
16706/**
16707 * CVTPS2DQ
16708 */
16709#ifdef IEM_WITHOUT_ASSEMBLY
16710static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16711{
16712 RTFLOAT32U r32Src;
16713 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16714
16715 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16716 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16717 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16718}
16719
16720
16721IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16722{
16723 RT_NOREF(puSrc1);
16724
16725 return iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16726 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16727 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16728 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16729}
16730#endif
16731
16732
16733/**
16734 * CVTTPS2DQ
16735 */
16736#ifdef IEM_WITHOUT_ASSEMBLY
16737static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16738{
16739 RTFLOAT32U r32Src;
16740 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16741
16742 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16743 SoftState.roundingMode = softfloat_round_minMag;
16744 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16745 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16746}
16747
16748
16749IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16750{
16751 RT_NOREF(puSrc1);
16752
16753 return iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16754 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16755 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16756 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16757}
16758#endif
16759
16760
16761/**
16762 * CVTTPD2DQ
16763 */
16764#ifdef IEM_WITHOUT_ASSEMBLY
16765static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16766{
16767 RTFLOAT64U r64Src;
16768 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16769
16770 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16771 SoftState.roundingMode = softfloat_round_minMag;
16772 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16773 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16774}
16775
16776
16777IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16778{
16779 RT_NOREF(puSrc1);
16780
16781 pResult->au64[1] = 0;
16782 return iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16783 | iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16784}
16785#endif
16786
16787
16788/**
16789 * CVTDQ2PD
16790 */
16791#ifdef IEM_WITHOUT_ASSEMBLY
16792static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16793{
16794 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16795 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16796 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16797}
16798
16799
16800IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16801{
16802 RT_NOREF(puSrc1);
16803
16804 return iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, puSrc2->ai32[0])
16805 | iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, puSrc2->ai32[1]);
16806}
16807#endif
16808
16809
16810/**
16811 * CVTPD2DQ
16812 */
16813#ifdef IEM_WITHOUT_ASSEMBLY
16814static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16815{
16816 RTFLOAT64U r64Src;
16817 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16818
16819 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16820 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16821 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16822}
16823
16824
16825IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16826{
16827 RT_NOREF(puSrc1);
16828
16829 pResult->au64[1] = 0;
16830 return iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16831 | iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16832}
16833#endif
16834
16835
16836/**
16837 * [V]SHUFPS
16838 */
16839#ifdef IEM_WITHOUT_ASSEMBLY
16840IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16841{
16842 RTUINT128U const uSrc1 = *puDst;
16843 RTUINT128U const uSrc2 = *puSrc;
16844 ASMCompilerBarrier();
16845 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16846 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16847 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16848 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16849}
16850#endif
16851
16852
16853IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16854{
16855 RTUINT128U const uSrc1 = *puSrc1;
16856 RTUINT128U const uSrc2 = *puSrc2;
16857 ASMCompilerBarrier();
16858 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16859 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16860 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16861 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16862}
16863
16864
16865IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16866{
16867 RTUINT256U const uSrc1 = *puSrc1;
16868 RTUINT256U const uSrc2 = *puSrc2;
16869 ASMCompilerBarrier();
16870 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16871 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16872 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16873 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16874
16875 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16876 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16877 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16878 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16879}
16880
16881
16882/**
16883 * [V]SHUFPD
16884 */
16885#ifdef IEM_WITHOUT_ASSEMBLY
16886IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16887{
16888 RTUINT128U const uSrc1 = *puDst;
16889 RTUINT128U const uSrc2 = *puSrc;
16890 ASMCompilerBarrier();
16891 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16892 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16893}
16894#endif
16895
16896
16897IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16898{
16899 RTUINT128U const uSrc1 = *puSrc1;
16900 RTUINT128U const uSrc2 = *puSrc2;
16901 ASMCompilerBarrier();
16902 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16903 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16904}
16905
16906
16907IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16908{
16909 RTUINT256U const uSrc1 = *puSrc1;
16910 RTUINT256U const uSrc2 = *puSrc2;
16911 ASMCompilerBarrier();
16912 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16913 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16914 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16915 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16916}
16917
16918
16919/*
16920 * PHMINPOSUW / VPHMINPOSUW
16921 */
16922IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16923{
16924 uint16_t u16Min = puSrc->au16[0];
16925 uint8_t idxMin = 0;
16926
16927 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16928 if (puSrc->au16[i] < u16Min)
16929 {
16930 u16Min = puSrc->au16[i];
16931 idxMin = i;
16932 }
16933
16934 puDst->au64[0] = 0;
16935 puDst->au64[1] = 0;
16936 puDst->au16[0] = u16Min;
16937 puDst->au16[1] = idxMin;
16938}
16939
16940
16941IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16942{
16943 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16944}
16945
16946
16947/**
16948 * VPERMILPS
16949 */
16950#ifdef IEM_WITHOUT_ASSEMBLY
16951IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16952{
16953 RTUINT128U const uSrc = *puSrc;
16954 ASMCompilerBarrier();
16955
16956 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
16957 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
16958 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
16959 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
16960}
16961
16962
16963IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
16964{
16965 RTUINT256U const uSrc = *puSrc;
16966 ASMCompilerBarrier();
16967
16968 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
16969 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
16970 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
16971 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
16972
16973 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
16974 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
16975 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
16976 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
16977}
16978
16979IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
16980{
16981 RTUINT128U const uSrc1 = *puSrc1;
16982 RTUINT128U const uSrc2 = *puSrc2;
16983 ASMCompilerBarrier();
16984
16985 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
16986 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
16987 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
16988 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
16989}
16990
16991IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
16992{
16993 RTUINT256U const uSrc1 = *puSrc1;
16994 RTUINT256U const uSrc2 = *puSrc2;
16995 ASMCompilerBarrier();
16996
16997 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
16998 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
16999 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17000 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17001
17002 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17003 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17004 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17005 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17006}
17007#endif
17008
17009
17010IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17011{
17012 RTUINT128U const uSrc = *puSrc;
17013 ASMCompilerBarrier();
17014
17015 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17016 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17017 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17018 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17019}
17020
17021
17022IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17023{
17024 RTUINT256U const uSrc = *puSrc;
17025 ASMCompilerBarrier();
17026
17027 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17028 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17029 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17030 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17031
17032 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17033 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17034 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17035 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17036}
17037
17038IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17039{
17040 RTUINT128U const uSrc1 = *puSrc1;
17041 RTUINT128U const uSrc2 = *puSrc2;
17042 ASMCompilerBarrier();
17043
17044 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17045 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17046 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17047 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17048}
17049
17050IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17051{
17052 RTUINT256U const uSrc1 = *puSrc1;
17053 RTUINT256U const uSrc2 = *puSrc2;
17054 ASMCompilerBarrier();
17055
17056 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17057 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17058 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17059 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17060
17061 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17062 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17063 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17064 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17065}
17066
17067
17068/**
17069 * VPERMILPD
17070 */
17071#ifdef IEM_WITHOUT_ASSEMBLY
17072IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17073{
17074 RTUINT128U const uSrc = *puSrc;
17075 ASMCompilerBarrier();
17076
17077 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17078 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17079}
17080
17081
17082IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17083{
17084 RTUINT256U const uSrc = *puSrc;
17085 ASMCompilerBarrier();
17086
17087 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17088 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17089
17090 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17091 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17092}
17093
17094IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17095{
17096 RTUINT128U const uSrc1 = *puSrc1;
17097 RTUINT128U const uSrc2 = *puSrc2;
17098 ASMCompilerBarrier();
17099
17100 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17101 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17102}
17103
17104IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17105{
17106 RTUINT256U const uSrc1 = *puSrc1;
17107 RTUINT256U const uSrc2 = *puSrc2;
17108 ASMCompilerBarrier();
17109
17110 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17111 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17112
17113 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17114 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17115}
17116#endif
17117
17118
17119IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17120{
17121 RTUINT128U const uSrc = *puSrc;
17122 ASMCompilerBarrier();
17123
17124 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17125 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17126}
17127
17128
17129IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17130{
17131 RTUINT256U const uSrc = *puSrc;
17132 ASMCompilerBarrier();
17133
17134 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17135 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17136
17137 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17138 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17139}
17140
17141IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17142{
17143 RTUINT128U const uSrc1 = *puSrc1;
17144 RTUINT128U const uSrc2 = *puSrc2;
17145 ASMCompilerBarrier();
17146
17147 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17148 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17149}
17150
17151IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17152{
17153 RTUINT256U const uSrc1 = *puSrc1;
17154 RTUINT256U const uSrc2 = *puSrc2;
17155 ASMCompilerBarrier();
17156
17157 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17158 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17159
17160 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17161 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17162}
17163
17164
17165/*
17166 * [V]PBLENDVB
17167 */
17168IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17169{
17170 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17171 if (puMask->au8[i] & RT_BIT(7))
17172 puDst->au8[i] = puSrc->au8[i];
17173}
17174
17175
17176IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17177{
17178 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17179 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17180}
17181
17182
17183IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17184{
17185 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17186 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17187}
17188
17189
17190/*
17191 * [V]BLENDVPS
17192 */
17193IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17194{
17195 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17196 if (puMask->au32[i] & RT_BIT_32(31))
17197 puDst->au32[i] = puSrc->au32[i];
17198}
17199
17200
17201IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17202{
17203 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17204 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17205}
17206
17207
17208IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17209{
17210 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17211 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17212}
17213
17214
17215/*
17216 * [V]BLENDVPD
17217 */
17218IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17219{
17220 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17221 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17222}
17223
17224
17225IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17226{
17227 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17228 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17229}
17230
17231
17232IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17233{
17234 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17235 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17236}
17237
17238
17239/**
17240 * [V]PALIGNR
17241 */
17242IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17243{
17244 uint64_t const u64Src1 = *pu64Dst;
17245 ASMCompilerBarrier();
17246
17247 if (bEvil >= 16)
17248 *pu64Dst = 0;
17249 else if (bEvil >= 8)
17250 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17251 else
17252 {
17253 uint8_t cShift = bEvil * 8;
17254 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17255 | (u64Src2 >> cShift);
17256 }
17257}
17258
17259
17260IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17261{
17262 RTUINT128U const uSrc1 = *puDst;
17263 RTUINT128U const uSrc2 = *puSrc;
17264 ASMCompilerBarrier();
17265
17266 puDst->au64[0] = 0;
17267 puDst->au64[1] = 0;
17268 if (bEvil >= 32)
17269 { /* Everything stays 0. */ }
17270 else if (bEvil >= 16)
17271 {
17272 bEvil -= 16;
17273 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17274 puDst->au8[i - bEvil] = uSrc1.au8[i];
17275 }
17276 else
17277 {
17278 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17279 puDst->au8[i] = uSrc2.au8[i + bEvil];
17280 for (uint8_t i = 0; i < bEvil; i++)
17281 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17282 }
17283}
17284
17285
17286IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17287{
17288 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17289 RTUINT128U const uSrc2 = *puSrc2;
17290 ASMCompilerBarrier();
17291
17292 puDst->au64[0] = 0;
17293 puDst->au64[1] = 0;
17294 if (bEvil >= 32)
17295 { /* Everything stays 0. */ }
17296 else if (bEvil >= 16)
17297 {
17298 bEvil -= 16;
17299 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17300 puDst->au8[i - bEvil] = uSrc1.au8[i];
17301 }
17302 else
17303 {
17304 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17305 puDst->au8[i] = uSrc2.au8[i + bEvil];
17306 for (uint8_t i = 0; i < bEvil; i++)
17307 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17308 }
17309}
17310
17311
17312IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17313{
17314 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17315 RTUINT256U const uSrc2 = *puSrc2;
17316 ASMCompilerBarrier();
17317
17318 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17319 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17320}
17321
17322
17323/**
17324 * [V]PBLENDW
17325 */
17326IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17327{
17328 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17329 if (bEvil & RT_BIT(i))
17330 puDst->au16[i] = puSrc->au16[i];
17331}
17332
17333
17334IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17335{
17336 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17337 if (bEvil & RT_BIT(i))
17338 puDst->au16[i] = puSrc2->au16[i];
17339 else
17340 puDst->au16[i] = puSrc1->au16[i];
17341}
17342
17343
17344IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17345{
17346 for (uint8_t i = 0; i < 8; i++)
17347 if (bEvil & RT_BIT(i))
17348 {
17349 puDst->au16[ i] = puSrc2->au16[ i];
17350 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17351 }
17352 else
17353 {
17354 puDst->au16[ i] = puSrc1->au16[ i];
17355 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17356 }
17357}
17358
17359
17360/**
17361 * [V]PBLENDD
17362 */
17363IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17364{
17365 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17366 if (bEvil & RT_BIT(i))
17367 puDst->au32[i] = puSrc2->au32[i];
17368 else
17369 puDst->au32[i] = puSrc1->au32[i];
17370}
17371
17372
17373IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17374{
17375 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17376 if (bEvil & RT_BIT(i))
17377 puDst->au32[i] = puSrc2->au32[i];
17378 else
17379 puDst->au32[i] = puSrc1->au32[i];
17380}
17381
17382
17383/**
17384 * [V]BLENDPS
17385 */
17386IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17387{
17388 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17389 if (bEvil & RT_BIT(i))
17390 puDst->au32[i] = puSrc->au32[i];
17391}
17392
17393
17394IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17395{
17396 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17397 if (bEvil & RT_BIT(i))
17398 puDst->au32[i] = puSrc2->au32[i];
17399 else
17400 puDst->au32[i] = puSrc1->au32[i];
17401}
17402
17403
17404IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17405{
17406 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17407 if (bEvil & RT_BIT(i))
17408 puDst->au32[i] = puSrc2->au32[i];
17409 else
17410 puDst->au32[i] = puSrc1->au32[i];
17411}
17412
17413
17414/**
17415 * [V]BLENDPD
17416 */
17417IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17418{
17419 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17420 if (bEvil & RT_BIT(i))
17421 puDst->au64[i] = puSrc->au64[i];
17422}
17423
17424
17425IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17426{
17427 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17428 if (bEvil & RT_BIT(i))
17429 puDst->au64[i] = puSrc2->au64[i];
17430 else
17431 puDst->au64[i] = puSrc1->au64[i];
17432}
17433
17434
17435IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17436{
17437 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17438 if (bEvil & RT_BIT(i))
17439 puDst->au64[i] = puSrc2->au64[i];
17440 else
17441 puDst->au64[i] = puSrc1->au64[i];
17442}
17443
17444
17445/**
17446 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17447 */
17448
/* The S-Box lookup table, indexed by the byte to substitute (16 entries per row). */
static uint8_t iemAImpl_aes_sbox[256] =
{
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17467
/* The InvS-Box lookup table, indexed by the byte to substitute (16 entries per row). */
static uint8_t iemAImpl_aes_inv_sbox[256] =
{
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17487
/* The ShiftRows lookup table: entry i is the source byte index for output byte i. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] =
{
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
17492
/* The InvShiftRows lookup table: entry i is the source byte index for output byte i. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] =
{
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
17497
17498static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17499{
17500 RTUINT128U uVal;
17501 int i;
17502
17503 for (i = 0; i < 16; ++i)
17504 uVal.au8[i] = abSubst[puSrc->au8[i]];
17505
17506 return uVal;
17507}
17508
/**
 * Doubles a byte: shifts it left by one and XORs in 0x1b (27) when the top
 * bit of the input was set.
 *
 * @returns The doubled value, truncated to 8 bits.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bShifted = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bShifted ^ 0x1b);
    return bShifted;
}
17513
17514static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17515{
17516 RTUINT128U uVal;
17517 int i;
17518 uint8_t tmp;
17519
17520 for (i = 0; i < 16; i += 4) {
17521 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17522 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17523 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17524 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17525 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17526 }
17527
17528 return uVal;
17529}
17530
17531static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17532{
17533 RTUINT128U uVal;
17534 int i;
17535
17536 for (i = 0; i < 16; ++i)
17537 uVal.au8[i] = puSrc->au8[abShift[i]];
17538
17539 return uVal;
17540}
17541
17542static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17543{
17544 uint8_t val;
17545
17546 val = ((b >> 0) & 1) * a;
17547 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17548 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17549 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17550 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17551
17552 return val;
17553}
17554
17555static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17556{
17557 RTUINT128U uVal;
17558 int i;
17559
17560 for (i = 0; i < 16; i += 4) {
17561 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17562 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17563 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17564 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17565 }
17566
17567 return uVal;
17568}
17569
17570static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17571{
17572 RTUINT32U uTmp;
17573
17574 uTmp.au32[0] = w;
17575 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17576 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17577 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17578 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17579
17580 return uTmp.au32[0];
17581}
17582
/**
 * Rotates a dword right by 8 bits.
 *
 * @returns The rotated dword.
 * @param   w   The input dword.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByte   = w & UINT32_C(0xff);
    uint32_t const uUpperPart = w >> 8;
    return uUpperPart | (uLowByte << 24);
}
17587
17588/**
17589 * [V]AESKEYGENASSIST
17590 */
17591IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17592{
17593 RTUINT128U uTmp;
17594 uint32_t uRCon = bImm; /* Round constant. */
17595
17596 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17597 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17598 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17599 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17600
17601 *puDst = uTmp;
17602}
17603
17604
17605/**
17606 * [V]AESIMC
17607 */
17608IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17609{
17610 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17611}
17612
17613
17614/**
17615 * [V]AESENC
17616 */
17617IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17618{
17619 RTUINT128U uTmp;
17620
17621 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17622 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17623 uTmp = iemAImpl_aes_mix_col(&uTmp);
17624 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17625 uTmp.au64[1] ^= puSrc->au64[1];
17626
17627 *puDst = uTmp;
17628}
17629
17630
17631/**
17632 * [V]AESENCLAST
17633 */
17634IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17635{
17636 RTUINT128U uTmp;
17637
17638 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17639 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17640 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17641 uTmp.au64[1] ^= puSrc->au64[1];
17642
17643 *puDst = uTmp;
17644}
17645
17646
17647/**
17648 * [V]AESDEC
17649 */
17650IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17651{
17652 RTUINT128U uTmp;
17653
17654 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17655 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17656 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17657 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17658 uTmp.au64[1] ^= puSrc->au64[1];
17659
17660 *puDst = uTmp;
17661}
17662
17663
17664/**
17665 * [V]AESDECLAST
17666 */
17667IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17668{
17669 RTUINT128U uTmp;
17670
17671 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17672 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17673 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17674 uTmp.au64[1] ^= puSrc->au64[1];
17675
17676 *puDst = uTmp;
17677}
17678
17679
17680/**
17681 * [V]PCMPISTRI
17682 */
17683
/**
 * Does the comparisons based on the mode and source input format.
 *
 * Fills @a afCmpRes[idxSrc2][idxSrc1] with the result of comparing element
 * idxSrc1 of the first source against element idxSrc2 of the second one.
 * Only the entries covered by the element count of the selected format are
 * written (16 x 16 for bytes, 8 x 8 for words).
 *
 * @param   afCmpRes    The comparison result matrix to fill in.
 * @param   puSrc1      The first source operand.
 * @param   puSrc2      The second source operand.
 * @param   bImm        The immediate: bits 1:0 select the element format
 *                      (au8, au16, ai8, ai16), bits 3:2 the aggregation
 *                      operation which decides the kind of comparison.
 */
static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
{
/*
 * Compares all element pairs of the given union member.  The first source is
 * walked two elements at a time: for aggregation operations 0, 2 and 3 both
 * elements are tested for equality, for operation 1 the even element is
 * tested with '<=' and the odd one with '>=' against the second source
 * element.  Note that a_fCmpRes is not referenced; the expansion writes to
 * the afCmpRes parameter directly.
 */
#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
    do \
    { \
        for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
            for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
            { \
                switch (a_bAggOp) \
                { \
                    case 0: \
                    case 2: \
                    case 3: \
                        afCmpRes[idxSrc2][idxSrc1]     = (a_puSrc1)->a_SrcMember[idxSrc1]     == (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        break; \
                    case 1: \
                        afCmpRes[idxSrc2][idxSrc1]     = (a_puSrc1)->a_SrcMember[idxSrc1]     <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        break; \
                    default: \
                        AssertReleaseFailed(); \
                } \
            } \
    } while(0)

    uint8_t bAggOp = (bImm >> 2) & 0x3;
    /* Dispatch on the element format so the comparisons use the matching signedness and width. */
    switch (bImm & 0x3)
    {
        case 0:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
            break;
        case 1:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
            break;
        case 2:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
            break;
        case 3:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
            break;
        default:
            AssertReleaseFailed();
    }
#undef PCMPXSTRX_CMP_CASE
}
17733
17734static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17735{
17736 if (bImm & 0x1)
17737 {
17738 /* Words -> 8 elements. */
17739 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17740 if (puSrc->au16[i] == 0)
17741 return i;
17742
17743 return 8;
17744 }
17745 else
17746 {
17747 /* Bytes -> 16 elements. */
17748 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17749 if (puSrc->au8[i] == 0)
17750 return i;
17751
17752 return 16;
17753 }
17754}
17755
17756static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
17757{
17758 if (bImm & 0x1)
17759 {
17760 if (i64Len > -8 && i64Len < 8)
17761 return RT_ABS(i64Len);
17762
17763 return 8;
17764 }
17765 else
17766 {
17767 if (i64Len > -16 && i64Len < 16)
17768 return RT_ABS(i64Len);
17769
17770 return 16;
17771 }
17772}
17773
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * The first index is Imm8[3:2] (the aggregation operation).  The second index
 * encodes element validity:
 *      - 0: xmm1 AND xmm2/m128 invalid,
 *      - 1: xmm1 invalid BUT xmm2/m128 valid,
 *      - 2: xmm1 valid BUT xmm2/m128 invalid,
 *      - 3: unused dummy/padding for parfait.
 */
static const bool g_afCmpOverride[4][4] =
{
    /* Imm8[3:2] = 00b (equal any) */
    { false, false, false, false },
    /* Imm8[3:2] = 01b (ranges) */
    { false, false, false, false },
    /* Imm8[3:2] = 10b (equal each) */
    { true,  false, false, false },
    /* Imm8[3:2] = 11b (equal ordered) */
    { true,  true,  false, false },
};
17785
17786DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17787{
17788 if (fSrc1Valid && fSrc2Valid)
17789 return fCmpRes;
17790
17791 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17792 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17793 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17794}
17795
/**
 * Aggregates the comparison matrix into a result bit mask and applies the
 * polarity selected by the immediate.
 *
 * @returns The result mask, one bit per element.
 * @param   afCmpRes    The comparison matrix, indexed [second source][first source].
 * @param   idxLen1     Number of valid elements in the first source.
 * @param   idxLen2     Number of valid elements in the second source.
 * @param   cElems      Number of elements per operand (8 or 16).
 * @param   bImm        The immediate: bits 3:2 select the aggregation
 *                      operation, bits 5:4 the polarity.
 */
static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
{
    uint8_t bAggOp = (bImm >> 2) & 0x3;
    uint16_t u16Result = 0;

    switch (bAggOp)
    {
        case 0: /* Equal any */
            /* Bit idxSrc2 is set if the second source element matches any first source element. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
                {
                    if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                   idxSrc1 < idxLen1,
                                                                   idxSrc2 < idxLen2,
                                                                   bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 1: /* Ranges */
            /* Bit idxSrc2 is set if both comparisons of any (even, odd) first source pair hold. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
                {
                    if (   iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                      idxSrc1 < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp)
                        && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
                                                                      (idxSrc1 + 1) < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 2: /* Equal each */
            /* Only the diagonal of the matrix is considered. */
            for (uint8_t i = 0; i < cElems; i++)
            {
                if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
                                                               i < idxLen1,
                                                               i < idxLen2,
                                                               bAggOp))
                    u16Result |= RT_BIT(i);
            }
            break;

        case 3: /* Equal ordered */
            u16Result = 0;
            /* Bit idxSrc2 stays set only if every first source element matches
               the second source element at the corresponding offset from idxSrc2. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = RT_BIT(idxSrc2);
                for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
                {
                    if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
                                                                    idxSrc1 < idxLen1,
                                                                    k < idxLen2,
                                                                    bAggOp))
                    {
                        u16Res = 0;
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;
    }

    /* Polarity selection. */
    switch ((bImm >> 4) & 0x3)
    {
        case 0:
        case 2:
            /* Nothing to do. */
            break;
        case 1:
            /* Invert all result bits covered by the element count. */
            u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
            break;
        case 3:
            /* Invert only the bits below the second source length. */
            u16Result ^= RT_BIT(idxLen2) - 1;
            break;
        default:
            AssertReleaseFailed();
    }

    return u16Result;
}
17899
17900DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
17901{
17902 uint32_t fEFlags = 0;
17903
17904 if (u16Result)
17905 fEFlags |= X86_EFL_CF;
17906 if (cLen2 < cElems)
17907 fEFlags |= X86_EFL_ZF;
17908 if (cLen1 < cElems)
17909 fEFlags |= X86_EFL_SF;
17910 if (u16Result & 0x1)
17911 fEFlags |= X86_EFL_OF;
17912 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
17913}
17914
17915DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
17916 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
17917{
17918 bool afCmpRes[16][16];
17919 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17920
17921 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
17922 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
17923 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
17924
17925 return u16Result;
17926}
17927
17928DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17929{
17930 if (bImm & RT_BIT(6))
17931 {
17932 /* Index for MSB set. */
17933 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
17934 if (idxMsb)
17935 *pu32Ecx = idxMsb - 1;
17936 else
17937 *pu32Ecx = cElems;
17938 }
17939 else
17940 {
17941 /* Index for LSB set. */
17942 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17943 if (idxLsb)
17944 *pu32Ecx = idxLsb - 1;
17945 else
17946 *pu32Ecx = cElems;
17947 }
17948}
17949
17950IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17951{
17952 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17953 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17954 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17955
17956 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17957 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17958}
17959
17960
17961/**
17962 * [V]PCMPESTRI
17963 */
17964IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17965{
17966 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17967 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17968 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17969
17970 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17971 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17972}
17973
17974
17975/**
17976 * [V]PCMPISTRM
17977 */
17978DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17979{
17980 if (bImm & RT_BIT(6))
17981 {
17982 /* Generate a mask. */
17983 if (cElems == 8)
17984 {
17985 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17986 if (u16Result & RT_BIT(i))
17987 puDst->au16[i] = 0xffff;
17988 else
17989 puDst->au16[i] = 0;
17990 }
17991 else
17992 {
17993 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17994 if (u16Result & RT_BIT(i))
17995 puDst->au8[i] = 0xff;
17996 else
17997 puDst->au8[i] = 0;
17998 }
17999 }
18000 else
18001 {
18002 /* Store the result. */
18003 puDst->au64[0] = u16Result;
18004 puDst->au64[1] = 0;
18005 }
18006}
18007
18008IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18009{
18010 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18011 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18012 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18013
18014 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18015 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18016}
18017
18018
18019/**
18020 * [V]PCMPESTRM
18021 */
18022IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18023{
18024 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18025 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18026 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18027
18028 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18029 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18030}
18031
18032
18033/*
18034 * [V]PCLMULQDQ
18035 */
/**
 * PCLMULQDQ fallback (two operand form).
 *
 * Forwards to the three operand VPCLMULQDQ fallback with the destination
 * doubling as the first source.
 *
 * @param   puDst   First source operand on input, the product on output.
 * @param   puSrc   Second source operand.
 * @param   bEvil   The immediate byte, passed through unchanged.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
{
    iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
}
18040
18041
18042IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18043{
18044 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18045 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18046
18047 puDst->au64[0] = 0;
18048 puDst->au64[1] = 0;
18049
18050 /*
18051 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18052 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18053 * and squeeze out some optimizations.
18054 */
18055 if (uSrc1 & 0x1)
18056 puDst->au64[0] = uSrc2;
18057
18058 uSrc1 >>= 1;
18059
18060 uint8_t iDigit = 1;
18061 while (uSrc1)
18062 {
18063 if (uSrc1 & 0x1)
18064 {
18065 puDst->au64[0] ^= (uSrc2 << iDigit);
18066 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18067 }
18068
18069 uSrc1 >>= 1;
18070 iDigit++;
18071 }
18072}
18073
18074
18075/**
18076 * [V]MOVMSKPS
18077 */
18078#ifdef IEM_WITHOUT_ASSEMBLY
18079IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18080{
18081 *pu8Dst = puSrc->au32[0] >> 31;
18082 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18083 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18084 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18085}
18086
18087#endif
18088
18089IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18090{
18091 *pu8Dst = puSrc->au32[0] >> 31;
18092 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18093 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18094 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18095}
18096
18097
18098IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18099{
18100 *pu8Dst = puSrc->au32[0] >> 31;
18101 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18102 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18103 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18104 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18105 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18106 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18107 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18108}
18109
18110
18111/**
18112 * [V]MOVMSKPD
18113 */
18114#ifdef IEM_WITHOUT_ASSEMBLY
18115IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18116{
18117 *pu8Dst = puSrc->au64[0] >> 63;
18118 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18119}
18120
18121#endif
18122
18123IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18124{
18125 *pu8Dst = puSrc->au64[0] >> 63;
18126 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18127}
18128
18129
18130IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18131{
18132 *pu8Dst = puSrc->au64[0] >> 63;
18133 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18134 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18135 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18136}
18137
18138
18139/**
18140 * CVTTSD2SI
18141 */
18142#ifdef IEM_WITHOUT_ASSEMBLY
18143IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18144{
18145 RTFLOAT64U r64Src;
18146
18147 r64Src.u = *pu64Src;
18148 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18149
18150 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18151 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18152 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18153}
18154
18155
18156IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18157{
18158 RTFLOAT64U r64Src;
18159
18160 r64Src.u = *pu64Src;
18161 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18162
18163 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18164 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18165 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18166}
18167#endif
18168
18169
18170/**
18171 * CVTSD2SI
18172 */
18173#ifdef IEM_WITHOUT_ASSEMBLY
18174IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18175{
18176 RTFLOAT64U r64Src;
18177
18178 r64Src.u = *pu64Src;
18179 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18180
18181 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18182 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18183 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18184}
18185
18186
18187IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18188{
18189 RTFLOAT64U r64Src;
18190
18191 r64Src.u = *pu64Src;
18192 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18193
18194 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18195 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18196 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18197}
18198#endif
18199
18200
18201/**
18202 * CVTTSS2SI
18203 */
18204#ifdef IEM_WITHOUT_ASSEMBLY
18205IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18206{
18207 RTFLOAT32U r32Src;
18208
18209 r32Src.u = *pu32Src;
18210 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18211
18212 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18213 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18214 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18215}
18216
18217
18218IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18219{
18220 RTFLOAT32U r32Src;
18221
18222 r32Src.u = *pu32Src;
18223 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18224
18225 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18226 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18227 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18228}
18229#endif
18230
18231
18232/**
18233 * CVTSS2SI
18234 */
18235#ifdef IEM_WITHOUT_ASSEMBLY
18236IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18237{
18238 RTFLOAT32U r32Src;
18239
18240 r32Src.u = *pu32Src;
18241 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18242
18243 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18244 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18245 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18246}
18247
18248
18249IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18250{
18251 RTFLOAT32U r32Src;
18252
18253 r32Src.u = *pu32Src;
18254 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18255
18256 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18257 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18258 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18259}
18260#endif
18261
18262
18263/**
18264 * CVTSI2SD
18265 */
18266#ifdef IEM_WITHOUT_ASSEMBLY
18267IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i32,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18268{
18269 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18270 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18271 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18272}
18273
18274
18275IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i64,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18276{
18277 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18278 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18279 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18280}
18281#endif
18282
18283
18284/**
18285 * CVTSI2SS
18286 */
18287#ifdef IEM_WITHOUT_ASSEMBLY
18288IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i32,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18289{
18290 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18291 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18292 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18293}
18294
18295
18296IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i64,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18297{
18298 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18299 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18300 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18301}
18302#endif
18303
18304
18305/**
18306 * [V]UCOMISS
18307 */
18308#ifdef IEM_WITHOUT_ASSEMBLY
18309IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18310{
18311 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18312
18313 if (RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2))
18314 {
18315 uMxCsrIn |= X86_MXCSR_IE;
18316 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18317 }
18318 else if (RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18319 {
18320 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18321 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18322 }
18323 else
18324 {
18325 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18326
18327 RTFLOAT32U r32Src1, r32Src2;
18328 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1);
18329 fDe |= iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18330
18331 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18332 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18333 if (f32_eq(f32Src1, f32Src2, &SoftState))
18334 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18335 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18336 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18337 /* else: GREATER_THAN 000 */
18338
18339 uMxCsrIn |= fDe;
18340 }
18341
18342 *pfEFlags = fEFlagsNew;
18343 return uMxCsrIn;
18344}
18345#endif
18346
/**
 * VUCOMISS fallback, simply forwards to iemAImpl_ucomiss_u128.
 *
 * @returns The output MXCSR from iemAImpl_ucomiss_u128.
 * @param   uMxCsrIn    The input MXCSR value.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first operand.
 * @param   uSrc2       The second operand.
 * @note    In this file iemAImpl_ucomiss_u128 is only compiled when
 *          IEM_WITHOUT_ASSEMBLY is defined; presumably an assembly version
 *          provides it otherwise.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
{
    return iemAImpl_ucomiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18351
18352
18353/**
18354 * [V]UCOMISD
18355 */
18356#ifdef IEM_WITHOUT_ASSEMBLY
18357IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18358{
18359 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18360
18361 if (RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2))
18362 {
18363 uMxCsrIn |= X86_MXCSR_IE;
18364 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18365 }
18366 else if (RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18367 {
18368 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18369 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18370 }
18371 else
18372 {
18373 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18374
18375 RTFLOAT64U r64Src1, r64Src2;
18376 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1)
18377 | iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18378
18379 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18380 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18381 if (f64_eq(f64Src1, f64Src2, &SoftState))
18382 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18383 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18384 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18385 /* else: GREATER_THAN 000 */
18386
18387 uMxCsrIn |= fDe;
18388 }
18389
18390 *pfEFlags = fEFlagsNew;
18391 return uMxCsrIn;
18392}
18393#endif
18394
/**
 * VUCOMISD fallback, simply forwards to iemAImpl_ucomisd_u128.
 *
 * @returns The output MXCSR from iemAImpl_ucomisd_u128.
 * @param   uMxCsrIn    The input MXCSR value.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first operand.
 * @param   uSrc2       The second operand.
 * @note    In this file iemAImpl_ucomisd_u128 is only compiled when
 *          IEM_WITHOUT_ASSEMBLY is defined; presumably an assembly version
 *          provides it otherwise.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
{
    return iemAImpl_ucomisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18399
18400
18401/**
18402 * [V]COMISS
18403 */
18404#ifdef IEM_WITHOUT_ASSEMBLY
18405IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18406{
18407 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18408
18409 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2)
18410 || RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18411 {
18412 uMxCsrIn |= X86_MXCSR_IE;
18413 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18414 }
18415 else
18416 {
18417 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18418
18419 RTFLOAT32U r32Src1, r32Src2;
18420 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1)
18421 | iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18422
18423 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18424 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18425 if (f32_eq(f32Src1, f32Src2, &SoftState))
18426 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18427 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18428 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18429 /* else: GREATER_THAN 000 */
18430
18431 uMxCsrIn |= fDe;
18432 }
18433
18434 *pfEFlags = fEFlagsNew;
18435 return uMxCsrIn;
18436}
18437#endif
18438
18439
/**
 * VCOMISS fallback, simply forwards to iemAImpl_comiss_u128.
 *
 * @returns The output MXCSR from iemAImpl_comiss_u128.
 * @param   uMxCsrIn    The input MXCSR value.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first operand.
 * @param   uSrc2       The second operand.
 * @note    In this file iemAImpl_comiss_u128 is only compiled when
 *          IEM_WITHOUT_ASSEMBLY is defined; presumably an assembly version
 *          provides it otherwise.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
{
    return iemAImpl_comiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18444
18445
18446/**
18447 * [V]COMISD
18448 */
18449#ifdef IEM_WITHOUT_ASSEMBLY
18450IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18451{
18452 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18453
18454 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2)
18455 || RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18456 {
18457 uMxCsrIn |= X86_MXCSR_IE;
18458 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18459 }
18460 else
18461 {
18462 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18463
18464 RTFLOAT64U r64Src1, r64Src2;
18465 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1);
18466 fDe |= iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18467
18468 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18469 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18470 if (f64_eq(f64Src1, f64Src2, &SoftState))
18471 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18472 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18473 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18474 /* else: GREATER_THAN 000 */
18475
18476 uMxCsrIn |= fDe;
18477 }
18478
18479 *pfEFlags = fEFlagsNew;
18480 return uMxCsrIn;
18481}
18482#endif
18483
/**
 * VCOMISD fallback, simply forwards to iemAImpl_comisd_u128.
 *
 * @returns The output MXCSR from iemAImpl_comisd_u128.
 * @param   uMxCsrIn    The input MXCSR value.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first operand.
 * @param   uSrc2       The second operand.
 * @note    In this file iemAImpl_comisd_u128 is only compiled when
 *          IEM_WITHOUT_ASSEMBLY is defined; presumably an assembly version
 *          provides it otherwise.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
{
    return iemAImpl_comisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18488
18489
18490/**
18491 * CMPPS / CMPPD / CMPSS / CMPSD
18492 */
18493#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * A compare truth table entry.
 *
 * One entry describes a single compare predicate: the boolean outcome for each
 * of the four possible relations between the operands, plus whether quiet NaN
 * operands signal an invalid operation.  The member order matters, the table
 * below uses positional initializers.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Flag whether the \#IA is signalled when one of the source operands is a QNaN */
    bool                fSignalsOnQNan;
    /** The boolean result when the input operands are unordered. */
    bool                fUnordered;
    /** The boolean result when A = B. */
    bool                fEqual;
    /** The boolean result when A < B. */
    bool                fLowerThan;
    /** The boolean result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18512
18513
/** The compare truth table (indexed by immediate).
 * Only the eight SSE predicates are present; the compare workers assert that
 * the index is within the table. */
static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
{
    /*                     fSignalsOnQNan   fUnordered   fEqual   fLowerThan   fGreaterThan */
    /* 00H (EQ_OQ)   */ {  false,           false,       true,    false,       false },
    /* 01H (LT_OS)   */ {  true,            false,       false,   true,        false },
    /* 02H (LE_OS)   */ {  true,            false,       true,    true,        false },
    /* 03H (UNORD_Q) */ {  false,           true,        false,   false,       false },
    /* 04H (NEQ_UQ)  */ {  false,           true,        false,   true,        true  },
    /* 05H (NLT_US)  */ {  true,            true,        true,    false,       true  },
    /* 06H (NLE_US)  */ {  true,            true,        false,   false,       true  },
    /* 07H (ORD_Q)   */ {  false,           false,       true,    true,        true  },
    /** @todo AVX variants. */
};
18528
18529
18530static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18531{
18532 bool fRes;
18533 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18534
18535 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18536 {
18537 *pfMxcsr |= X86_MXCSR_IE;
18538 fRes = g_aCmpTbl[bEvil].fUnordered;
18539 }
18540 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18541 {
18542 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18543 *pfMxcsr |= X86_MXCSR_IE;
18544 fRes = g_aCmpTbl[bEvil].fUnordered;
18545 }
18546 else
18547 {
18548 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18549
18550 RTFLOAT32U r32Src1, r32Src2;
18551 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18552 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18553
18554 *pfMxcsr |= fDe;
18555 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18556 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18557 if (f32_eq(f32Src1, f32Src2, &SoftState))
18558 fRes = g_aCmpTbl[bEvil].fEqual;
18559 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18560 fRes = g_aCmpTbl[bEvil].fLowerThan;
18561 else
18562 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18563 }
18564
18565 return fRes;
18566}
18567
18568
18569static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18570{
18571 bool fRes;
18572 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18573
18574 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18575 {
18576 *pfMxcsr |= X86_MXCSR_IE;
18577 fRes = g_aCmpTbl[bEvil].fUnordered;
18578 }
18579 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18580 {
18581 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18582 *pfMxcsr |= X86_MXCSR_IE;
18583 fRes = g_aCmpTbl[bEvil].fUnordered;
18584 }
18585 else
18586 {
18587 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18588
18589 RTFLOAT64U r64Src1, r64Src2;
18590 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18591 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18592
18593 *pfMxcsr |= fDe;
18594 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18595 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18596 if (f64_eq(f64Src1, f64Src2, &SoftState))
18597 fRes = g_aCmpTbl[bEvil].fEqual;
18598 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18599 fRes = g_aCmpTbl[bEvil].fLowerThan;
18600 else
18601 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18602 }
18603
18604 return fRes;
18605}
18606
18607
18608IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpps_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18609{
18610 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18611 {
18612 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18613 puDst->au32[i] = UINT32_MAX;
18614 else
18615 puDst->au32[i] = 0;
18616 }
18617
18618 return uMxCsrIn;
18619}
18620
18621
18622IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmppd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18623{
18624 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18625 {
18626 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18627 puDst->au64[i] = UINT64_MAX;
18628 else
18629 puDst->au64[i] = 0;
18630 }
18631
18632 return uMxCsrIn;
18633}
18634
18635
18636IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18637{
18638 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18639 puDst->au32[0] = UINT32_MAX;
18640 else
18641 puDst->au32[0] = 0;
18642
18643 puDst->au32[1] = pSrc->uSrc1.au32[1];
18644 puDst->au64[1] = pSrc->uSrc1.au64[1];
18645 return uMxCsrIn;
18646}
18647
18648
18649IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18650{
18651 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18652 puDst->au64[0] = UINT64_MAX;
18653 else
18654 puDst->au64[0] = 0;
18655
18656 puDst->au64[1] = pSrc->uSrc1.au64[1];
18657 return uMxCsrIn;
18658}
18659#endif
18660
18661
18662/**
18663 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18664 */
18665
/** ROUNDxx immediate: the rounding control bits, used when
 *  X86_SSE_ROUNDXX_IMM_ROUND_SEL is clear. */
#define X86_SSE_ROUNDXX_IMM_RC_MASK    UINT8_C(0x03)
/** ROUNDxx immediate: when set the rounding mode comes from MXCSR rather than
 *  from the immediate. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL  UINT8_C(0x04)
/** ROUNDxx immediate: when set the rounding is performed with the softfloat
 *  'exact' argument set to false. */
#define X86_SSE_ROUNDXX_IMM_PRECISION  UINT8_C(0x08)

/** ROUNDxx immediate: all the bits passed on to the rounding workers. */
#define X86_SSE_ROUNDXX_IMM_MASK       UINT8_C(0x0F)
18671
18672DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18673{
18674 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18675 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18676
18677 fMxcsr &= ~X86_MXCSR_RC_MASK;
18678 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18679 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18680}
18681
18682static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18683{
18684 RTFLOAT32U r32Src, r32Dst;
18685 float32_t f32Src;
18686 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18687 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18688
18689 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18690 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18691
18692 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18693 return r32Dst;
18694}
18695
18696static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18697{
18698 RTFLOAT64U r64Src, r64Dst;
18699 float64_t f64Src;
18700 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18701 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18702
18703 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18704 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18705
18706 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18707 return r64Dst;
18708}
18709
18710#ifdef IEM_WITHOUT_ASSEMBLY
18711IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18712{
18713 puDst->ar32[0] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18714 puDst->au32[1] = pSrc->uSrc1.au32[1];
18715 puDst->au64[1] = pSrc->uSrc1.au64[1];
18716 return uMxCsrIn;
18717}
18718
18719
18720IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18721{
18722 puDst->ar64[0] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18723 puDst->au64[1] = pSrc->uSrc1.au64[1];
18724 return uMxCsrIn;
18725}
18726#endif
18727
18728IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18729{
18730 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18731 {
18732 puDst->ar32[i] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18733 }
18734
18735 return uMxCsrIn;
18736}
18737
18738
18739IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundpd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18740{
18741 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18742 {
18743 puDst->ar64[i] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18744 }
18745
18746 return uMxCsrIn;
18747}
18748
18749/**
18750 * CVTPD2PI
18751 */
18752#ifdef IEM_WITHOUT_ASSEMBLY
18753static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18754{
18755 RTFLOAT64U r64Src;
18756 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18757
18758 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18759 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18760 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18761}
18762
18763
18764IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18765{
18766 RTUINT64U u64Res;
18767 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18768 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18769
18770 *pu64Dst = u64Res.u;
18771 return fMxcsrOut;
18772}
18773#endif
18774
18775
18776/**
18777 * CVTTPD2PI
18778 */
18779#ifdef IEM_WITHOUT_ASSEMBLY
18780static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18781{
18782 RTFLOAT64U r64Src;
18783 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18784
18785 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18786 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18787 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18788}
18789
18790
18791IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18792{
18793 RTUINT64U u64Res;
18794 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18795 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18796
18797 *pu64Dst = u64Res.u;
18798 return fMxcsrOut;
18799}
18800#endif
18801
18802
18803/**
18804 * CVTPI2PS
18805 */
18806#ifdef IEM_WITHOUT_ASSEMBLY
18807static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18808{
18809 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18810 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18811 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18812}
18813
18814
18815IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2ps_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
18816{
18817 RTUINT64U uSrc = { u64Src };
18818 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[0], uSrc.ai32[0]);
18819 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[1], uSrc.ai32[1]);
18820 return fMxcsrOut;
18821}
18822#endif
18823
18824
18825/**
18826 * CVTPI2PD
18827 */
18828#ifdef IEM_WITHOUT_ASSEMBLY
18829static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18830{
18831 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18832 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18833 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18834}
18835
18836
18837IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2pd_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
18838{
18839 RTUINT64U uSrc = { u64Src };
18840 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[0], uSrc.ai32[0]);
18841 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[1], uSrc.ai32[1]);
18842 return fMxcsrOut;
18843}
18844#endif
18845
18846
18847/**
18848 * CVTPS2PI
18849 */
18850#ifdef IEM_WITHOUT_ASSEMBLY
18851static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18852{
18853 RTFLOAT32U r32Src;
18854 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18855
18856 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18857 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18858 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18859}
18860
18861
18862IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
18863{
18864 RTUINT64U uDst;
18865 RTUINT64U uSrc = { u64Src };
18866 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18867 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18868 *pu64Dst = uDst.u;
18869 return fMxcsrOut;
18870}
18871#endif
18872
18873
18874/**
18875 * CVTTPS2PI
18876 */
18877#ifdef IEM_WITHOUT_ASSEMBLY
18878static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18879{
18880 RTFLOAT32U r32Src;
18881 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18882
18883 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18884 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18885 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18886}
18887
18888
18889IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
18890{
18891 RTUINT64U uDst;
18892 RTUINT64U uSrc = { u64Src };
18893 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18894 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18895 *pu64Dst = uDst.u;
18896 return fMxcsrOut;
18897}
18898#endif
18899
18900/**
18901 * RDRAND
18902 */
18903IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18904{
18905 *puDst = 0;
18906 *pEFlags &= ~X86_EFL_STATUS_BITS;
18907 *pEFlags |= X86_EFL_CF;
18908}
18909
18910IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18911{
18912 *puDst = 0;
18913 *pEFlags &= ~X86_EFL_STATUS_BITS;
18914 *pEFlags |= X86_EFL_CF;
18915}
18916
18917IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18918{
18919 *puDst = 0;
18920 *pEFlags &= ~X86_EFL_STATUS_BITS;
18921 *pEFlags |= X86_EFL_CF;
18922}
18923
18924/**
18925 * RDSEED
18926 */
18927IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18928{
18929 *puDst = 0;
18930 *pEFlags &= ~X86_EFL_STATUS_BITS;
18931 *pEFlags |= X86_EFL_CF;
18932}
18933
18934IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18935{
18936 *puDst = 0;
18937 *pEFlags &= ~X86_EFL_STATUS_BITS;
18938 *pEFlags |= X86_EFL_CF;
18939}
18940
18941IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18942{
18943 *puDst = 0;
18944 *pEFlags &= ~X86_EFL_STATUS_BITS;
18945 *pEFlags |= X86_EFL_CF;
18946}
18947
18948
18949/**
18950 * SHA1NEXTE
18951 */
18952IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18953{
18954 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
18955
18956 puDst->au32[0] = puSrc->au32[0];
18957 puDst->au32[1] = puSrc->au32[1];
18958 puDst->au32[2] = puSrc->au32[2];
18959 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
18960}
18961
18962/**
18963 * SHA1MSG1
18964 */
18965IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18966{
18967 uint32_t u32W0 = puDst->au32[3];
18968 uint32_t u32W1 = puDst->au32[2];
18969 uint32_t u32W2 = puDst->au32[1];
18970 uint32_t u32W3 = puDst->au32[0];
18971 uint32_t u32W4 = puSrc->au32[3];
18972 uint32_t u32W5 = puSrc->au32[2];
18973
18974 puDst->au32[3] = u32W2 ^ u32W0;
18975 puDst->au32[2] = u32W3 ^ u32W1;
18976 puDst->au32[1] = u32W4 ^ u32W2;
18977 puDst->au32[0] = u32W5 ^ u32W3;
18978}
18979
18980/**
18981 * SHA1MSG2
18982 */
18983IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18984{
18985 uint32_t u32W13 = puSrc->au32[2];
18986 uint32_t u32W14 = puSrc->au32[1];
18987 uint32_t u32W15 = puSrc->au32[0];
18988 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
18989 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
18990 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
18991 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
18992
18993 puDst->au32[3] = u32W16;
18994 puDst->au32[2] = u32W17;
18995 puDst->au32[1] = u32W18;
18996 puDst->au32[0] = u32W19;
18997}
18998
18999/**
19000 * SHA1RNDS4
19001 */
19002typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
19003typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
19004
19005static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19006{
19007 return (u32B & u32C) ^ (~u32B & u32D);
19008}
19009
19010static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19011{
19012 return u32B ^ u32C ^ u32D;
19013}
19014
19015static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19016{
19017 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
19018}
19019
19020static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19021{
19022 return u32B ^ u32C ^ u32D;
19023}
19024
19025IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19026{
19027 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19028 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19029
19030 uint32_t au32A[5];
19031 uint32_t au32B[5];
19032 uint32_t au32C[5];
19033 uint32_t au32D[5];
19034 uint32_t au32E[5];
19035 uint32_t au32W[4];
19036 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19037 uint32_t u32K = s_au32K[bEvil & 0x3];
19038
19039 au32A[0] = puDst->au32[3];
19040 au32B[0] = puDst->au32[2];
19041 au32C[0] = puDst->au32[1];
19042 au32D[0] = puDst->au32[0];
19043 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19044 au32W[i] = puSrc->au32[3 - i];
19045
19046 /* Round 0 is a bit different than the other rounds. */
19047 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19048 au32B[1] = au32A[0];
19049 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19050 au32D[1] = au32C[0];
19051 au32E[1] = au32D[0];
19052
19053 for (uint32_t i = 1; i <= 3; i++)
19054 {
19055 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19056 au32B[i + 1] = au32A[i];
19057 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19058 au32D[i + 1] = au32C[i];
19059 au32E[i + 1] = au32D[i];
19060 }
19061
19062 puDst->au32[3] = au32A[4];
19063 puDst->au32[2] = au32B[4];
19064 puDst->au32[1] = au32C[4];
19065 puDst->au32[0] = au32D[4];
19066}
19067
19068
19069/**
19070 * SHA256MSG1
19071 */
19072DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19073{
19074 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19075}
19076
19077IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19078{
19079 uint32_t u32W4 = puSrc->au32[0];
19080 uint32_t u32W3 = puDst->au32[3];
19081 uint32_t u32W2 = puDst->au32[2];
19082 uint32_t u32W1 = puDst->au32[1];
19083 uint32_t u32W0 = puDst->au32[0];
19084
19085 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19086 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19087 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19088 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19089}
19090
19091/**
19092 * SHA256MSG2
19093 */
19094DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19095{
19096 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19097}
19098
19099IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19100{
19101 uint32_t u32W14 = puSrc->au32[2];
19102 uint32_t u32W15 = puSrc->au32[3];
19103 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19104 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19105 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19106 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19107
19108 puDst->au32[3] = u32W19;
19109 puDst->au32[2] = u32W18;
19110 puDst->au32[1] = u32W17;
19111 puDst->au32[0] = u32W16;
19112}
19113
19114/**
19115 * SHA256RNDS2
19116 */
19117DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19118{
19119 return (u32X & u32Y) ^ (~u32X & u32Z);
19120}
19121
19122DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19123{
19124 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19125}
19126
19127DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19128{
19129 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19130}
19131
19132DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19133{
19134 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19135}
19136
19137IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19138{
19139 uint32_t au32A[3];
19140 uint32_t au32B[3];
19141 uint32_t au32C[3];
19142 uint32_t au32D[3];
19143 uint32_t au32E[3];
19144 uint32_t au32F[3];
19145 uint32_t au32G[3];
19146 uint32_t au32H[3];
19147 uint32_t au32WK[2];
19148
19149 au32A[0] = puSrc->au32[3];
19150 au32B[0] = puSrc->au32[2];
19151 au32C[0] = puDst->au32[3];
19152 au32D[0] = puDst->au32[2];
19153 au32E[0] = puSrc->au32[1];
19154 au32F[0] = puSrc->au32[0];
19155 au32G[0] = puDst->au32[1];
19156 au32H[0] = puDst->au32[0];
19157
19158 au32WK[0] = puXmm0Constants->au32[0];
19159 au32WK[1] = puXmm0Constants->au32[1];
19160
19161 for (uint32_t i = 0; i < 2; i++)
19162 {
19163 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19164 + iemAImpl_sha256_upper_sigma1(au32E[i])
19165 + au32WK[i]
19166 + au32H[i]
19167 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19168 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19169 au32B[i + 1] = au32A[i];
19170 au32C[i + 1] = au32B[i];
19171 au32D[i + 1] = au32C[i];
19172 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19173 + iemAImpl_sha256_upper_sigma1(au32E[i])
19174 + au32WK[i]
19175 + au32H[i]
19176 + au32D[i];
19177 au32F[i + 1] = au32E[i];
19178 au32G[i + 1] = au32F[i];
19179 au32H[i + 1] = au32G[i];
19180 }
19181
19182 puDst->au32[3] = au32A[2];
19183 puDst->au32[2] = au32B[2];
19184 puDst->au32[1] = au32E[2];
19185 puDst->au32[0] = au32F[2];
19186}
19187
19188
19189/**
19190 * ADCX
19191 */
/**
 * Emits the body of an ADCX/ADOX style add-with-carry.
 *
 * Computes *puDst + uSrc + carry-in, where the carry-in is the current state
 * of @a a_Flag in *pfEFlags, stores the sum in *puDst and sets @a a_Flag in
 * *pfEFlags to the carry-out.  No other bit of *pfEFlags is modified.
 *
 * The macro expects the enclosing function to have the locals/parameters
 * @c puDst (pointer to @a a_Type), @c uSrc (@a a_Type) and @c pfEFlags
 * (pointer to uint32_t).
 *
 * A carry-out occurs when either the plain addition wraps around (the sum is
 * smaller than an addend), or when the plain sum is all ones and adding the
 * carry-in makes it wrap.
 *
 * @param   a_Flag      The EFLAGS bit serving as carry-in and carry-out.
 * @param   a_Type      The unsigned operand type.
 * @param   a_Max       The maximum value of @a a_Type (may be any expression).
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fAdxCarryIn = (*pfEFlags & (a_Flag)) != 0; \
        a_Type const uAdxSum     = (a_Type)(*puDst + uSrc); \
        if (   uAdxSum < uSrc \
            || (fAdxCarryIn && uAdxSum == (a_Max))) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = (a_Type)(uAdxSum + (fAdxCarryIn ? 1 : 0)); \
    } \
    while (0)
19209
19210IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19211{
19212 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19213}
19214
19215IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19216{
19217 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19218}
19219
19220# if defined(IEM_WITHOUT_ASSEMBLY)
19221
19222IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19223{
19224 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19225}
19226
19227IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19228{
19229 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19230}
19231
19232#endif
19233
19234
19235/**
19236 * ADOX
19237 */
19238IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19239{
19240 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19241}
19242
19243IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19244{
19245 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19246}
19247
19248# if defined(IEM_WITHOUT_ASSEMBLY)
19249
19250IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19251{
19252 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19253}
19254
19255IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19256{
19257 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19258}
19259
19260# endif
19261
19262
19263/**
19264 * MPSADBW
19265 */
19266IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19267{
19268 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19269 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19270 int16_t ai16Src1[11];
19271 int16_t ai16Src2[4];
19272
19273 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19274 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19275
19276 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19277 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19278
19279 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19280 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19281 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19282 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19283 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19284}
19285
19286
19287IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19288{
19289 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19290 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19291 int16_t ai16Src1[11];
19292 int16_t ai16Src2[4];
19293
19294 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19295 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19296
19297 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19298 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19299
19300 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19301 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19302 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19303 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19304 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19305}
19306
19307
19308IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
19309{
19310 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
19311 RTUINT256U const uSrc2 = *puSrc2;
19312 ASMCompilerBarrier();
19313 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
19314 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
19315}
19316
19317
19318/**
19319 * VPERM2I128
19320 */
19321IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19322{
19323 if (bImm & RT_BIT(3))
19324 {
19325 puDst->au64[0] = 0;
19326 puDst->au64[1] = 0;
19327 }
19328 else
19329 {
19330 switch (bImm & 0x3)
19331 {
19332 case 0:
19333 puDst->au64[0] = puSrc1->au64[0];
19334 puDst->au64[1] = puSrc1->au64[1];
19335 break;
19336 case 1:
19337 puDst->au64[0] = puSrc1->au64[2];
19338 puDst->au64[1] = puSrc1->au64[3];
19339 break;
19340 case 2:
19341 puDst->au64[0] = puSrc2->au64[0];
19342 puDst->au64[1] = puSrc2->au64[1];
19343 break;
19344 case 3:
19345 puDst->au64[0] = puSrc2->au64[2];
19346 puDst->au64[1] = puSrc2->au64[3];
19347 break;
19348 }
19349 }
19350
19351 if (bImm & RT_BIT(7))
19352 {
19353 puDst->au64[2] = 0;
19354 puDst->au64[3] = 0;
19355 }
19356 else
19357 {
19358 switch ((bImm >> 4) & 0x3)
19359 {
19360 case 0:
19361 puDst->au64[2] = puSrc1->au64[0];
19362 puDst->au64[3] = puSrc1->au64[1];
19363 break;
19364 case 1:
19365 puDst->au64[2] = puSrc1->au64[2];
19366 puDst->au64[3] = puSrc1->au64[3];
19367 break;
19368 case 2:
19369 puDst->au64[2] = puSrc2->au64[0];
19370 puDst->au64[3] = puSrc2->au64[1];
19371 break;
19372 case 3:
19373 puDst->au64[2] = puSrc2->au64[2];
19374 puDst->au64[3] = puSrc2->au64[3];
19375 break;
19376 }
19377 }
19378}
19379
19380
19381/**
19382 * VPERM2F128
19383 */
19384IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19385{
19386 iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
19387}
19388
19389
19390/**
19391 * DPPS
19392 */
19393IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dpps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19394{
19395 RT_NOREF(puDst, pSrc, bImm);
19396 AssertReleaseFailed();
19397 return uMxCsrIn;
19398}
19399
19400
19401/**
19402 * DPPD
19403 */
19404IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dppd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19405{
19406 RT_NOREF(puDst, pSrc, bImm);
19407 AssertReleaseFailed();
19408 return uMxCsrIn;
19409}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette